Files
gitlore/src/documents/regenerator.rs
Taylor Eernisse 65583ed5d6 refactor: Remove redundant doc comments throughout codebase
Removes module-level doc comments (//! lines) and excessive inline doc
comments that were duplicating information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the what is clear from types)
- Implementation context (the how is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain
"why" (business decisions, non-obvious constraints) not "what" (which
the code itself shows). This change reduces noise and maintenance burden
while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 00:04:32 -05:00

465 lines
16 KiB
Rust

use rusqlite::Connection;
use rusqlite::OptionalExtension;
use tracing::{debug, instrument, warn};
use crate::core::error::Result;
use crate::documents::{
DocumentData, SourceType, extract_discussion_document, extract_issue_document,
extract_mr_document,
};
use crate::ingestion::dirty_tracker::{clear_dirty, get_dirty_sources, record_dirty_error};
#[derive(Debug, Default)]
pub struct RegenerateResult {
pub regenerated: usize,
pub unchanged: usize,
pub errored: usize,
}
#[instrument(
skip(conn, progress_callback),
fields(items_processed, items_skipped, errors)
)]
pub fn regenerate_dirty_documents(
conn: &Connection,
progress_callback: Option<&dyn Fn(usize, usize)>,
) -> Result<RegenerateResult> {
let mut result = RegenerateResult::default();
let mut estimated_total: usize = 0;
loop {
let dirty = get_dirty_sources(conn)?;
if dirty.is_empty() {
break;
}
let remaining: usize = conn
.query_row("SELECT COUNT(*) FROM dirty_sources", [], |row| row.get(0))
.unwrap_or(0_i64) as usize;
let processed_so_far = result.regenerated + result.unchanged + result.errored;
estimated_total = estimated_total.max(processed_so_far + remaining);
for (source_type, source_id) in &dirty {
match regenerate_one(conn, *source_type, *source_id) {
Ok(changed) => {
if changed {
result.regenerated += 1;
} else {
result.unchanged += 1;
}
clear_dirty(conn, *source_type, *source_id)?;
}
Err(e) => {
warn!(
source_type = %source_type,
source_id,
error = %e,
"Failed to regenerate document"
);
record_dirty_error(conn, *source_type, *source_id, &e.to_string())?;
result.errored += 1;
}
}
let processed = result.regenerated + result.unchanged + result.errored;
if let Some(cb) = progress_callback {
cb(processed, estimated_total);
}
}
}
debug!(
regenerated = result.regenerated,
unchanged = result.unchanged,
errored = result.errored,
"Document regeneration complete"
);
tracing::Span::current().record("items_processed", result.regenerated);
tracing::Span::current().record("items_skipped", result.unchanged);
tracing::Span::current().record("errors", result.errored);
Ok(result)
}
fn regenerate_one(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<bool> {
let doc = match source_type {
SourceType::Issue => extract_issue_document(conn, source_id)?,
SourceType::MergeRequest => extract_mr_document(conn, source_id)?,
SourceType::Discussion => extract_discussion_document(conn, source_id)?,
};
let Some(doc) = doc else {
delete_document(conn, source_type, source_id)?;
return Ok(true);
};
let existing_hash = get_existing_hash(conn, source_type, source_id)?;
let changed = existing_hash.as_ref() != Some(&doc.content_hash);
upsert_document(conn, &doc)?;
Ok(changed)
}
fn get_existing_hash(
conn: &Connection,
source_type: SourceType,
source_id: i64,
) -> Result<Option<String>> {
let mut stmt = conn
.prepare("SELECT content_hash FROM documents WHERE source_type = ?1 AND source_id = ?2")?;
let hash: Option<String> = stmt
.query_row(rusqlite::params![source_type.as_str(), source_id], |row| {
row.get(0)
})
.optional()?;
Ok(hash)
}
fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<()> {
conn.execute_batch("SAVEPOINT upsert_doc")?;
match upsert_document_inner(conn, doc) {
Ok(()) => {
conn.execute_batch("RELEASE upsert_doc")?;
Ok(())
}
Err(e) => {
let _ = conn.execute_batch("ROLLBACK TO upsert_doc; RELEASE upsert_doc");
Err(e)
}
}
}
fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<()> {
let existing: Option<(i64, String, String, String)> = conn
.query_row(
"SELECT id, content_hash, labels_hash, paths_hash FROM documents
WHERE source_type = ?1 AND source_id = ?2",
rusqlite::params![doc.source_type.as_str(), doc.source_id],
|row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
)
.optional()?;
if let Some((_, ref old_content_hash, ref old_labels_hash, ref old_paths_hash)) = existing
&& old_content_hash == &doc.content_hash
&& old_labels_hash == &doc.labels_hash
&& old_paths_hash == &doc.paths_hash
{
return Ok(());
}
let labels_json = serde_json::to_string(&doc.labels).unwrap_or_else(|_| "[]".to_string());
conn.execute(
"INSERT INTO documents
(source_type, source_id, project_id, author_username, label_names,
labels_hash, paths_hash,
created_at, updated_at, url, title, content_text, content_hash,
is_truncated, truncated_reason)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15)
ON CONFLICT(source_type, source_id) DO UPDATE SET
author_username = excluded.author_username,
label_names = excluded.label_names,
labels_hash = excluded.labels_hash,
paths_hash = excluded.paths_hash,
updated_at = excluded.updated_at,
url = excluded.url,
title = excluded.title,
content_text = excluded.content_text,
content_hash = excluded.content_hash,
is_truncated = excluded.is_truncated,
truncated_reason = excluded.truncated_reason",
rusqlite::params![
doc.source_type.as_str(),
doc.source_id,
doc.project_id,
doc.author_username,
labels_json,
doc.labels_hash,
doc.paths_hash,
doc.created_at,
doc.updated_at,
doc.url,
doc.title,
doc.content_text,
doc.content_hash,
doc.is_truncated as i32,
doc.truncated_reason,
],
)?;
let doc_id = match existing {
Some((id, _, _, _)) => id,
None => get_document_id(conn, doc.source_type, doc.source_id)?,
};
let labels_changed = match &existing {
Some((_, _, old_hash, _)) => old_hash != &doc.labels_hash,
None => true,
};
if labels_changed {
conn.execute(
"DELETE FROM document_labels WHERE document_id = ?1",
[doc_id],
)?;
for label in &doc.labels {
conn.execute(
"INSERT INTO document_labels (document_id, label_name) VALUES (?1, ?2)",
rusqlite::params![doc_id, label],
)?;
}
}
let paths_changed = match &existing {
Some((_, _, _, old_hash)) => old_hash != &doc.paths_hash,
None => true,
};
if paths_changed {
conn.execute(
"DELETE FROM document_paths WHERE document_id = ?1",
[doc_id],
)?;
for path in &doc.paths {
conn.execute(
"INSERT INTO document_paths (document_id, path) VALUES (?1, ?2)",
rusqlite::params![doc_id, path],
)?;
}
}
Ok(())
}
fn delete_document(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
conn.execute(
"DELETE FROM documents WHERE source_type = ?1 AND source_id = ?2",
rusqlite::params![source_type.as_str(), source_id],
)?;
Ok(())
}
fn get_document_id(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<i64> {
let id: i64 = conn.query_row(
"SELECT id FROM documents WHERE source_type = ?1 AND source_id = ?2",
rusqlite::params![source_type.as_str(), source_id],
|row| row.get(0),
)?;
Ok(id)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ingestion::dirty_tracker::mark_dirty;
fn setup_db() -> Connection {
let conn = Connection::open_in_memory().unwrap();
conn.execute_batch("
CREATE TABLE projects (
id INTEGER PRIMARY KEY,
gitlab_project_id INTEGER UNIQUE NOT NULL,
path_with_namespace TEXT NOT NULL,
default_branch TEXT,
web_url TEXT,
created_at INTEGER,
updated_at INTEGER,
raw_payload_id INTEGER
);
INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project');
CREATE TABLE issues (
id INTEGER PRIMARY KEY,
gitlab_id INTEGER UNIQUE NOT NULL,
project_id INTEGER NOT NULL REFERENCES projects(id),
iid INTEGER NOT NULL,
title TEXT,
description TEXT,
state TEXT NOT NULL,
author_username TEXT,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL,
last_seen_at INTEGER NOT NULL,
discussions_synced_for_updated_at INTEGER,
resource_events_synced_for_updated_at INTEGER,
web_url TEXT,
raw_payload_id INTEGER
);
CREATE TABLE labels (
id INTEGER PRIMARY KEY,
gitlab_id INTEGER,
project_id INTEGER NOT NULL REFERENCES projects(id),
name TEXT NOT NULL,
color TEXT,
description TEXT
);
CREATE TABLE issue_labels (
issue_id INTEGER NOT NULL REFERENCES issues(id),
label_id INTEGER NOT NULL REFERENCES labels(id),
PRIMARY KEY(issue_id, label_id)
);
CREATE TABLE documents (
id INTEGER PRIMARY KEY,
source_type TEXT NOT NULL,
source_id INTEGER NOT NULL,
project_id INTEGER NOT NULL,
author_username TEXT,
label_names TEXT,
created_at INTEGER,
updated_at INTEGER,
url TEXT,
title TEXT,
content_text TEXT NOT NULL,
content_hash TEXT NOT NULL,
labels_hash TEXT NOT NULL DEFAULT '',
paths_hash TEXT NOT NULL DEFAULT '',
is_truncated INTEGER NOT NULL DEFAULT 0,
truncated_reason TEXT,
UNIQUE(source_type, source_id)
);
CREATE TABLE document_labels (
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
label_name TEXT NOT NULL,
PRIMARY KEY(document_id, label_name)
);
CREATE TABLE document_paths (
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
path TEXT NOT NULL,
PRIMARY KEY(document_id, path)
);
CREATE TABLE dirty_sources (
source_type TEXT NOT NULL,
source_id INTEGER NOT NULL,
queued_at INTEGER NOT NULL,
attempt_count INTEGER NOT NULL DEFAULT 0,
last_attempt_at INTEGER,
last_error TEXT,
next_attempt_at INTEGER,
PRIMARY KEY(source_type, source_id)
);
CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
").unwrap();
conn
}
#[test]
fn test_regenerate_creates_document() {
let conn = setup_db();
conn.execute(
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test Issue', 'Description here', 'opened', 'alice', 1000, 2000, 3000)",
[],
).unwrap();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let result = regenerate_dirty_documents(&conn, None).unwrap();
assert_eq!(result.regenerated, 1);
assert_eq!(result.unchanged, 0);
assert_eq!(result.errored, 0);
let count: i64 = conn
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
.unwrap();
assert_eq!(count, 1);
let content: String = conn
.query_row("SELECT content_text FROM documents", [], |r| r.get(0))
.unwrap();
assert!(content.contains("[[Issue]] #42: Test Issue"));
}
#[test]
fn test_regenerate_unchanged() {
let conn = setup_db();
conn.execute(
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'Desc', 'opened', 'alice', 1000, 2000, 3000)",
[],
).unwrap();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let r1 = regenerate_dirty_documents(&conn, None).unwrap();
assert_eq!(r1.regenerated, 1);
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let r2 = regenerate_dirty_documents(&conn, None).unwrap();
assert_eq!(r2.unchanged, 1);
assert_eq!(r2.regenerated, 0);
}
#[test]
fn test_regenerate_deleted_source() {
let conn = setup_db();
conn.execute(
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
[],
).unwrap();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
regenerate_dirty_documents(&conn, None).unwrap();
conn.execute("PRAGMA foreign_keys = OFF", []).unwrap();
conn.execute("DELETE FROM issues WHERE id = 1", []).unwrap();
conn.execute("PRAGMA foreign_keys = ON", []).unwrap();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let result = regenerate_dirty_documents(&conn, None).unwrap();
assert_eq!(result.regenerated, 1);
let count: i64 = conn
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
.unwrap();
assert_eq!(count, 0);
}
#[test]
fn test_regenerate_drains_queue() {
let conn = setup_db();
for i in 1..=10 {
conn.execute(
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (?1, ?2, 1, ?1, 'Test', 'opened', 1000, 2000, 3000)",
rusqlite::params![i, i * 10],
).unwrap();
mark_dirty(&conn, SourceType::Issue, i).unwrap();
}
let result = regenerate_dirty_documents(&conn, None).unwrap();
assert_eq!(result.regenerated, 10);
let dirty = get_dirty_sources(&conn).unwrap();
assert!(dirty.is_empty());
}
#[test]
fn test_triple_hash_fast_path() {
let conn = setup_db();
conn.execute(
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
[],
).unwrap();
conn.execute(
"INSERT INTO labels (id, project_id, name) VALUES (1, 1, 'bug')",
[],
)
.unwrap();
conn.execute(
"INSERT INTO issue_labels (issue_id, label_id) VALUES (1, 1)",
[],
)
.unwrap();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
regenerate_dirty_documents(&conn, None).unwrap();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let result = regenerate_dirty_documents(&conn, None).unwrap();
assert_eq!(result.unchanged, 1);
let label_count: i64 = conn
.query_row("SELECT COUNT(*) FROM document_labels", [], |r| r.get(0))
.unwrap();
assert_eq!(label_count, 1);
}
}