Three defensive improvements from peer code review: Replace unreachable!() in GitLab client retry loops: Both request() and request_with_headers() had unreachable!() after their for loops. While the logic was sound (the final iteration always reaches the return/break), any refactor to the loop condition would turn this into a runtime panic. Restructured both to store last_response with explicit break, making the control flow self-documenting and the .expect() message useful if ever violated. Doctor model name comparison asymmetry: Ollama model names were stripped of their tag (:latest, :v1.5) for comparison, but the configured model name was compared as-is. A config value like "nomic-embed-text:v1.5" would never match. Now strips the tag from both sides before comparing. Regenerator savepoint cleanup and progress accuracy: - upsert_document's error path did ROLLBACK TO but never RELEASE, leaving a dangling savepoint that could nest on the next call. Added RELEASE after rollback so the connection is clean. - estimated_total for progress reporting was computed once at start but the dirty queue can grow during processing. Now recounts each loop iteration with max() so the progress fraction never goes backwards. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
504 lines
18 KiB
Rust
504 lines
18 KiB
Rust
use rusqlite::Connection;
|
|
use rusqlite::OptionalExtension;
|
|
use tracing::{debug, instrument, warn};
|
|
|
|
use crate::core::error::Result;
|
|
use crate::documents::{
|
|
DocumentData, SourceType, extract_discussion_document, extract_issue_document,
|
|
extract_mr_document,
|
|
};
|
|
use crate::ingestion::dirty_tracker::{clear_dirty, get_dirty_sources, record_dirty_error};
|
|
|
|
/// Result of a document regeneration run.
|
|
#[derive(Debug, Default)]
|
|
pub struct RegenerateResult {
|
|
pub regenerated: usize,
|
|
pub unchanged: usize,
|
|
pub errored: usize,
|
|
}
|
|
|
|
/// Drain the dirty_sources queue, regenerating documents for each entry.
|
|
///
|
|
/// Uses per-item error handling (fail-soft) and drains the queue completely
|
|
/// via a bounded batch loop. Each dirty item is processed independently.
|
|
///
|
|
/// `progress_callback` reports `(processed, estimated_total)` after each item.
|
|
#[instrument(
|
|
skip(conn, progress_callback),
|
|
fields(items_processed, items_skipped, errors)
|
|
)]
|
|
pub fn regenerate_dirty_documents(
|
|
conn: &Connection,
|
|
progress_callback: Option<&dyn Fn(usize, usize)>,
|
|
) -> Result<RegenerateResult> {
|
|
let mut result = RegenerateResult::default();
|
|
|
|
// Estimated total for progress reporting. Recount each loop iteration
|
|
// so the denominator grows if new items are enqueued during processing
|
|
// (the queue can grow while we drain it). We use max() so the value
|
|
// never shrinks — preventing the progress fraction from going backwards.
|
|
let mut estimated_total: usize = 0;
|
|
|
|
loop {
|
|
let dirty = get_dirty_sources(conn)?;
|
|
if dirty.is_empty() {
|
|
break;
|
|
}
|
|
|
|
// Recount remaining + already-processed to get the true total.
|
|
let remaining: usize = conn
|
|
.query_row("SELECT COUNT(*) FROM dirty_sources", [], |row| row.get(0))
|
|
.unwrap_or(0_i64) as usize;
|
|
let processed_so_far = result.regenerated + result.unchanged + result.errored;
|
|
estimated_total = estimated_total.max(processed_so_far + remaining);
|
|
|
|
for (source_type, source_id) in &dirty {
|
|
match regenerate_one(conn, *source_type, *source_id) {
|
|
Ok(changed) => {
|
|
if changed {
|
|
result.regenerated += 1;
|
|
} else {
|
|
result.unchanged += 1;
|
|
}
|
|
clear_dirty(conn, *source_type, *source_id)?;
|
|
}
|
|
Err(e) => {
|
|
warn!(
|
|
source_type = %source_type,
|
|
source_id,
|
|
error = %e,
|
|
"Failed to regenerate document"
|
|
);
|
|
record_dirty_error(conn, *source_type, *source_id, &e.to_string())?;
|
|
result.errored += 1;
|
|
}
|
|
}
|
|
|
|
let processed = result.regenerated + result.unchanged + result.errored;
|
|
if let Some(cb) = progress_callback {
|
|
cb(processed, estimated_total);
|
|
}
|
|
}
|
|
}
|
|
|
|
debug!(
|
|
regenerated = result.regenerated,
|
|
unchanged = result.unchanged,
|
|
errored = result.errored,
|
|
"Document regeneration complete"
|
|
);
|
|
|
|
tracing::Span::current().record("items_processed", result.regenerated);
|
|
tracing::Span::current().record("items_skipped", result.unchanged);
|
|
tracing::Span::current().record("errors", result.errored);
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Regenerate a single document. Returns true if content_hash changed.
|
|
fn regenerate_one(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<bool> {
|
|
let doc = match source_type {
|
|
SourceType::Issue => extract_issue_document(conn, source_id)?,
|
|
SourceType::MergeRequest => extract_mr_document(conn, source_id)?,
|
|
SourceType::Discussion => extract_discussion_document(conn, source_id)?,
|
|
};
|
|
|
|
let Some(doc) = doc else {
|
|
// Source was deleted — remove the document (cascade handles FTS/embeddings)
|
|
delete_document(conn, source_type, source_id)?;
|
|
return Ok(true);
|
|
};
|
|
|
|
let existing_hash = get_existing_hash(conn, source_type, source_id)?;
|
|
let changed = existing_hash.as_ref() != Some(&doc.content_hash);
|
|
|
|
// Always upsert: labels/paths can change independently of content_hash
|
|
upsert_document(conn, &doc)?;
|
|
|
|
Ok(changed)
|
|
}
|
|
|
|
/// Get existing content hash for a document, if it exists.
|
|
fn get_existing_hash(
|
|
conn: &Connection,
|
|
source_type: SourceType,
|
|
source_id: i64,
|
|
) -> Result<Option<String>> {
|
|
let mut stmt = conn
|
|
.prepare("SELECT content_hash FROM documents WHERE source_type = ?1 AND source_id = ?2")?;
|
|
|
|
let hash: Option<String> = stmt
|
|
.query_row(rusqlite::params![source_type.as_str(), source_id], |row| {
|
|
row.get(0)
|
|
})
|
|
.optional()?;
|
|
|
|
Ok(hash)
|
|
}
|
|
|
|
/// Upsert a document with triple-hash write optimization.
|
|
///
|
|
/// Wrapped in a SAVEPOINT to ensure atomicity of the multi-statement write
|
|
/// (document row + labels + paths). Without this, a crash between statements
|
|
/// could leave the document with a stale labels_hash but missing label rows.
|
|
fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<()> {
|
|
conn.execute_batch("SAVEPOINT upsert_doc")?;
|
|
match upsert_document_inner(conn, doc) {
|
|
Ok(()) => {
|
|
conn.execute_batch("RELEASE upsert_doc")?;
|
|
Ok(())
|
|
}
|
|
Err(e) => {
|
|
// ROLLBACK TO restores the savepoint but leaves it active.
|
|
// RELEASE removes it so the connection is clean for the next call.
|
|
let _ = conn.execute_batch("ROLLBACK TO upsert_doc; RELEASE upsert_doc");
|
|
Err(e)
|
|
}
|
|
}
|
|
}
|
|
|
|
fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<()> {
|
|
// Check existing hashes before writing
|
|
let existing: Option<(i64, String, String, String)> = conn
|
|
.query_row(
|
|
"SELECT id, content_hash, labels_hash, paths_hash FROM documents
|
|
WHERE source_type = ?1 AND source_id = ?2",
|
|
rusqlite::params![doc.source_type.as_str(), doc.source_id],
|
|
|row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
|
|
)
|
|
.optional()?;
|
|
|
|
// Fast path: skip ALL writes when nothing changed (prevents WAL churn)
|
|
if let Some((_, ref old_content_hash, ref old_labels_hash, ref old_paths_hash)) = existing
|
|
&& old_content_hash == &doc.content_hash
|
|
&& old_labels_hash == &doc.labels_hash
|
|
&& old_paths_hash == &doc.paths_hash
|
|
{
|
|
return Ok(());
|
|
}
|
|
|
|
let labels_json = serde_json::to_string(&doc.labels).unwrap_or_else(|_| "[]".to_string());
|
|
|
|
// Upsert document row
|
|
conn.execute(
|
|
"INSERT INTO documents
|
|
(source_type, source_id, project_id, author_username, label_names,
|
|
labels_hash, paths_hash,
|
|
created_at, updated_at, url, title, content_text, content_hash,
|
|
is_truncated, truncated_reason)
|
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15)
|
|
ON CONFLICT(source_type, source_id) DO UPDATE SET
|
|
author_username = excluded.author_username,
|
|
label_names = excluded.label_names,
|
|
labels_hash = excluded.labels_hash,
|
|
paths_hash = excluded.paths_hash,
|
|
updated_at = excluded.updated_at,
|
|
url = excluded.url,
|
|
title = excluded.title,
|
|
content_text = excluded.content_text,
|
|
content_hash = excluded.content_hash,
|
|
is_truncated = excluded.is_truncated,
|
|
truncated_reason = excluded.truncated_reason",
|
|
rusqlite::params![
|
|
doc.source_type.as_str(),
|
|
doc.source_id,
|
|
doc.project_id,
|
|
doc.author_username,
|
|
labels_json,
|
|
doc.labels_hash,
|
|
doc.paths_hash,
|
|
doc.created_at,
|
|
doc.updated_at,
|
|
doc.url,
|
|
doc.title,
|
|
doc.content_text,
|
|
doc.content_hash,
|
|
doc.is_truncated as i32,
|
|
doc.truncated_reason,
|
|
],
|
|
)?;
|
|
|
|
// Get document ID
|
|
let doc_id = match existing {
|
|
Some((id, _, _, _)) => id,
|
|
None => get_document_id(conn, doc.source_type, doc.source_id)?,
|
|
};
|
|
|
|
// Only update labels if hash changed
|
|
let labels_changed = match &existing {
|
|
Some((_, _, old_hash, _)) => old_hash != &doc.labels_hash,
|
|
None => true,
|
|
};
|
|
if labels_changed {
|
|
conn.execute(
|
|
"DELETE FROM document_labels WHERE document_id = ?1",
|
|
[doc_id],
|
|
)?;
|
|
for label in &doc.labels {
|
|
conn.execute(
|
|
"INSERT INTO document_labels (document_id, label_name) VALUES (?1, ?2)",
|
|
rusqlite::params![doc_id, label],
|
|
)?;
|
|
}
|
|
}
|
|
|
|
// Only update paths if hash changed
|
|
let paths_changed = match &existing {
|
|
Some((_, _, _, old_hash)) => old_hash != &doc.paths_hash,
|
|
None => true,
|
|
};
|
|
if paths_changed {
|
|
conn.execute(
|
|
"DELETE FROM document_paths WHERE document_id = ?1",
|
|
[doc_id],
|
|
)?;
|
|
for path in &doc.paths {
|
|
conn.execute(
|
|
"INSERT INTO document_paths (document_id, path) VALUES (?1, ?2)",
|
|
rusqlite::params![doc_id, path],
|
|
)?;
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Delete a document by source identity.
|
|
fn delete_document(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
|
|
conn.execute(
|
|
"DELETE FROM documents WHERE source_type = ?1 AND source_id = ?2",
|
|
rusqlite::params![source_type.as_str(), source_id],
|
|
)?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Get document ID by source type and source ID.
|
|
fn get_document_id(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<i64> {
|
|
let id: i64 = conn.query_row(
|
|
"SELECT id FROM documents WHERE source_type = ?1 AND source_id = ?2",
|
|
rusqlite::params![source_type.as_str(), source_id],
|
|
|row| row.get(0),
|
|
)?;
|
|
Ok(id)
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use crate::ingestion::dirty_tracker::mark_dirty;
|
|
|
|
fn setup_db() -> Connection {
|
|
let conn = Connection::open_in_memory().unwrap();
|
|
conn.execute_batch("
|
|
CREATE TABLE projects (
|
|
id INTEGER PRIMARY KEY,
|
|
gitlab_project_id INTEGER UNIQUE NOT NULL,
|
|
path_with_namespace TEXT NOT NULL,
|
|
default_branch TEXT,
|
|
web_url TEXT,
|
|
created_at INTEGER,
|
|
updated_at INTEGER,
|
|
raw_payload_id INTEGER
|
|
);
|
|
INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project');
|
|
|
|
CREATE TABLE issues (
|
|
id INTEGER PRIMARY KEY,
|
|
gitlab_id INTEGER UNIQUE NOT NULL,
|
|
project_id INTEGER NOT NULL REFERENCES projects(id),
|
|
iid INTEGER NOT NULL,
|
|
title TEXT,
|
|
description TEXT,
|
|
state TEXT NOT NULL,
|
|
author_username TEXT,
|
|
created_at INTEGER NOT NULL,
|
|
updated_at INTEGER NOT NULL,
|
|
last_seen_at INTEGER NOT NULL,
|
|
discussions_synced_for_updated_at INTEGER,
|
|
resource_events_synced_for_updated_at INTEGER,
|
|
web_url TEXT,
|
|
raw_payload_id INTEGER
|
|
);
|
|
CREATE TABLE labels (
|
|
id INTEGER PRIMARY KEY,
|
|
gitlab_id INTEGER,
|
|
project_id INTEGER NOT NULL REFERENCES projects(id),
|
|
name TEXT NOT NULL,
|
|
color TEXT,
|
|
description TEXT
|
|
);
|
|
CREATE TABLE issue_labels (
|
|
issue_id INTEGER NOT NULL REFERENCES issues(id),
|
|
label_id INTEGER NOT NULL REFERENCES labels(id),
|
|
PRIMARY KEY(issue_id, label_id)
|
|
);
|
|
|
|
CREATE TABLE documents (
|
|
id INTEGER PRIMARY KEY,
|
|
source_type TEXT NOT NULL,
|
|
source_id INTEGER NOT NULL,
|
|
project_id INTEGER NOT NULL,
|
|
author_username TEXT,
|
|
label_names TEXT,
|
|
created_at INTEGER,
|
|
updated_at INTEGER,
|
|
url TEXT,
|
|
title TEXT,
|
|
content_text TEXT NOT NULL,
|
|
content_hash TEXT NOT NULL,
|
|
labels_hash TEXT NOT NULL DEFAULT '',
|
|
paths_hash TEXT NOT NULL DEFAULT '',
|
|
is_truncated INTEGER NOT NULL DEFAULT 0,
|
|
truncated_reason TEXT,
|
|
UNIQUE(source_type, source_id)
|
|
);
|
|
CREATE TABLE document_labels (
|
|
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
|
label_name TEXT NOT NULL,
|
|
PRIMARY KEY(document_id, label_name)
|
|
);
|
|
CREATE TABLE document_paths (
|
|
document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
|
path TEXT NOT NULL,
|
|
PRIMARY KEY(document_id, path)
|
|
);
|
|
CREATE TABLE dirty_sources (
|
|
source_type TEXT NOT NULL,
|
|
source_id INTEGER NOT NULL,
|
|
queued_at INTEGER NOT NULL,
|
|
attempt_count INTEGER NOT NULL DEFAULT 0,
|
|
last_attempt_at INTEGER,
|
|
last_error TEXT,
|
|
next_attempt_at INTEGER,
|
|
PRIMARY KEY(source_type, source_id)
|
|
);
|
|
CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
|
|
").unwrap();
|
|
conn
|
|
}
|
|
|
|
#[test]
|
|
fn test_regenerate_creates_document() {
|
|
let conn = setup_db();
|
|
conn.execute(
|
|
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test Issue', 'Description here', 'opened', 'alice', 1000, 2000, 3000)",
|
|
[],
|
|
).unwrap();
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
|
|
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
|
assert_eq!(result.regenerated, 1);
|
|
assert_eq!(result.unchanged, 0);
|
|
assert_eq!(result.errored, 0);
|
|
|
|
// Verify document was created
|
|
let count: i64 = conn
|
|
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
|
|
.unwrap();
|
|
assert_eq!(count, 1);
|
|
|
|
let content: String = conn
|
|
.query_row("SELECT content_text FROM documents", [], |r| r.get(0))
|
|
.unwrap();
|
|
assert!(content.contains("[[Issue]] #42: Test Issue"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_regenerate_unchanged() {
|
|
let conn = setup_db();
|
|
conn.execute(
|
|
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'Desc', 'opened', 'alice', 1000, 2000, 3000)",
|
|
[],
|
|
).unwrap();
|
|
|
|
// First regeneration creates the document
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
let r1 = regenerate_dirty_documents(&conn, None).unwrap();
|
|
assert_eq!(r1.regenerated, 1);
|
|
|
|
// Second regeneration — same data, should be unchanged
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
let r2 = regenerate_dirty_documents(&conn, None).unwrap();
|
|
assert_eq!(r2.unchanged, 1);
|
|
assert_eq!(r2.regenerated, 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_regenerate_deleted_source() {
|
|
let conn = setup_db();
|
|
conn.execute(
|
|
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
|
|
[],
|
|
).unwrap();
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
regenerate_dirty_documents(&conn, None).unwrap();
|
|
|
|
// Delete the issue and re-mark dirty
|
|
conn.execute("PRAGMA foreign_keys = OFF", []).unwrap();
|
|
conn.execute("DELETE FROM issues WHERE id = 1", []).unwrap();
|
|
conn.execute("PRAGMA foreign_keys = ON", []).unwrap();
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
|
|
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
|
assert_eq!(result.regenerated, 1); // Deletion counts as "changed"
|
|
|
|
let count: i64 = conn
|
|
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
|
|
.unwrap();
|
|
assert_eq!(count, 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_regenerate_drains_queue() {
|
|
let conn = setup_db();
|
|
for i in 1..=10 {
|
|
conn.execute(
|
|
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (?1, ?2, 1, ?1, 'Test', 'opened', 1000, 2000, 3000)",
|
|
rusqlite::params![i, i * 10],
|
|
).unwrap();
|
|
mark_dirty(&conn, SourceType::Issue, i).unwrap();
|
|
}
|
|
|
|
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
|
assert_eq!(result.regenerated, 10);
|
|
|
|
// Queue should be empty
|
|
let dirty = get_dirty_sources(&conn).unwrap();
|
|
assert!(dirty.is_empty());
|
|
}
|
|
|
|
#[test]
|
|
fn test_triple_hash_fast_path() {
|
|
let conn = setup_db();
|
|
conn.execute(
|
|
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
|
|
[],
|
|
).unwrap();
|
|
conn.execute(
|
|
"INSERT INTO labels (id, project_id, name) VALUES (1, 1, 'bug')",
|
|
[],
|
|
)
|
|
.unwrap();
|
|
conn.execute(
|
|
"INSERT INTO issue_labels (issue_id, label_id) VALUES (1, 1)",
|
|
[],
|
|
)
|
|
.unwrap();
|
|
|
|
// First run creates document
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
regenerate_dirty_documents(&conn, None).unwrap();
|
|
|
|
// Second run — triple hash match, should skip ALL writes
|
|
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
|
|
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
|
assert_eq!(result.unchanged, 1);
|
|
|
|
// Labels should still be present (not deleted and re-inserted)
|
|
let label_count: i64 = conn
|
|
.query_row("SELECT COUNT(*) FROM document_labels", [], |r| r.get(0))
|
|
.unwrap();
|
|
assert_eq!(label_count, 1);
|
|
}
|
|
}
|