perf(documents): eliminate redundant hash query in regeneration

The document regenerator was making two queries per document:
1. get_existing_hash() — SELECT content_hash
2. upsert_document_inner() — SELECT id, content_hash, labels_hash, paths_hash

Query 2 already returns the content_hash needed for change detection.
Remove get_existing_hash() entirely and compute content_changed inside
upsert_document_inner() from the existing row data.

upsert_document_inner now returns Result<bool> (true = content changed)
which propagates up through upsert_document and regenerate_one,
replacing the separate pre-check. The triple-hash fast-path (all three
hashes match → return Ok(false) with no writes) is preserved.

This halves the query count for unchanged documents, which dominate
incremental syncs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-06 22:42:26 -05:00
parent 8cf14fb69b
commit 2bfa4f1f8c

View File

@@ -95,38 +95,15 @@ fn regenerate_one(conn: &Connection, source_type: SourceType, source_id: i64) ->
return Ok(true);
};
let existing_hash = get_existing_hash(conn, source_type, source_id)?;
let changed = existing_hash.as_ref() != Some(&doc.content_hash);
upsert_document(conn, &doc)?;
Ok(changed)
upsert_document(conn, &doc)
}
fn get_existing_hash(
conn: &Connection,
source_type: SourceType,
source_id: i64,
) -> Result<Option<String>> {
let mut stmt = conn.prepare_cached(
"SELECT content_hash FROM documents WHERE source_type = ?1 AND source_id = ?2",
)?;
let hash: Option<String> = stmt
.query_row(rusqlite::params![source_type.as_str(), source_id], |row| {
row.get(0)
})
.optional()?;
Ok(hash)
}
fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<()> {
fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<bool> {
conn.execute_batch("SAVEPOINT upsert_doc")?;
match upsert_document_inner(conn, doc) {
Ok(()) => {
Ok(changed) => {
conn.execute_batch("RELEASE upsert_doc")?;
Ok(())
Ok(changed)
}
Err(e) => {
let _ = conn.execute_batch("ROLLBACK TO upsert_doc; RELEASE upsert_doc");
@@ -135,7 +112,7 @@ fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<()> {
}
}
fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<()> {
fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<bool> {
let existing: Option<(i64, String, String, String)> = conn
.query_row(
"SELECT id, content_hash, labels_hash, paths_hash FROM documents
@@ -145,12 +122,17 @@ fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<()> {
)
.optional()?;
let content_changed = match &existing {
Some((_, old_content_hash, _, _)) => old_content_hash != &doc.content_hash,
None => true,
};
if let Some((_, ref old_content_hash, ref old_labels_hash, ref old_paths_hash)) = existing
&& old_content_hash == &doc.content_hash
&& old_labels_hash == &doc.labels_hash
&& old_paths_hash == &doc.paths_hash
{
return Ok(());
return Ok(false);
}
let labels_json = serde_json::to_string(&doc.labels).unwrap_or_else(|_| "[]".to_string());
@@ -260,7 +242,7 @@ fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<()> {
}
}
Ok(())
Ok(content_changed)
}
fn delete_document(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {