fix(embedding): Harden pipeline against chunk overflow, config drift, and partial failures

Reduces CHUNK_MAX_BYTES from 32KB to 6KB and CHUNK_OVERLAP_CHARS from
500 to 200 to stay within nomic-embed-text's 8,192-token context
window. This commit addresses all downstream consequences of that
reduction:

- Config drift detection: find_pending_documents and
  count_pending_documents now take model_name and compare
  chunk_max_bytes, model, and dims against stored metadata. Documents
  embedded with stale config are automatically re-queued.

- Overflow guard: documents producing >= CHUNK_ROWID_MULTIPLIER chunks
  are skipped with a sentinel error recorded in embedding_metadata,
  preventing both rowid collision and infinite re-processing loops.

- Deferred clearing: old embeddings are no longer cleared before
  attempting new ones. clear_document_embeddings is deferred until the
  first successful chunk embedding, so if all chunks fail the document
  retains its previous embeddings rather than losing all data.

- Savepoints: each page of DB writes is wrapped in a SQLite savepoint
  so a crash mid-page rolls back atomically instead of leaving partial
  state (cleared embeddings with no replacements).

- Per-chunk retry on context overflow: when a batch fails with a
  context-length error, each chunk is retried individually so one
  oversized chunk doesn't poison the entire batch.

- Adaptive dedup in vector search: replaces the static 3x over-fetch
  multiplier with a dynamic one based on actual max chunks per document
  (using the new chunk_count column with a fallback COUNT query for
  pre-migration data). Also replaces partial_cmp with total_cmp for
  f64 distance sorting.

- Stores chunk_max_bytes and chunk_count (on sentinel rows) in
  embedding_metadata to support config drift detection and adaptive
  dedup without a per-search COUNT query in the common (post-migration)
  case.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-03 09:35:08 -05:00
parent 2a52594a60
commit 7d07f95d4c
5 changed files with 275 additions and 59 deletions

View File

@@ -12,10 +12,39 @@ pub struct VectorResult {
pub distance: f64,
}
/// Query the maximum number of chunks per document for adaptive dedup sizing.
///
/// Prefers the `chunk_count` persisted on sentinel rows (written since
/// migration 010); for databases created before that migration it falls back
/// to counting chunks per document with a GROUP BY aggregate.
fn max_chunks_per_document(conn: &Connection) -> i64 {
    // Fast path: stored chunk_count on sentinel rows (post-migration 010).
    // MAX() yields NULL when no qualifying rows exist, and query_row errors
    // if the column is missing entirely; both collapse to None here.
    let from_sentinels = conn
        .query_row(
            "SELECT MAX(chunk_count) FROM embedding_metadata
             WHERE chunk_index = 0 AND chunk_count IS NOT NULL",
            [],
            |row| row.get::<_, Option<i64>>(0),
        )
        .ok()
        .flatten();

    match from_sentinels {
        Some(stored_max) => stored_max,
        // Fallback for pre-migration data: count chunks per document,
        // defaulting to 1 when the table is empty or the query fails.
        None => conn
            .query_row(
                "SELECT COALESCE(MAX(cnt), 1) FROM (
                     SELECT COUNT(*) as cnt FROM embedding_metadata
                     WHERE last_error IS NULL GROUP BY document_id
                 )",
                [],
                |row| row.get(0),
            )
            .unwrap_or(1),
    }
}
/// Search documents using sqlite-vec KNN query.
///
/// Over-fetches 3x limit to handle chunk deduplication (multiple chunks per
/// document produce multiple KNN results for the same document_id).
/// Over-fetches by an adaptive multiplier based on actual max chunks per document
/// to handle chunk deduplication (multiple chunks per document produce multiple
/// KNN results for the same document_id).
/// Returns deduplicated results with best (lowest) distance per document.
pub fn search_vector(
conn: &Connection,
@@ -32,7 +61,9 @@ pub fn search_vector(
.flat_map(|f| f.to_le_bytes())
.collect();
let k = limit * 3; // Over-fetch for dedup
let max_chunks = max_chunks_per_document(conn);
let multiplier = ((max_chunks as usize * 3 / 2) + 1).max(8);
let k = limit * multiplier;
let mut stmt = conn.prepare(
"SELECT rowid, distance
@@ -69,7 +100,7 @@ pub fn search_vector(
distance,
})
.collect();
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
results.sort_by(|a, b| a.distance.total_cmp(&b.distance));
results.truncate(limit);
Ok(results)
@@ -132,7 +163,7 @@ mod tests {
.into_iter()
.map(|(document_id, distance)| VectorResult { document_id, distance })
.collect();
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
results.sort_by(|a, b| a.distance.total_cmp(&b.distance));
results.truncate(limit);
results
}