fix(embedding): Harden pipeline against chunk overflow, config drift, and partial failures
Reduces CHUNK_MAX_BYTES from 32KB to 6KB and CHUNK_OVERLAP_CHARS from 500 to 200 to stay within nomic-embed-text's 8,192-token context window. This commit addresses all downstream consequences of that reduction:

- Config drift detection: find_pending_documents and count_pending_documents now take model_name and compare chunk_max_bytes, model, and dims against stored metadata. Documents embedded with stale config are automatically re-queued.
- Overflow guard: documents producing >= CHUNK_ROWID_MULTIPLIER chunks are skipped with a sentinel error recorded in embedding_metadata, preventing both rowid collision and infinite re-processing loops.
- Deferred clearing: old embeddings are no longer cleared before attempting new ones. clear_document_embeddings is deferred until the first successful chunk embedding, so if all chunks fail the document retains its previous embeddings rather than losing all data.
- Savepoints: each page of DB writes is wrapped in a SQLite savepoint so a crash mid-page rolls back atomically instead of leaving partial state (cleared embeddings with no replacements).
- Per-chunk retry on context overflow: when a batch fails with a context-length error, each chunk is retried individually so one oversized chunk doesn't poison the entire batch.
- Adaptive dedup in vector search: replaces the static 3x over-fetch multiplier with a dynamic one based on actual max chunks per document (using the new chunk_count column with a fallback COUNT query for pre-migration data). Also replaces partial_cmp with total_cmp for f64 distance sorting.
- Stores chunk_max_bytes and chunk_count (on sentinel rows) in embedding_metadata to support config drift detection and adaptive dedup without runtime queries.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -12,10 +12,39 @@ pub struct VectorResult {
|
||||
pub distance: f64,
|
||||
}
|
||||
|
||||
/// Query the maximum number of chunks per document for adaptive dedup sizing.
|
||||
fn max_chunks_per_document(conn: &Connection) -> i64 {
|
||||
// Fast path: stored chunk_count on sentinel rows (post-migration 010)
|
||||
let stored: Option<i64> = conn
|
||||
.query_row(
|
||||
"SELECT MAX(chunk_count) FROM embedding_metadata
|
||||
WHERE chunk_index = 0 AND chunk_count IS NOT NULL",
|
||||
[],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.unwrap_or(None);
|
||||
|
||||
if let Some(max) = stored {
|
||||
return max;
|
||||
}
|
||||
|
||||
// Fallback for pre-migration data: count chunks per document
|
||||
conn.query_row(
|
||||
"SELECT COALESCE(MAX(cnt), 1) FROM (
|
||||
SELECT COUNT(*) as cnt FROM embedding_metadata
|
||||
WHERE last_error IS NULL GROUP BY document_id
|
||||
)",
|
||||
[],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.unwrap_or(1)
|
||||
}
|
||||
|
||||
/// Search documents using sqlite-vec KNN query.
|
||||
///
|
||||
/// Over-fetches 3x limit to handle chunk deduplication (multiple chunks per
|
||||
/// document produce multiple KNN results for the same document_id).
|
||||
/// Over-fetches by an adaptive multiplier based on actual max chunks per document
|
||||
/// to handle chunk deduplication (multiple chunks per document produce multiple
|
||||
/// KNN results for the same document_id).
|
||||
/// Returns deduplicated results with best (lowest) distance per document.
|
||||
pub fn search_vector(
|
||||
conn: &Connection,
|
||||
@@ -32,7 +61,9 @@ pub fn search_vector(
|
||||
.flat_map(|f| f.to_le_bytes())
|
||||
.collect();
|
||||
|
||||
let k = limit * 3; // Over-fetch for dedup
|
||||
let max_chunks = max_chunks_per_document(conn);
|
||||
let multiplier = ((max_chunks as usize * 3 / 2) + 1).max(8);
|
||||
let k = limit * multiplier;
|
||||
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT rowid, distance
|
||||
@@ -69,7 +100,7 @@ pub fn search_vector(
|
||||
distance,
|
||||
})
|
||||
.collect();
|
||||
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
|
||||
results.sort_by(|a, b| a.distance.total_cmp(&b.distance));
|
||||
results.truncate(limit);
|
||||
|
||||
Ok(results)
|
||||
@@ -132,7 +163,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|(document_id, distance)| VectorResult { document_id, distance })
|
||||
.collect();
|
||||
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
|
||||
results.sort_by(|a, b| a.distance.total_cmp(&b.distance));
|
||||
results.truncate(limit);
|
||||
results
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user