fix(embedding): Harden pipeline against chunk overflow, config drift, and partial failures
Reduces CHUNK_MAX_BYTES from 32KB to 6KB and CHUNK_OVERLAP_CHARS from 500 to 200 to stay within nomic-embed-text's 8,192-token context window. This commit addresses all downstream consequences of that reduction:

- Config drift detection: find_pending_documents and count_pending_documents now take model_name and compare chunk_max_bytes, model, and dims against stored metadata. Documents embedded with stale config are automatically re-queued.
- Overflow guard: documents producing >= CHUNK_ROWID_MULTIPLIER chunks are skipped with a sentinel error recorded in embedding_metadata, preventing both rowid collision and infinite re-processing loops.
- Deferred clearing: old embeddings are no longer cleared before attempting new ones. clear_document_embeddings is deferred until the first successful chunk embedding, so if all chunks fail the document retains its previous embeddings rather than losing all data.
- Savepoints: each page of DB writes is wrapped in a SQLite savepoint so a crash mid-page rolls back atomically instead of leaving partial state (cleared embeddings with no replacements).
- Per-chunk retry on context overflow: when a batch fails with a context-length error, each chunk is retried individually so one oversized chunk doesn't poison the entire batch.
- Adaptive dedup in vector search: replaces the static 3x over-fetch multiplier with a dynamic one based on actual max chunks per document (using the new chunk_count column with a fallback COUNT query for pre-migration data). Also replaces partial_cmp with total_cmp for f64 distance sorting.
- Stores chunk_max_bytes and chunk_count (on sentinel rows) in embedding_metadata to support config drift detection and adaptive dedup without runtime queries.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
||||
|
||||
/// A document that needs embedding or re-embedding.
|
||||
#[derive(Debug)]
|
||||
@@ -12,17 +13,20 @@ pub struct PendingDocument {
|
||||
pub content_hash: String,
|
||||
}
|
||||
|
||||
/// Find documents that need embedding: new (no metadata) or changed (hash mismatch).
|
||||
/// Find documents that need embedding: new (no metadata), changed (hash mismatch),
|
||||
/// or config-drifted (chunk_max_bytes/model/dims mismatch).
|
||||
///
|
||||
/// Uses keyset pagination (WHERE d.id > last_id) and returns up to `page_size` results.
|
||||
pub fn find_pending_documents(
|
||||
conn: &Connection,
|
||||
page_size: usize,
|
||||
last_id: i64,
|
||||
model_name: &str,
|
||||
) -> Result<Vec<PendingDocument>> {
|
||||
// Documents that either:
|
||||
// 1. Have no embedding_metadata at all (new)
|
||||
// 2. Have metadata where document_hash != content_hash (changed)
|
||||
// 3. Config drift: chunk_max_bytes, model, or dims mismatch (or pre-migration NULL)
|
||||
let sql = r#"
|
||||
SELECT d.id, d.content_text, d.content_hash
|
||||
FROM documents d
|
||||
@@ -37,6 +41,16 @@ pub fn find_pending_documents(
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND (
|
||||
em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?3
|
||||
OR em.model != ?4
|
||||
OR em.dims != ?5
|
||||
)
|
||||
)
|
||||
)
|
||||
ORDER BY d.id
|
||||
LIMIT ?2
|
||||
@@ -44,35 +58,56 @@ pub fn find_pending_documents(
|
||||
|
||||
let mut stmt = conn.prepare(sql)?;
|
||||
let rows = stmt
|
||||
.query_map(rusqlite::params![last_id, page_size as i64], |row| {
|
||||
Ok(PendingDocument {
|
||||
document_id: row.get(0)?,
|
||||
content_text: row.get(1)?,
|
||||
content_hash: row.get(2)?,
|
||||
})
|
||||
})?
|
||||
.query_map(
|
||||
rusqlite::params![
|
||||
last_id,
|
||||
page_size as i64,
|
||||
CHUNK_MAX_BYTES as i64,
|
||||
model_name,
|
||||
EXPECTED_DIMS as i64,
|
||||
],
|
||||
|row| {
|
||||
Ok(PendingDocument {
|
||||
document_id: row.get(0)?,
|
||||
content_text: row.get(1)?,
|
||||
content_hash: row.get(2)?,
|
||||
})
|
||||
},
|
||||
)?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
/// Count total documents that need embedding.
|
||||
pub fn count_pending_documents(conn: &Connection) -> Result<i64> {
|
||||
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
|
||||
let count: i64 = conn.query_row(
|
||||
r#"
|
||||
SELECT COUNT(*)
|
||||
FROM documents d
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
WHERE (
|
||||
NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND (
|
||||
em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?1
|
||||
OR em.model != ?2
|
||||
OR em.dims != ?3
|
||||
)
|
||||
)
|
||||
)
|
||||
"#,
|
||||
[],
|
||||
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
Ok(count)
|
||||
|
||||
Reference in New Issue
Block a user