fix(embedding): Harden pipeline against chunk overflow, config drift, and partial failures

Reduces CHUNK_MAX_BYTES from 32KB to 6KB and CHUNK_OVERLAP_CHARS from
500 to 200 to stay within nomic-embed-text's 8,192-token context
window. This commit addresses all downstream consequences of that
reduction:

- Config drift detection: find_pending_documents and
  count_pending_documents now take model_name and compare
  chunk_max_bytes, model, and dims against stored metadata. Documents
  embedded with stale config are automatically re-queued.

- Overflow guard: documents producing >= CHUNK_ROWID_MULTIPLIER chunks
  are skipped with a sentinel error recorded in embedding_metadata,
  preventing both rowid collision and infinite re-processing loops.

- Deferred clearing: old embeddings are no longer cleared before
  attempting new ones. clear_document_embeddings is deferred until the
  first successful chunk embedding, so if all chunks fail the document
  retains its previous embeddings rather than losing all data.

- Savepoints: each page of DB writes is wrapped in a SQLite savepoint
  so a crash mid-page rolls back atomically instead of leaving partial
  state (cleared embeddings with no replacements).

- Per-chunk retry on context overflow: when a batch fails with a
  context-length error, each chunk is retried individually so one
  oversized chunk doesn't poison the entire batch.

- Adaptive dedup in vector search: replaces the static 3x over-fetch
  multiplier with a dynamic one based on actual max chunks per document
  (using the new chunk_count column with a fallback COUNT query for
  pre-migration data). Also replaces partial_cmp with total_cmp for
  f64 distance sorting.

- Stores chunk_max_bytes and chunk_count (on sentinel rows) in
  embedding_metadata to support config drift detection and adaptive
  dedup without runtime queries.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-03 09:35:08 -05:00
parent 2a52594a60
commit 7d07f95d4c
5 changed files with 275 additions and 59 deletions

View File

@@ -3,6 +3,7 @@
use rusqlite::Connection;
use crate::core::error::Result;
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
/// A document that needs embedding or re-embedding.
#[derive(Debug)]
@@ -12,17 +13,20 @@ pub struct PendingDocument {
pub content_hash: String,
}
/// Find documents that need embedding: new (no metadata) or changed (hash mismatch).
/// Find documents that need embedding: new (no metadata), changed (hash mismatch),
/// or config-drifted (chunk_max_bytes/model/dims mismatch).
///
/// Uses keyset pagination (WHERE d.id > last_id) and returns up to `page_size` results.
pub fn find_pending_documents(
conn: &Connection,
page_size: usize,
last_id: i64,
model_name: &str,
) -> Result<Vec<PendingDocument>> {
// Documents that either:
// 1. Have no embedding_metadata at all (new)
// 2. Have metadata where document_hash != content_hash (changed)
// 3. Config drift: chunk_max_bytes, model, or dims mismatch (or pre-migration NULL)
let sql = r#"
SELECT d.id, d.content_text, d.content_hash
FROM documents d
@@ -37,6 +41,16 @@ pub fn find_pending_documents(
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND (
em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?3
OR em.model != ?4
OR em.dims != ?5
)
)
)
ORDER BY d.id
LIMIT ?2
@@ -44,35 +58,56 @@ pub fn find_pending_documents(
let mut stmt = conn.prepare(sql)?;
let rows = stmt
.query_map(rusqlite::params![last_id, page_size as i64], |row| {
Ok(PendingDocument {
document_id: row.get(0)?,
content_text: row.get(1)?,
content_hash: row.get(2)?,
})
})?
.query_map(
rusqlite::params![
last_id,
page_size as i64,
CHUNK_MAX_BYTES as i64,
model_name,
EXPECTED_DIMS as i64,
],
|row| {
Ok(PendingDocument {
document_id: row.get(0)?,
content_text: row.get(1)?,
content_hash: row.get(2)?,
})
},
)?
.collect::<std::result::Result<Vec<_>, _>>()?;
Ok(rows)
}
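/// Count total documents that need embedding: new (no metadata), changed (hash mismatch),
/// or config-drifted (chunk_max_bytes/model/dims mismatch, or pre-migration NULL chunk_max_bytes).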
pub fn count_pending_documents(conn: &Connection) -> Result<i64> {
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
let count: i64 = conn.query_row(
r#"
SELECT COUNT(*)
FROM documents d
WHERE NOT EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
WHERE (
NOT EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND (
em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?1
OR em.model != ?2
OR em.dims != ?3
)
)
)
"#,
[],
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|row| row.get(0),
)?;
Ok(count)