fix: Savepoint leak in embedding pipeline, atomic fail_job, RRF dedup

Three correctness fixes found during peer code review:

Embedding pipeline savepoint leak (HIGH severity):
The SAVEPOINT embed_page / RELEASE embed_page pattern had ~10 `?`
propagation points between them. Any error from record_embedding_error,
clear_document_embeddings, or store_embedding would exit the function
without rolling back, leaving the SQLite connection in a broken
transactional state and causing cascading failures for the rest of the
session. Fixed by extracting page processing into `embed_page()` and
wrapping with explicit rollback-on-error handling.

Dependent queue fail_job race (MEDIUM severity):
fail_job performed a SELECT followed by a separate UPDATE on the
attempts counter without a transaction. Under concurrent lock
reclamation, the attempts value could be read stale. Replaced with a
single atomic UPDATE that increments attempts and computes exponential
backoff entirely in SQL, also halving DB round-trips. Added explicit
error when the job no longer exists.

RRF duplicate document score inflation (MEDIUM severity):
If a retriever returned the same document_id multiple times, the RRF
score accumulated multiple rank contributions while the rank only
recorded the first occurrence. Moved the score accumulation inside the
`if is_none` guard so only the first occurrence per list contributes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-04 14:16:38 -05:00
parent 266ed78e73
commit 1fdc6d03cc
3 changed files with 279 additions and 235 deletions

View File

@@ -66,227 +66,33 @@ pub async fn embed_documents(
// process crashes mid-page, the savepoint is never released and
// SQLite rolls back — preventing partial document states where old
// embeddings are cleared but new ones haven't been written yet.
//
// We use a closure + match to ensure the savepoint is always
// rolled back on error — bare `execute_batch("SAVEPOINT")` with `?`
// propagation would leak the savepoint and leave the connection in
// a broken transactional state.
conn.execute_batch("SAVEPOINT embed_page")?;
// Build chunk work items for this page
let mut all_chunks: Vec<ChunkWork> = Vec::new();
let mut page_normal_docs: usize = 0;
for doc in &pending {
// Always advance the cursor, even for skipped docs, to avoid re-fetching
last_id = doc.document_id;
if doc.content_text.is_empty() {
result.skipped += 1;
processed += 1;
continue;
let page_result = embed_page(
conn,
client,
model_name,
&pending,
&mut result,
&mut last_id,
&mut processed,
total,
&progress_callback,
)
.await;
match page_result {
Ok(()) => {
conn.execute_batch("RELEASE embed_page")?;
}
let chunks = split_into_chunks(&doc.content_text);
let total_chunks = chunks.len();
// Overflow guard: skip documents that produce too many chunks.
// Must run BEFORE clear_document_embeddings so existing embeddings
// are preserved when we skip.
if total_chunks as i64 > CHUNK_ROWID_MULTIPLIER {
warn!(
doc_id = doc.document_id,
chunk_count = total_chunks,
max = CHUNK_ROWID_MULTIPLIER,
"Document produces too many chunks, skipping to prevent rowid collision"
);
// Record a sentinel error so the document is not re-detected as
// pending on subsequent runs (prevents infinite re-processing).
record_embedding_error(
conn,
doc.document_id,
0, // sentinel chunk_index
&doc.content_hash,
"overflow-sentinel",
model_name,
&format!(
"Document produces {} chunks, exceeding max {}",
total_chunks, CHUNK_ROWID_MULTIPLIER
),
)?;
result.skipped += 1;
processed += 1;
if let Some(ref cb) = progress_callback {
cb(processed, total);
}
continue;
}
// Don't clear existing embeddings here — defer until the first
// successful chunk embedding so that if ALL chunks for a document
// fail, old embeddings survive instead of leaving zero data.
for (chunk_index, text) in chunks {
all_chunks.push(ChunkWork {
doc_id: doc.document_id,
chunk_index,
total_chunks,
doc_hash: doc.content_hash.clone(),
chunk_hash: sha256_hash(&text),
text,
});
}
page_normal_docs += 1;
// Don't fire progress here — wait until embedding completes below.
}
// Track documents whose old embeddings have been cleared.
// We defer clearing until the first successful chunk embedding so
// that if ALL chunks for a document fail, old embeddings survive.
let mut cleared_docs: HashSet<i64> = HashSet::new();
// Process chunks in batches of BATCH_SIZE
for batch in all_chunks.chunks(BATCH_SIZE) {
let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
match client.embed_batch(texts).await {
Ok(embeddings) => {
for (i, embedding) in embeddings.iter().enumerate() {
if i >= batch.len() {
break;
}
let chunk = &batch[i];
if embedding.len() != EXPECTED_DIMS {
warn!(
doc_id = chunk.doc_id,
chunk_index = chunk.chunk_index,
got_dims = embedding.len(),
expected = EXPECTED_DIMS,
"Dimension mismatch, skipping"
);
record_embedding_error(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
&format!(
"Dimension mismatch: got {}, expected {}",
embedding.len(),
EXPECTED_DIMS
),
)?;
result.failed += 1;
continue;
}
// Clear old embeddings on first successful chunk for this document
if !cleared_docs.contains(&chunk.doc_id) {
clear_document_embeddings(conn, chunk.doc_id)?;
cleared_docs.insert(chunk.doc_id);
}
store_embedding(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
embedding,
chunk.total_chunks,
)?;
result.embedded += 1;
}
}
Err(e) => {
// Batch failed — retry each chunk individually so one
// oversized chunk doesn't poison the entire batch.
let err_str = e.to_string();
let err_lower = err_str.to_lowercase();
// Ollama error messages vary across versions. Match broadly
// against known patterns to detect context-window overflow.
let is_context_error = err_lower.contains("context length")
|| err_lower.contains("too long")
|| err_lower.contains("maximum context")
|| err_lower.contains("token limit")
|| err_lower.contains("exceeds")
|| (err_lower.contains("413") && err_lower.contains("http"));
if is_context_error && batch.len() > 1 {
warn!(
"Batch failed with context length error, retrying chunks individually"
);
for chunk in batch {
match client.embed_batch(vec![chunk.text.clone()]).await {
Ok(embeddings)
if !embeddings.is_empty()
&& embeddings[0].len() == EXPECTED_DIMS =>
{
// Clear old embeddings on first successful chunk
if !cleared_docs.contains(&chunk.doc_id) {
clear_document_embeddings(conn, chunk.doc_id)?;
cleared_docs.insert(chunk.doc_id);
}
store_embedding(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
&embeddings[0],
chunk.total_chunks,
)?;
result.embedded += 1;
}
_ => {
warn!(
doc_id = chunk.doc_id,
chunk_index = chunk.chunk_index,
chunk_bytes = chunk.text.len(),
"Chunk too large for model context window"
);
record_embedding_error(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
"Chunk exceeds model context window",
)?;
result.failed += 1;
}
}
}
} else {
warn!(error = %e, "Batch embedding failed");
for chunk in batch {
record_embedding_error(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
&e.to_string(),
)?;
result.failed += 1;
}
}
}
Err(e) => {
let _ = conn.execute_batch("ROLLBACK TO embed_page; RELEASE embed_page");
return Err(e);
}
}
// Fire progress for all normal documents after embedding completes.
// This ensures progress reflects actual embedding work, not just chunking.
processed += page_normal_docs;
if let Some(ref cb) = progress_callback {
cb(processed, total);
}
// Commit all DB writes for this page atomically.
conn.execute_batch("RELEASE embed_page")?;
}
info!(
@@ -303,6 +109,240 @@ pub async fn embed_documents(
Ok(result)
}
/// Process a single page of pending documents within an active savepoint.
///
/// All `?` propagation from this function is caught by the caller, which
/// rolls back the savepoint on error.
#[allow(clippy::too_many_arguments)]
async fn embed_page(
conn: &Connection,
client: &OllamaClient,
model_name: &str,
pending: &[crate::embedding::change_detector::PendingDocument],
result: &mut EmbedResult,
last_id: &mut i64,
processed: &mut usize,
total: usize,
progress_callback: &Option<Box<dyn Fn(usize, usize)>>,
) -> Result<()> {
// Build chunk work items for this page
let mut all_chunks: Vec<ChunkWork> = Vec::new();
let mut page_normal_docs: usize = 0;
for doc in pending {
// Always advance the cursor, even for skipped docs, to avoid re-fetching
*last_id = doc.document_id;
if doc.content_text.is_empty() {
result.skipped += 1;
*processed += 1;
continue;
}
let chunks = split_into_chunks(&doc.content_text);
let total_chunks = chunks.len();
// Overflow guard: skip documents that produce too many chunks.
// Must run BEFORE clear_document_embeddings so existing embeddings
// are preserved when we skip.
if total_chunks as i64 > CHUNK_ROWID_MULTIPLIER {
warn!(
doc_id = doc.document_id,
chunk_count = total_chunks,
max = CHUNK_ROWID_MULTIPLIER,
"Document produces too many chunks, skipping to prevent rowid collision"
);
// Record a sentinel error so the document is not re-detected as
// pending on subsequent runs (prevents infinite re-processing).
record_embedding_error(
conn,
doc.document_id,
0, // sentinel chunk_index
&doc.content_hash,
"overflow-sentinel",
model_name,
&format!(
"Document produces {} chunks, exceeding max {}",
total_chunks, CHUNK_ROWID_MULTIPLIER
),
)?;
result.skipped += 1;
*processed += 1;
if let Some(cb) = progress_callback {
cb(*processed, total);
}
continue;
}
// Don't clear existing embeddings here — defer until the first
// successful chunk embedding so that if ALL chunks for a document
// fail, old embeddings survive instead of leaving zero data.
for (chunk_index, text) in chunks {
all_chunks.push(ChunkWork {
doc_id: doc.document_id,
chunk_index,
total_chunks,
doc_hash: doc.content_hash.clone(),
chunk_hash: sha256_hash(&text),
text,
});
}
page_normal_docs += 1;
// Don't fire progress here — wait until embedding completes below.
}
// Track documents whose old embeddings have been cleared.
// We defer clearing until the first successful chunk embedding so
// that if ALL chunks for a document fail, old embeddings survive.
let mut cleared_docs: HashSet<i64> = HashSet::new();
// Process chunks in batches of BATCH_SIZE
for batch in all_chunks.chunks(BATCH_SIZE) {
let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
match client.embed_batch(texts).await {
Ok(embeddings) => {
for (i, embedding) in embeddings.iter().enumerate() {
if i >= batch.len() {
break;
}
let chunk = &batch[i];
if embedding.len() != EXPECTED_DIMS {
warn!(
doc_id = chunk.doc_id,
chunk_index = chunk.chunk_index,
got_dims = embedding.len(),
expected = EXPECTED_DIMS,
"Dimension mismatch, skipping"
);
record_embedding_error(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
&format!(
"Dimension mismatch: got {}, expected {}",
embedding.len(),
EXPECTED_DIMS
),
)?;
result.failed += 1;
continue;
}
// Clear old embeddings on first successful chunk for this document
if !cleared_docs.contains(&chunk.doc_id) {
clear_document_embeddings(conn, chunk.doc_id)?;
cleared_docs.insert(chunk.doc_id);
}
store_embedding(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
embedding,
chunk.total_chunks,
)?;
result.embedded += 1;
}
}
Err(e) => {
// Batch failed — retry each chunk individually so one
// oversized chunk doesn't poison the entire batch.
let err_str = e.to_string();
let err_lower = err_str.to_lowercase();
// Ollama error messages vary across versions. Match broadly
// against known patterns to detect context-window overflow.
let is_context_error = err_lower.contains("context length")
|| err_lower.contains("too long")
|| err_lower.contains("maximum context")
|| err_lower.contains("token limit")
|| err_lower.contains("exceeds")
|| (err_lower.contains("413") && err_lower.contains("http"));
if is_context_error && batch.len() > 1 {
warn!("Batch failed with context length error, retrying chunks individually");
for chunk in batch {
match client.embed_batch(vec![chunk.text.clone()]).await {
Ok(embeddings)
if !embeddings.is_empty()
&& embeddings[0].len() == EXPECTED_DIMS =>
{
// Clear old embeddings on first successful chunk
if !cleared_docs.contains(&chunk.doc_id) {
clear_document_embeddings(conn, chunk.doc_id)?;
cleared_docs.insert(chunk.doc_id);
}
store_embedding(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
&embeddings[0],
chunk.total_chunks,
)?;
result.embedded += 1;
}
_ => {
warn!(
doc_id = chunk.doc_id,
chunk_index = chunk.chunk_index,
chunk_bytes = chunk.text.len(),
"Chunk too large for model context window"
);
record_embedding_error(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
"Chunk exceeds model context window",
)?;
result.failed += 1;
}
}
}
} else {
warn!(error = %e, "Batch embedding failed");
for chunk in batch {
record_embedding_error(
conn,
chunk.doc_id,
chunk.chunk_index,
&chunk.doc_hash,
&chunk.chunk_hash,
model_name,
&e.to_string(),
)?;
result.failed += 1;
}
}
}
}
}
// Fire progress for all normal documents after embedding completes.
// This ensures progress reflects actual embedding work, not just chunking.
*processed += page_normal_docs;
if let Some(cb) = progress_callback {
cb(*processed, total);
}
Ok(())
}
/// Clear all embeddings and metadata for a document.
fn clear_document_embeddings(conn: &Connection, document_id: i64) -> Result<()> {
conn.execute(