fix: Savepoint leak in embedding pipeline, atomic fail_job, RRF dedup
Three correctness fixes found during peer code review:

Embedding pipeline savepoint leak (HIGH severity): The SAVEPOINT embed_page / RELEASE embed_page pattern had ~10 `?` propagation points between them. Any error from record_embedding_error, clear_document_embeddings, or store_embedding would exit the function without rolling back, leaving the SQLite connection in a broken transactional state and causing cascading failures for the rest of the session. Fixed by extracting page processing into `embed_page()` and wrapping with explicit rollback-on-error handling.

Dependent queue fail_job race (MEDIUM severity): fail_job performed a SELECT followed by a separate UPDATE on the attempts counter without a transaction. Under concurrent lock reclamation, the attempts value could be read stale. Replaced with a single atomic UPDATE that increments attempts and computes exponential backoff entirely in SQL, also halving DB round-trips. Added explicit error when the job no longer exists.

RRF duplicate document score inflation (MEDIUM severity): If a retriever returned the same document_id multiple times, the RRF score accumulated multiple rank contributions while the rank only recorded the first occurrence. Moved the score accumulation inside the `if is_none` guard so only the first occurrence per list contributes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -122,28 +122,30 @@ pub fn complete_job(conn: &Connection, job_id: i64) -> Result<()> {
|
|||||||
/// Mark a job as failed. Increments attempts, sets next_retry_at with exponential
|
/// Mark a job as failed. Increments attempts, sets next_retry_at with exponential
|
||||||
/// backoff, clears locked_at, and records the error.
|
/// backoff, clears locked_at, and records the error.
|
||||||
///
|
///
|
||||||
/// Backoff: 30s * 2^(attempts-1), capped at 480s.
|
/// Backoff: 30s * 2^(attempts before this failure), capped at 480s. Uses a single atomic UPDATE
|
||||||
|
/// to avoid a read-then-write race on the `attempts` counter.
|
||||||
pub fn fail_job(conn: &Connection, job_id: i64, error: &str) -> Result<()> {
|
pub fn fail_job(conn: &Connection, job_id: i64, error: &str) -> Result<()> {
|
||||||
let now = now_ms();
|
let now = now_ms();
|
||||||
|
|
||||||
// Get current attempts (propagate error if job no longer exists)
|
// Atomic increment + backoff calculation in one UPDATE.
|
||||||
let current_attempts: i32 = conn.query_row(
|
// MIN(attempts, 4) caps the shift to prevent overflow; the overall
|
||||||
"SELECT attempts FROM pending_dependent_fetches WHERE id = ?1",
|
// backoff is clamped to 480 000 ms via MIN(..., 480000).
|
||||||
rusqlite::params![job_id],
|
let changes = conn.execute(
|
||||||
|row| row.get(0),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let new_attempts = current_attempts + 1;
|
|
||||||
let backoff_ms: i64 = (30_000i64 * (1i64 << (new_attempts - 1).min(4))).min(480_000);
|
|
||||||
let next_retry = now + backoff_ms;
|
|
||||||
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE pending_dependent_fetches
|
"UPDATE pending_dependent_fetches
|
||||||
SET attempts = ?1, next_retry_at = ?2, locked_at = NULL, last_error = ?3
|
SET attempts = attempts + 1,
|
||||||
WHERE id = ?4",
|
next_retry_at = ?1 + MIN(30000 * (1 << MIN(attempts, 4)), 480000),
|
||||||
rusqlite::params![new_attempts, next_retry, error, job_id],
|
locked_at = NULL,
|
||||||
|
last_error = ?2
|
||||||
|
WHERE id = ?3",
|
||||||
|
rusqlite::params![now, error, job_id],
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
if changes == 0 {
|
||||||
|
return Err(crate::core::error::LoreError::Other(
|
||||||
|
"fail_job: job not found (may have been reclaimed or completed)".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -66,227 +66,33 @@ pub async fn embed_documents(
|
|||||||
// process crashes mid-page, the savepoint is never released and
|
// process crashes mid-page, the savepoint is never released and
|
||||||
// SQLite rolls back — preventing partial document states where old
|
// SQLite rolls back — preventing partial document states where old
|
||||||
// embeddings are cleared but new ones haven't been written yet.
|
// embeddings are cleared but new ones haven't been written yet.
|
||||||
|
//
|
||||||
|
// We call `embed_page` and match on its result to ensure the savepoint is always
|
||||||
|
// rolled back on error — bare `execute_batch("SAVEPOINT")` with `?`
|
||||||
|
// propagation would leak the savepoint and leave the connection in
|
||||||
|
// a broken transactional state.
|
||||||
conn.execute_batch("SAVEPOINT embed_page")?;
|
conn.execute_batch("SAVEPOINT embed_page")?;
|
||||||
|
let page_result = embed_page(
|
||||||
// Build chunk work items for this page
|
conn,
|
||||||
let mut all_chunks: Vec<ChunkWork> = Vec::new();
|
client,
|
||||||
let mut page_normal_docs: usize = 0;
|
model_name,
|
||||||
|
&pending,
|
||||||
for doc in &pending {
|
&mut result,
|
||||||
// Always advance the cursor, even for skipped docs, to avoid re-fetching
|
&mut last_id,
|
||||||
last_id = doc.document_id;
|
&mut processed,
|
||||||
|
total,
|
||||||
if doc.content_text.is_empty() {
|
&progress_callback,
|
||||||
result.skipped += 1;
|
)
|
||||||
processed += 1;
|
.await;
|
||||||
continue;
|
match page_result {
|
||||||
|
Ok(()) => {
|
||||||
|
conn.execute_batch("RELEASE embed_page")?;
|
||||||
}
|
}
|
||||||
|
Err(e) => {
|
||||||
let chunks = split_into_chunks(&doc.content_text);
|
let _ = conn.execute_batch("ROLLBACK TO embed_page; RELEASE embed_page");
|
||||||
let total_chunks = chunks.len();
|
return Err(e);
|
||||||
|
|
||||||
// Overflow guard: skip documents that produce too many chunks.
|
|
||||||
// Must run BEFORE clear_document_embeddings so existing embeddings
|
|
||||||
// are preserved when we skip.
|
|
||||||
if total_chunks as i64 > CHUNK_ROWID_MULTIPLIER {
|
|
||||||
warn!(
|
|
||||||
doc_id = doc.document_id,
|
|
||||||
chunk_count = total_chunks,
|
|
||||||
max = CHUNK_ROWID_MULTIPLIER,
|
|
||||||
"Document produces too many chunks, skipping to prevent rowid collision"
|
|
||||||
);
|
|
||||||
// Record a sentinel error so the document is not re-detected as
|
|
||||||
// pending on subsequent runs (prevents infinite re-processing).
|
|
||||||
record_embedding_error(
|
|
||||||
conn,
|
|
||||||
doc.document_id,
|
|
||||||
0, // sentinel chunk_index
|
|
||||||
&doc.content_hash,
|
|
||||||
"overflow-sentinel",
|
|
||||||
model_name,
|
|
||||||
&format!(
|
|
||||||
"Document produces {} chunks, exceeding max {}",
|
|
||||||
total_chunks, CHUNK_ROWID_MULTIPLIER
|
|
||||||
),
|
|
||||||
)?;
|
|
||||||
result.skipped += 1;
|
|
||||||
processed += 1;
|
|
||||||
if let Some(ref cb) = progress_callback {
|
|
||||||
cb(processed, total);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Don't clear existing embeddings here — defer until the first
|
|
||||||
// successful chunk embedding so that if ALL chunks for a document
|
|
||||||
// fail, old embeddings survive instead of leaving zero data.
|
|
||||||
|
|
||||||
for (chunk_index, text) in chunks {
|
|
||||||
all_chunks.push(ChunkWork {
|
|
||||||
doc_id: doc.document_id,
|
|
||||||
chunk_index,
|
|
||||||
total_chunks,
|
|
||||||
doc_hash: doc.content_hash.clone(),
|
|
||||||
chunk_hash: sha256_hash(&text),
|
|
||||||
text,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
page_normal_docs += 1;
|
|
||||||
// Don't fire progress here — wait until embedding completes below.
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track documents whose old embeddings have been cleared.
|
|
||||||
// We defer clearing until the first successful chunk embedding so
|
|
||||||
// that if ALL chunks for a document fail, old embeddings survive.
|
|
||||||
let mut cleared_docs: HashSet<i64> = HashSet::new();
|
|
||||||
|
|
||||||
// Process chunks in batches of BATCH_SIZE
|
|
||||||
for batch in all_chunks.chunks(BATCH_SIZE) {
|
|
||||||
let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
|
|
||||||
|
|
||||||
match client.embed_batch(texts).await {
|
|
||||||
Ok(embeddings) => {
|
|
||||||
for (i, embedding) in embeddings.iter().enumerate() {
|
|
||||||
if i >= batch.len() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let chunk = &batch[i];
|
|
||||||
|
|
||||||
if embedding.len() != EXPECTED_DIMS {
|
|
||||||
warn!(
|
|
||||||
doc_id = chunk.doc_id,
|
|
||||||
chunk_index = chunk.chunk_index,
|
|
||||||
got_dims = embedding.len(),
|
|
||||||
expected = EXPECTED_DIMS,
|
|
||||||
"Dimension mismatch, skipping"
|
|
||||||
);
|
|
||||||
record_embedding_error(
|
|
||||||
conn,
|
|
||||||
chunk.doc_id,
|
|
||||||
chunk.chunk_index,
|
|
||||||
&chunk.doc_hash,
|
|
||||||
&chunk.chunk_hash,
|
|
||||||
model_name,
|
|
||||||
&format!(
|
|
||||||
"Dimension mismatch: got {}, expected {}",
|
|
||||||
embedding.len(),
|
|
||||||
EXPECTED_DIMS
|
|
||||||
),
|
|
||||||
)?;
|
|
||||||
result.failed += 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear old embeddings on first successful chunk for this document
|
|
||||||
if !cleared_docs.contains(&chunk.doc_id) {
|
|
||||||
clear_document_embeddings(conn, chunk.doc_id)?;
|
|
||||||
cleared_docs.insert(chunk.doc_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
store_embedding(
|
|
||||||
conn,
|
|
||||||
chunk.doc_id,
|
|
||||||
chunk.chunk_index,
|
|
||||||
&chunk.doc_hash,
|
|
||||||
&chunk.chunk_hash,
|
|
||||||
model_name,
|
|
||||||
embedding,
|
|
||||||
chunk.total_chunks,
|
|
||||||
)?;
|
|
||||||
result.embedded += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
// Batch failed — retry each chunk individually so one
|
|
||||||
// oversized chunk doesn't poison the entire batch.
|
|
||||||
let err_str = e.to_string();
|
|
||||||
let err_lower = err_str.to_lowercase();
|
|
||||||
// Ollama error messages vary across versions. Match broadly
|
|
||||||
// against known patterns to detect context-window overflow.
|
|
||||||
let is_context_error = err_lower.contains("context length")
|
|
||||||
|| err_lower.contains("too long")
|
|
||||||
|| err_lower.contains("maximum context")
|
|
||||||
|| err_lower.contains("token limit")
|
|
||||||
|| err_lower.contains("exceeds")
|
|
||||||
|| (err_lower.contains("413") && err_lower.contains("http"));
|
|
||||||
|
|
||||||
if is_context_error && batch.len() > 1 {
|
|
||||||
warn!(
|
|
||||||
"Batch failed with context length error, retrying chunks individually"
|
|
||||||
);
|
|
||||||
for chunk in batch {
|
|
||||||
match client.embed_batch(vec![chunk.text.clone()]).await {
|
|
||||||
Ok(embeddings)
|
|
||||||
if !embeddings.is_empty()
|
|
||||||
&& embeddings[0].len() == EXPECTED_DIMS =>
|
|
||||||
{
|
|
||||||
// Clear old embeddings on first successful chunk
|
|
||||||
if !cleared_docs.contains(&chunk.doc_id) {
|
|
||||||
clear_document_embeddings(conn, chunk.doc_id)?;
|
|
||||||
cleared_docs.insert(chunk.doc_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
store_embedding(
|
|
||||||
conn,
|
|
||||||
chunk.doc_id,
|
|
||||||
chunk.chunk_index,
|
|
||||||
&chunk.doc_hash,
|
|
||||||
&chunk.chunk_hash,
|
|
||||||
model_name,
|
|
||||||
&embeddings[0],
|
|
||||||
chunk.total_chunks,
|
|
||||||
)?;
|
|
||||||
result.embedded += 1;
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
warn!(
|
|
||||||
doc_id = chunk.doc_id,
|
|
||||||
chunk_index = chunk.chunk_index,
|
|
||||||
chunk_bytes = chunk.text.len(),
|
|
||||||
"Chunk too large for model context window"
|
|
||||||
);
|
|
||||||
record_embedding_error(
|
|
||||||
conn,
|
|
||||||
chunk.doc_id,
|
|
||||||
chunk.chunk_index,
|
|
||||||
&chunk.doc_hash,
|
|
||||||
&chunk.chunk_hash,
|
|
||||||
model_name,
|
|
||||||
"Chunk exceeds model context window",
|
|
||||||
)?;
|
|
||||||
result.failed += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
warn!(error = %e, "Batch embedding failed");
|
|
||||||
for chunk in batch {
|
|
||||||
record_embedding_error(
|
|
||||||
conn,
|
|
||||||
chunk.doc_id,
|
|
||||||
chunk.chunk_index,
|
|
||||||
&chunk.doc_hash,
|
|
||||||
&chunk.chunk_hash,
|
|
||||||
model_name,
|
|
||||||
&e.to_string(),
|
|
||||||
)?;
|
|
||||||
result.failed += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fire progress for all normal documents after embedding completes.
|
|
||||||
// This ensures progress reflects actual embedding work, not just chunking.
|
|
||||||
processed += page_normal_docs;
|
|
||||||
if let Some(ref cb) = progress_callback {
|
|
||||||
cb(processed, total);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Commit all DB writes for this page atomically.
|
|
||||||
conn.execute_batch("RELEASE embed_page")?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
@@ -303,6 +109,240 @@ pub async fn embed_documents(
|
|||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Process a single page of pending documents within an active savepoint.
|
||||||
|
///
|
||||||
|
/// All `?` propagation from this function is caught by the caller, which
|
||||||
|
/// rolls back the savepoint on error.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
async fn embed_page(
|
||||||
|
conn: &Connection,
|
||||||
|
client: &OllamaClient,
|
||||||
|
model_name: &str,
|
||||||
|
pending: &[crate::embedding::change_detector::PendingDocument],
|
||||||
|
result: &mut EmbedResult,
|
||||||
|
last_id: &mut i64,
|
||||||
|
processed: &mut usize,
|
||||||
|
total: usize,
|
||||||
|
progress_callback: &Option<Box<dyn Fn(usize, usize)>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
// Build chunk work items for this page
|
||||||
|
let mut all_chunks: Vec<ChunkWork> = Vec::new();
|
||||||
|
let mut page_normal_docs: usize = 0;
|
||||||
|
|
||||||
|
for doc in pending {
|
||||||
|
// Always advance the cursor, even for skipped docs, to avoid re-fetching
|
||||||
|
*last_id = doc.document_id;
|
||||||
|
|
||||||
|
if doc.content_text.is_empty() {
|
||||||
|
result.skipped += 1;
|
||||||
|
*processed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let chunks = split_into_chunks(&doc.content_text);
|
||||||
|
let total_chunks = chunks.len();
|
||||||
|
|
||||||
|
// Overflow guard: skip documents that produce too many chunks.
|
||||||
|
// Must run BEFORE clear_document_embeddings so existing embeddings
|
||||||
|
// are preserved when we skip.
|
||||||
|
if total_chunks as i64 > CHUNK_ROWID_MULTIPLIER {
|
||||||
|
warn!(
|
||||||
|
doc_id = doc.document_id,
|
||||||
|
chunk_count = total_chunks,
|
||||||
|
max = CHUNK_ROWID_MULTIPLIER,
|
||||||
|
"Document produces too many chunks, skipping to prevent rowid collision"
|
||||||
|
);
|
||||||
|
// Record a sentinel error so the document is not re-detected as
|
||||||
|
// pending on subsequent runs (prevents infinite re-processing).
|
||||||
|
record_embedding_error(
|
||||||
|
conn,
|
||||||
|
doc.document_id,
|
||||||
|
0, // sentinel chunk_index
|
||||||
|
&doc.content_hash,
|
||||||
|
"overflow-sentinel",
|
||||||
|
model_name,
|
||||||
|
&format!(
|
||||||
|
"Document produces {} chunks, exceeding max {}",
|
||||||
|
total_chunks, CHUNK_ROWID_MULTIPLIER
|
||||||
|
),
|
||||||
|
)?;
|
||||||
|
result.skipped += 1;
|
||||||
|
*processed += 1;
|
||||||
|
if let Some(cb) = progress_callback {
|
||||||
|
cb(*processed, total);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Don't clear existing embeddings here — defer until the first
|
||||||
|
// successful chunk embedding so that if ALL chunks for a document
|
||||||
|
// fail, old embeddings survive instead of leaving zero data.
|
||||||
|
|
||||||
|
for (chunk_index, text) in chunks {
|
||||||
|
all_chunks.push(ChunkWork {
|
||||||
|
doc_id: doc.document_id,
|
||||||
|
chunk_index,
|
||||||
|
total_chunks,
|
||||||
|
doc_hash: doc.content_hash.clone(),
|
||||||
|
chunk_hash: sha256_hash(&text),
|
||||||
|
text,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
page_normal_docs += 1;
|
||||||
|
// Don't fire progress here — wait until embedding completes below.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track documents whose old embeddings have been cleared.
|
||||||
|
// We defer clearing until the first successful chunk embedding so
|
||||||
|
// that if ALL chunks for a document fail, old embeddings survive.
|
||||||
|
let mut cleared_docs: HashSet<i64> = HashSet::new();
|
||||||
|
|
||||||
|
// Process chunks in batches of BATCH_SIZE
|
||||||
|
for batch in all_chunks.chunks(BATCH_SIZE) {
|
||||||
|
let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
|
||||||
|
|
||||||
|
match client.embed_batch(texts).await {
|
||||||
|
Ok(embeddings) => {
|
||||||
|
for (i, embedding) in embeddings.iter().enumerate() {
|
||||||
|
if i >= batch.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let chunk = &batch[i];
|
||||||
|
|
||||||
|
if embedding.len() != EXPECTED_DIMS {
|
||||||
|
warn!(
|
||||||
|
doc_id = chunk.doc_id,
|
||||||
|
chunk_index = chunk.chunk_index,
|
||||||
|
got_dims = embedding.len(),
|
||||||
|
expected = EXPECTED_DIMS,
|
||||||
|
"Dimension mismatch, skipping"
|
||||||
|
);
|
||||||
|
record_embedding_error(
|
||||||
|
conn,
|
||||||
|
chunk.doc_id,
|
||||||
|
chunk.chunk_index,
|
||||||
|
&chunk.doc_hash,
|
||||||
|
&chunk.chunk_hash,
|
||||||
|
model_name,
|
||||||
|
&format!(
|
||||||
|
"Dimension mismatch: got {}, expected {}",
|
||||||
|
embedding.len(),
|
||||||
|
EXPECTED_DIMS
|
||||||
|
),
|
||||||
|
)?;
|
||||||
|
result.failed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear old embeddings on first successful chunk for this document
|
||||||
|
if !cleared_docs.contains(&chunk.doc_id) {
|
||||||
|
clear_document_embeddings(conn, chunk.doc_id)?;
|
||||||
|
cleared_docs.insert(chunk.doc_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
store_embedding(
|
||||||
|
conn,
|
||||||
|
chunk.doc_id,
|
||||||
|
chunk.chunk_index,
|
||||||
|
&chunk.doc_hash,
|
||||||
|
&chunk.chunk_hash,
|
||||||
|
model_name,
|
||||||
|
embedding,
|
||||||
|
chunk.total_chunks,
|
||||||
|
)?;
|
||||||
|
result.embedded += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Batch failed — retry each chunk individually so one
|
||||||
|
// oversized chunk doesn't poison the entire batch.
|
||||||
|
let err_str = e.to_string();
|
||||||
|
let err_lower = err_str.to_lowercase();
|
||||||
|
// Ollama error messages vary across versions. Match broadly
|
||||||
|
// against known patterns to detect context-window overflow.
|
||||||
|
let is_context_error = err_lower.contains("context length")
|
||||||
|
|| err_lower.contains("too long")
|
||||||
|
|| err_lower.contains("maximum context")
|
||||||
|
|| err_lower.contains("token limit")
|
||||||
|
|| err_lower.contains("exceeds")
|
||||||
|
|| (err_lower.contains("413") && err_lower.contains("http"));
|
||||||
|
|
||||||
|
if is_context_error && batch.len() > 1 {
|
||||||
|
warn!("Batch failed with context length error, retrying chunks individually");
|
||||||
|
for chunk in batch {
|
||||||
|
match client.embed_batch(vec![chunk.text.clone()]).await {
|
||||||
|
Ok(embeddings)
|
||||||
|
if !embeddings.is_empty()
|
||||||
|
&& embeddings[0].len() == EXPECTED_DIMS =>
|
||||||
|
{
|
||||||
|
// Clear old embeddings on first successful chunk
|
||||||
|
if !cleared_docs.contains(&chunk.doc_id) {
|
||||||
|
clear_document_embeddings(conn, chunk.doc_id)?;
|
||||||
|
cleared_docs.insert(chunk.doc_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
store_embedding(
|
||||||
|
conn,
|
||||||
|
chunk.doc_id,
|
||||||
|
chunk.chunk_index,
|
||||||
|
&chunk.doc_hash,
|
||||||
|
&chunk.chunk_hash,
|
||||||
|
model_name,
|
||||||
|
&embeddings[0],
|
||||||
|
chunk.total_chunks,
|
||||||
|
)?;
|
||||||
|
result.embedded += 1;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
warn!(
|
||||||
|
doc_id = chunk.doc_id,
|
||||||
|
chunk_index = chunk.chunk_index,
|
||||||
|
chunk_bytes = chunk.text.len(),
|
||||||
|
"Chunk too large for model context window"
|
||||||
|
);
|
||||||
|
record_embedding_error(
|
||||||
|
conn,
|
||||||
|
chunk.doc_id,
|
||||||
|
chunk.chunk_index,
|
||||||
|
&chunk.doc_hash,
|
||||||
|
&chunk.chunk_hash,
|
||||||
|
model_name,
|
||||||
|
"Chunk exceeds model context window",
|
||||||
|
)?;
|
||||||
|
result.failed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
warn!(error = %e, "Batch embedding failed");
|
||||||
|
for chunk in batch {
|
||||||
|
record_embedding_error(
|
||||||
|
conn,
|
||||||
|
chunk.doc_id,
|
||||||
|
chunk.chunk_index,
|
||||||
|
&chunk.doc_hash,
|
||||||
|
&chunk.chunk_hash,
|
||||||
|
model_name,
|
||||||
|
&e.to_string(),
|
||||||
|
)?;
|
||||||
|
result.failed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fire progress for all normal documents after embedding completes.
|
||||||
|
// This ensures progress reflects actual embedding work, not just chunking.
|
||||||
|
*processed += page_normal_docs;
|
||||||
|
if let Some(cb) = progress_callback {
|
||||||
|
cb(*processed, total);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Clear all embeddings and metadata for a document.
|
/// Clear all embeddings and metadata for a document.
|
||||||
fn clear_document_embeddings(conn: &Connection, document_id: i64) -> Result<()> {
|
fn clear_document_embeddings(conn: &Connection, document_id: i64) -> Result<()> {
|
||||||
conn.execute(
|
conn.execute(
|
||||||
|
|||||||
@@ -33,8 +33,10 @@ pub fn rank_rrf(vector_results: &[(i64, f64)], fts_results: &[(i64, f64)]) -> Ve
|
|||||||
for (i, &(doc_id, _)) in vector_results.iter().enumerate() {
|
for (i, &(doc_id, _)) in vector_results.iter().enumerate() {
|
||||||
let rank = i + 1; // 1-indexed
|
let rank = i + 1; // 1-indexed
|
||||||
let entry = scores.entry(doc_id).or_insert((0.0, None, None));
|
let entry = scores.entry(doc_id).or_insert((0.0, None, None));
|
||||||
entry.0 += 1.0 / (RRF_K + rank as f64);
|
// Only count the first occurrence per list to prevent duplicates
|
||||||
|
// from inflating the score.
|
||||||
if entry.1.is_none() {
|
if entry.1.is_none() {
|
||||||
|
entry.0 += 1.0 / (RRF_K + rank as f64);
|
||||||
entry.1 = Some(rank);
|
entry.1 = Some(rank);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -42,8 +44,8 @@ pub fn rank_rrf(vector_results: &[(i64, f64)], fts_results: &[(i64, f64)]) -> Ve
|
|||||||
for (i, &(doc_id, _)) in fts_results.iter().enumerate() {
|
for (i, &(doc_id, _)) in fts_results.iter().enumerate() {
|
||||||
let rank = i + 1; // 1-indexed
|
let rank = i + 1; // 1-indexed
|
||||||
let entry = scores.entry(doc_id).or_insert((0.0, None, None));
|
let entry = scores.entry(doc_id).or_insert((0.0, None, None));
|
||||||
entry.0 += 1.0 / (RRF_K + rank as f64);
|
|
||||||
if entry.2.is_none() {
|
if entry.2.is_none() {
|
||||||
|
entry.0 += 1.0 / (RRF_K + rank as f64);
|
||||||
entry.2 = Some(rank);
|
entry.2 = Some(rank);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user