feat(embed): docs_embedded tracking, buffer reuse, retry hardening
Embedding pipeline improvements building on the concurrent batching foundation:

- Track docs_embedded vs chunks_embedded separately. A document counts as embedded only when ALL its chunks succeed, giving accurate progress reporting. The sync command reads docs_embedded for its document count.
- Reuse a single Vec<u8> buffer (embed_buf) across all store_embedding calls instead of allocating per chunk. Eliminates ~3KB allocation per 768-dim embedding.
- Detect and record errors when Ollama silently returns fewer embeddings than inputs (batch mismatch). Previously these dropped chunks were invisible.
- Improve retry error messages: distinguish "retry returned unexpected result" (wrong dims/count) from "retry request failed" (network error) instead of generic "chunk too large" message.
- Convert all hot-path SQL from conn.execute() to prepare_cached() for statement cache reuse (clear_document_embeddings, store_embedding, record_embedding_error).
- Record embedding_metadata errors for empty documents so they don't appear as perpetually pending on subsequent runs.
- Accept concurrency parameter (configurable via config.embedding.concurrency) instead of hardcoded EMBED_CONCURRENCY=2.
- Add schema version pre-flight check in embed command to fail fast with actionable error instead of cryptic SQL errors.
- Fix --retry-failed to use DELETE instead of UPDATE. UPDATE clears last_error but the row still matches config params in the LEFT JOIN, making the doc permanently invisible to find_pending_documents. DELETE removes the row entirely so the LEFT JOIN returns NULL. Regression test added (old_update_approach_leaves_doc_invisible).
- Add chunking forward-progress guard: after floor_char_boundary() rounds backward, ensure start advances by at least one full character to prevent infinite loops on multi-byte sequences (box-drawing chars, smart quotes). Test cases cover the exact patterns that caused production hangs on document 18526.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,9 +41,19 @@ pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
|
||||
split_at
|
||||
}
|
||||
.max(1);
|
||||
let old_start = start;
|
||||
start += advance;
|
||||
// Ensure start lands on a char boundary after overlap subtraction
|
||||
start = floor_char_boundary(content, start);
|
||||
// Guarantee forward progress: multi-byte chars can cause
|
||||
// floor_char_boundary to round back to old_start
|
||||
if start <= old_start {
|
||||
start = old_start
|
||||
+ content[old_start..]
|
||||
.chars()
|
||||
.next()
|
||||
.map_or(1, |c| c.len_utf8());
|
||||
}
|
||||
chunk_index += 1;
|
||||
}
|
||||
|
||||
@@ -219,4 +229,105 @@ mod tests {
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
}
|
||||
|
||||
/// Chunks a synthetic table drawn with box-drawing characters and checks that
/// chunking finishes quickly and yields at least one chunk.
///
/// The document is a short ASCII header followed by a 50-row, two-column table
/// whose borders are long runs of `─` (U+2500, three bytes in UTF-8), so most
/// of the text is multi-byte with few spaces.
#[test]
fn test_box_drawing_heavy_content() {
    // One horizontal rule segment: 200 box-drawing dashes.
    let rule = "─".repeat(200);
    // A full border line: left corner, rule, junction, rule, right corner, newline.
    let border = |left: char, mid: char, right: char| -> String {
        format!("{}{}{}{}{}\n", left, rule, mid, rule, right)
    };

    // Plain-text header first, then the top edge of the table.
    let mut content = String::from("# Title\n\nSome description text.\n\n");
    content += &border('┌', '┬', '┐');

    // Each row is a padded data line followed by a separator line.
    for row in 0..50 {
        content += &format!("│ row {:<194}│ data {:<193}│\n", row, row);
        content += &border('├', '┼', '┤');
    }

    // Bottom edge of the table.
    content += &border('└', '┴', '┘');

    let byte_len = content.len();
    let char_len = content.chars().count();
    eprintln!("Content size: {} bytes, {} chars", byte_len, char_len);

    let timer = std::time::Instant::now();
    let chunks = split_into_chunks(&content);
    let elapsed = timer.elapsed();
    eprintln!(
        "Chunking took {:?}, produced {} chunks",
        elapsed,
        chunks.len()
    );

    // The chunker must finish promptly on this input.
    assert!(
        elapsed < std::time::Duration::from_secs(5),
        "Chunking took too long: {:?}",
        elapsed
    );
    assert!(!chunks.is_empty());
}
|
||||
|
||||
/// Chunks a document containing two very long, space-free runs of `─` and
/// checks that chunking finishes quickly and yields at least one chunk.
///
/// Each run is 800 characters of three bytes each (2400 bytes) on a single
/// line, so any chunk window that falls inside a run has no whitespace to
/// split on. The original author notes the run is meant to exceed
/// CHUNK_MAX_BYTES (defined outside this test — confirm against the constant).
#[test]
fn test_real_doc_18526_pattern() {
    // 800 × 3 bytes = 2400 bytes with no spaces.
    let dash_run = "─".repeat(800);

    // Short ASCII sections separated by the two long runs.
    let content = [
        "Header text with spaces\n\n",
        dash_run.as_str(),
        "\n",
        "Some more text.\n\n",
        dash_run.as_str(),
        "\n",
        "End text.\n",
    ]
    .concat();

    eprintln!("Content size: {} bytes", content.len());

    let timer = std::time::Instant::now();
    let chunks = split_into_chunks(&content);
    let elapsed = timer.elapsed();
    eprintln!(
        "Chunking took {:?}, produced {} chunks",
        elapsed,
        chunks.len()
    );

    // The chunker must finish promptly on this input.
    assert!(
        elapsed < std::time::Duration::from_secs(2),
        "Chunking took too long: {:?}",
        elapsed
    );
    assert!(!chunks.is_empty());
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user