feat(embed): concurrent batching, UTF-8 safe chunking, right-sized chunks
Three fixes to the embedding pipeline:

1. Concurrent HTTP batching: fire EMBED_CONCURRENCY (2) Ollama requests in parallel via join_all, then write results serially to SQLite. ~2x throughput improvement on GPU-bound workloads.
2. UTF-8 boundary safety: all computed byte offsets in split_into_chunks (paragraph/sentence/word break finders + overlap advance) now use floor_char_boundary() to prevent panics on multi-byte characters like smart quotes and non-breaking spaces.
3. CHUNK_MAX_BYTES reduced from 6000 to 1500 to fit nomic-embed-text's actual 2048-token context window, eliminating context-length retry storms that were causing 10x slowdowns.

Also threads ShutdownSignal through the embed pipeline for graceful Ctrl+C.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
pub const CHUNK_MAX_BYTES: usize = 6_000;
|
||||
pub const CHUNK_MAX_BYTES: usize = 1_500;
|
||||
|
||||
pub const EXPECTED_DIMS: usize = 768;
|
||||
|
||||
@@ -42,6 +42,8 @@ pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
|
||||
}
|
||||
.max(1);
|
||||
start += advance;
|
||||
// Ensure start lands on a char boundary after overlap subtraction
|
||||
start = floor_char_boundary(content, start);
|
||||
chunk_index += 1;
|
||||
}
|
||||
|
||||
@@ -49,7 +51,7 @@ pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
|
||||
}
|
||||
|
||||
fn find_paragraph_break(window: &str) -> Option<usize> {
|
||||
let search_start = window.len() * 2 / 3;
|
||||
let search_start = floor_char_boundary(window, window.len() * 2 / 3);
|
||||
window[search_start..]
|
||||
.rfind("\n\n")
|
||||
.map(|pos| search_start + pos + 2)
|
||||
@@ -57,7 +59,7 @@ fn find_paragraph_break(window: &str) -> Option<usize> {
|
||||
}
|
||||
|
||||
fn find_sentence_break(window: &str) -> Option<usize> {
|
||||
let search_start = window.len() / 2;
|
||||
let search_start = floor_char_boundary(window, window.len() / 2);
|
||||
for pat in &[". ", "? ", "! "] {
|
||||
if let Some(pos) = window[search_start..].rfind(pat) {
|
||||
return Some(search_start + pos + pat.len());
|
||||
@@ -72,7 +74,7 @@ fn find_sentence_break(window: &str) -> Option<usize> {
|
||||
}
|
||||
|
||||
fn find_word_break(window: &str) -> Option<usize> {
|
||||
let search_start = window.len() / 2;
|
||||
let search_start = floor_char_boundary(window, window.len() / 2);
|
||||
window[search_start..]
|
||||
.rfind(' ')
|
||||
.map(|pos| search_start + pos + 1)
|
||||
@@ -180,4 +182,41 @@ mod tests {
|
||||
assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multibyte_characters_no_panic() {
|
||||
// Build content with multi-byte UTF-8 chars (right single quote U+2019, em dash U+2014)
|
||||
// placed at positions likely to hit len()*2/3 and len()/2 boundaries
|
||||
let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
|
||||
let mut content = String::new();
|
||||
while content.len() < CHUNK_MAX_BYTES * 3 {
|
||||
content.push_str(segment);
|
||||
}
|
||||
// Should not panic on multi-byte boundary
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
for (_, chunk) in &chunks {
|
||||
assert!(!chunk.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_nbsp_at_overlap_boundary() {
|
||||
// Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
|
||||
// placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
|
||||
let mut content = String::new();
|
||||
// Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
|
||||
// near where the overlap subtraction would land
|
||||
let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
|
||||
while content.len() < target - 2 {
|
||||
content.push('a');
|
||||
}
|
||||
content.push('\u{a0}'); // 2-byte char right at the overlap boundary
|
||||
while content.len() < CHUNK_MAX_BYTES * 3 {
|
||||
content.push('b');
|
||||
}
|
||||
// Should not panic
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user