feat(embed): concurrent batching, UTF-8 safe chunking, right-sized chunks
Three fixes to the embedding pipeline:

1. Concurrent HTTP batching: fire EMBED_CONCURRENCY (2) Ollama requests in parallel via join_all, then write results serially to SQLite. ~2x throughput improvement on GPU-bound workloads.
2. UTF-8 boundary safety: all computed byte offsets in split_into_chunks (paragraph/sentence/word break finders + overlap advance) now use floor_char_boundary() to prevent panics on multi-byte characters like smart quotes and non-breaking spaces.
3. CHUNK_MAX_BYTES reduced from 6000 to 1500 to fit nomic-embed-text's actual 2048-token context window, eliminating context-length retry storms that were causing 10x slowdowns.

Also threads ShutdownSignal through the embed pipeline for graceful Ctrl+C.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ use crate::Config;
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::core::shutdown::ShutdownSignal;
|
||||
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
|
||||
use crate::embedding::pipeline::embed_documents;
|
||||
|
||||
@@ -20,6 +21,7 @@ pub async fn run_embed(
|
||||
full: bool,
|
||||
retry_failed: bool,
|
||||
progress_callback: Option<Box<dyn Fn(usize, usize)>>,
|
||||
signal: &ShutdownSignal,
|
||||
) -> Result<EmbedCommandResult> {
|
||||
let db_path = get_db_path(config.storage.db_path.as_deref());
|
||||
let conn = create_connection(&db_path)?;
|
||||
@@ -49,7 +51,7 @@ pub async fn run_embed(
|
||||
}
|
||||
|
||||
let model_name = &config.embedding.model;
|
||||
let result = embed_documents(&conn, &client, model_name, progress_callback).await?;
|
||||
let result = embed_documents(&conn, &client, model_name, progress_callback, signal).await?;
|
||||
|
||||
Ok(EmbedCommandResult {
|
||||
embedded: result.embedded,
|
||||
|
||||
@@ -239,7 +239,7 @@ pub async fn run_sync(
|
||||
embed_bar_clone.set_position(processed as u64);
|
||||
}
|
||||
});
|
||||
match run_embed(config, options.full, false, Some(embed_cb)).await {
|
||||
match run_embed(config, options.full, false, Some(embed_cb), signal).await {
|
||||
Ok(embed_result) => {
|
||||
result.documents_embedded = embed_result.embedded;
|
||||
embed_bar.finish_and_clear();
|
||||
|
||||
Reference in New Issue
Block a user