Files
gitlore/src/cli/commands/embed.rs
Taylor Eernisse 39cb0cb087 feat(embed): concurrent batching, UTF-8 safe chunking, right-sized chunks
Three fixes to the embedding pipeline:

1. Concurrent HTTP batching: fire EMBED_CONCURRENCY (2) Ollama requests
   in parallel via join_all, then write results serially to SQLite.
   ~2x throughput improvement on GPU-bound workloads.

2. UTF-8 boundary safety: all computed byte offsets in split_into_chunks
   (paragraph/sentence/word break finders + overlap advance) now use
   floor_char_boundary() to prevent panics on multi-byte characters
   like smart quotes and non-breaking spaces.

3. CHUNK_MAX_BYTES reduced from 6000 to 1500 to fit nomic-embed-text's
   actual 2048-token context window, eliminating context-length retry
   storms that were causing 10x slowdowns.

Also threads ShutdownSignal through embed pipeline for graceful Ctrl+C.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 14:48:34 -05:00

87 lines
2.3 KiB
Rust

use console::style;
use serde::Serialize;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::paths::get_db_path;
use crate::core::shutdown::ShutdownSignal;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::embedding::pipeline::embed_documents;
/// Aggregate counts for one embedding run, copied from the pipeline's
/// result in `run_embed` and consumed by the `print_embed*` reporters.
#[derive(Debug, Default, Serialize)]
pub struct EmbedCommandResult {
/// Documents successfully embedded in this run.
pub embedded: usize,
/// Documents that errored during embedding.
pub failed: usize,
/// Documents the pipeline skipped (presumably already embedded — confirm in `embed_documents`).
pub skipped: usize,
}
/// Run the embedding pipeline end-to-end.
///
/// Opens the SQLite database, verifies the Ollama endpoint is reachable,
/// optionally resets prior embedding state, then delegates the actual work
/// to `embed_documents`.
///
/// * `full` — wipe all stored embeddings and their metadata so everything is
///   re-embedded from scratch; takes precedence over `retry_failed`.
/// * `retry_failed` — clear error markers and attempt counters so previously
///   failed documents become eligible again.
/// * `progress_callback` — optional hook invoked by the pipeline with two
///   counters (presumably `(done, total)` — confirm in `embed_documents`).
/// * `signal` — cooperative shutdown flag threaded into the pipeline.
///
/// # Errors
///
/// Propagates database open/execute failures, a failed Ollama health check,
/// and any error surfaced by the embedding pipeline itself.
pub async fn run_embed(
    config: &Config,
    full: bool,
    retry_failed: bool,
    progress_callback: Option<Box<dyn Fn(usize, usize)>>,
    signal: &ShutdownSignal,
) -> Result<EmbedCommandResult> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

    let client = OllamaClient::new(OllamaConfig {
        base_url: config.embedding.base_url.clone(),
        model: config.embedding.model.clone(),
        ..OllamaConfig::default()
    });
    // Fail fast if the Ollama server is unreachable before touching the DB.
    client.health_check().await?;

    // `full` wins over `retry_failed` when both flags are set.
    match (full, retry_failed) {
        (true, _) => {
            // Drop all embedding state atomically; metadata first, then vectors.
            conn.execute_batch(
                "BEGIN;
DELETE FROM embedding_metadata;
DELETE FROM embeddings;
COMMIT;",
            )?;
        }
        (false, true) => {
            // Reset error bookkeeping so failed rows are retried by the pipeline.
            conn.execute(
                "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
WHERE last_error IS NOT NULL",
                [],
            )?;
        }
        (false, false) => {}
    }

    let outcome = embed_documents(
        &conn,
        &client,
        &config.embedding.model,
        progress_callback,
        signal,
    )
    .await?;

    Ok(EmbedCommandResult {
        embedded: outcome.embedded,
        failed: outcome.failed,
        skipped: outcome.skipped,
    })
}
/// Pretty-print an embedding run summary to stdout.
///
/// Always reports the embedded count; failed and skipped counts appear only
/// when non-zero, with failures highlighted in red.
pub fn print_embed(result: &EmbedCommandResult) {
    let banner = style("done").green().bold();
    println!("{} Embedding complete", banner);
    println!(" Embedded: {}", result.embedded);
    match result.failed {
        0 => {}
        n => println!(" Failed: {}", style(n).red()),
    }
    match result.skipped {
        0 => {}
        n => println!(" Skipped: {}", n),
    }
}
/// Machine-readable envelope for `--json` output:
/// `{ "ok": true, "data": { "embedded": .., "failed": .., "skipped": .. } }`.
#[derive(Serialize)]
struct EmbedJsonOutput<'a> {
// Always `true` here; reserved to signal failure in a future error path.
ok: bool,
// Borrowed so serialization never clones the result.
data: &'a EmbedCommandResult,
}
/// Emit the embedding run summary as a single-line JSON object on stdout,
/// wrapped in the `{ "ok": true, "data": ... }` envelope.
pub fn print_embed_json(result: &EmbedCommandResult) {
    let output = EmbedJsonOutput {
        ok: true,
        data: result,
    };
    // Serializing a plain struct of bools and usizes cannot fail (no maps
    // with non-string keys, no fallible custom Serialize impls), so a panic
    // here would indicate a bug — state that invariant instead of a bare unwrap.
    let json = serde_json::to_string(&output)
        .expect("EmbedJsonOutput serialization is infallible");
    println!("{}", json);
}