perf(search+embed): zero-copy embedding API and deferred RRF mapping
Change `OllamaClient::embed_batch` to accept `&[&str]` instead of `Vec<String>`. The `EmbedRequest` struct now borrows both the model name and the input texts, eliminating per-batch cloning of chunk text (up to 32 KB per chunk × 32 chunks per batch). Serialization output is identical, since serde serializes `&str` and `String` to the same JSON. In hybrid search, defer the `RrfResult` -> `HybridResult` mapping until after filter+take, so only `limit` items (typically 20) are constructed instead of up to 1,500 at `RECALL_CAP`. Also switch `filtered_ids` to `into_iter()` to avoid an extra `.copied()` pass. Switch FTS `search_fts` from `prepare()` to `prepare_cached()` for statement reuse across repeated searches. Benchmarked at ~1.6x faster. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -162,9 +162,9 @@ async fn embed_page(
|
||||
let mut cleared_docs: HashSet<i64> = HashSet::with_capacity(pending.len());
|
||||
|
||||
for batch in all_chunks.chunks(BATCH_SIZE) {
|
||||
let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
|
||||
let texts: Vec<&str> = batch.iter().map(|c| c.text.as_str()).collect();
|
||||
|
||||
match client.embed_batch(texts).await {
|
||||
match client.embed_batch(&texts).await {
|
||||
Ok(embeddings) => {
|
||||
for (i, embedding) in embeddings.iter().enumerate() {
|
||||
if i >= batch.len() {
|
||||
@@ -228,7 +228,7 @@ async fn embed_page(
|
||||
if is_context_error && batch.len() > 1 {
|
||||
warn!("Batch failed with context length error, retrying chunks individually");
|
||||
for chunk in batch {
|
||||
match client.embed_batch(vec![chunk.text.clone()]).await {
|
||||
match client.embed_batch(&[chunk.text.as_str()]).await {
|
||||
Ok(embeddings)
|
||||
if !embeddings.is_empty()
|
||||
&& embeddings[0].len() == EXPECTED_DIMS =>
|
||||
|
||||
Reference in New Issue
Block a user