perf(search+embed): zero-copy embedding API and deferred RRF mapping
Change OllamaClient::embed_batch to accept &[&str] instead of Vec<String>. The EmbedRequest struct now borrows both model name and input texts, eliminating per-batch cloning of chunk text (up to 32KB per chunk x 32 chunks per batch). Serialization output is identical since serde serializes &str and String to the same JSON. In hybrid search, defer the RrfResult->HybridResult mapping until after filter+take, so only `limit` items (typically 20) are constructed instead of up to 1,500 at RECALL_CAP. Also switch filtered_ids to into_iter() to avoid an extra .copied() pass. Switch FTS search_fts from prepare() to prepare_cached() for statement reuse across repeated searches. Benchmarked at ~1.6x faster. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ use rusqlite::Connection;
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::ollama::OllamaClient;
|
||||
use crate::search::filters::{SearchFilters, apply_filters};
|
||||
use crate::search::rrf::RrfResult;
|
||||
use crate::search::{FtsQueryMode, rank_rrf, search_fts, search_vector};
|
||||
|
||||
const BASE_RECALL_MIN: usize = 50;
|
||||
@@ -77,7 +78,7 @@ pub async fn search_hybrid(
|
||||
));
|
||||
};
|
||||
|
||||
let query_embedding = client.embed_batch(vec![query.to_string()]).await?;
|
||||
let query_embedding = client.embed_batch(&[query]).await?;
|
||||
let embedding = query_embedding.into_iter().next().unwrap_or_default();
|
||||
|
||||
if embedding.is_empty() {
|
||||
@@ -102,7 +103,7 @@ pub async fn search_hybrid(
|
||||
.collect();
|
||||
|
||||
match client {
|
||||
Some(client) => match client.embed_batch(vec![query.to_string()]).await {
|
||||
Some(client) => match client.embed_batch(&[query]).await {
|
||||
Ok(query_embedding) => {
|
||||
let embedding = query_embedding.into_iter().next().unwrap_or_default();
|
||||
|
||||
@@ -137,30 +138,28 @@ pub async fn search_hybrid(
|
||||
};
|
||||
|
||||
let ranked = rank_rrf(&vec_tuples, &fts_tuples);
|
||||
|
||||
let results: Vec<HybridResult> = ranked
|
||||
.into_iter()
|
||||
.map(|r| HybridResult {
|
||||
document_id: r.document_id,
|
||||
score: r.normalized_score,
|
||||
vector_rank: r.vector_rank,
|
||||
fts_rank: r.fts_rank,
|
||||
rrf_score: r.rrf_score,
|
||||
})
|
||||
.collect();
|
||||
|
||||
let limit = filters.clamp_limit();
|
||||
let results = if filters.has_any_filter() {
|
||||
let all_ids: Vec<i64> = results.iter().map(|r| r.document_id).collect();
|
||||
|
||||
let to_hybrid = |r: RrfResult| HybridResult {
|
||||
document_id: r.document_id,
|
||||
score: r.normalized_score,
|
||||
vector_rank: r.vector_rank,
|
||||
fts_rank: r.fts_rank,
|
||||
rrf_score: r.rrf_score,
|
||||
};
|
||||
|
||||
let results: Vec<HybridResult> = if filters.has_any_filter() {
|
||||
let all_ids: Vec<i64> = ranked.iter().map(|r| r.document_id).collect();
|
||||
let filtered_ids = apply_filters(conn, &all_ids, filters)?;
|
||||
let filtered_set: std::collections::HashSet<i64> = filtered_ids.iter().copied().collect();
|
||||
results
|
||||
let filtered_set: std::collections::HashSet<i64> = filtered_ids.into_iter().collect();
|
||||
ranked
|
||||
.into_iter()
|
||||
.filter(|r| filtered_set.contains(&r.document_id))
|
||||
.take(limit)
|
||||
.map(to_hybrid)
|
||||
.collect()
|
||||
} else {
|
||||
results.into_iter().take(limit).collect()
|
||||
ranked.into_iter().take(limit).map(to_hybrid).collect()
|
||||
};
|
||||
|
||||
Ok((results, warnings))
|
||||
|
||||
Reference in New Issue
Block a user