perf(search+embed): zero-copy embedding API and deferred RRF mapping

Change OllamaClient::embed_batch to accept &[&str] instead of Vec<String>. The EmbedRequest struct now borrows both model name and input texts, eliminating per-batch cloning of chunk text (up to 32KB per chunk x 32 chunks per batch). Serialization output is identical since serde serializes &str and String to the same JSON. In hybrid search, defer the RrfResult->HybridResult mapping until after filter+take, so only `limit` items (typically 20) are constructed instead of up to 1,500 at RECALL_CAP. Also switch filtered_ids to into_iter() to avoid an extra .copied() pass. Switch FTS search_fts from prepare() to prepare_cached() for statement reuse across repeated searches. Benchmarked at ~1.6x faster. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-05 17:35:53 -05:00
parent 16beb35a69
commit 3e9cf2358e
4 changed files with 30 additions and 31 deletions
--- a/src/embedding/ollama.rs
+++ b/src/embedding/ollama.rs
@@ -27,9 +27,9 @@ pub struct OllamaClient {
 }

 #[derive(Serialize)]
-struct EmbedRequest {
-    model: String,
-    input: Vec<String>,
+struct EmbedRequest<'a> {
+    model: &'a str,
+    input: Vec<&'a str>,
 }

 #[derive(Deserialize)]
@@ -101,12 +101,12 @@ impl OllamaClient {
        Ok(())
    }

-    pub async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
+    pub async fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
        let url = format!("{}/api/embed", self.config.base_url);

        let request = EmbedRequest {
-            model: self.config.model.clone(),
-            input: texts,
+            model: &self.config.model,
+            input: texts.to_vec(),
        };

        let response = self
@@ -181,8 +181,8 @@ mod tests {
    #[test]
    fn test_embed_request_serialization() {
        let request = EmbedRequest {
-            model: "nomic-embed-text".to_string(),
-            input: vec!["hello".to_string(), "world".to_string()],
+            model: "nomic-embed-text",
+            input: vec!["hello", "world"],
        };
        let json = serde_json::to_string(&request).unwrap();
        assert!(json.contains("\"model\":\"nomic-embed-text\""));
--- a/src/embedding/pipeline.rs
+++ b/src/embedding/pipeline.rs
@@ -162,9 +162,9 @@ async fn embed_page(
    let mut cleared_docs: HashSet<i64> = HashSet::with_capacity(pending.len());

    for batch in all_chunks.chunks(BATCH_SIZE) {
-        let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
+        let texts: Vec<&str> = batch.iter().map(|c| c.text.as_str()).collect();

-        match client.embed_batch(texts).await {
+        match client.embed_batch(&texts).await {
            Ok(embeddings) => {
                for (i, embedding) in embeddings.iter().enumerate() {
                    if i >= batch.len() {
@@ -228,7 +228,7 @@ async fn embed_page(
                if is_context_error && batch.len() > 1 {
                    warn!("Batch failed with context length error, retrying chunks individually");
                    for chunk in batch {
-                        match client.embed_batch(vec![chunk.text.clone()]).await {
+                        match client.embed_batch(&[chunk.text.as_str()]).await {
                            Ok(embeddings)
                                if !embeddings.is_empty()
                                    && embeddings[0].len() == EXPECTED_DIMS =>