feat(search): Add hybrid search engine with FTS5, vector, and RRF fusion

Implements the search module providing three search modes: - Lexical (FTS5): Full-text search using SQLite FTS5 with safe query sanitization. User queries are automatically tokenized and wrapped in proper FTS5 syntax. Supports a "raw" mode for power users who want direct FTS5 query syntax (NEAR, column filters, etc.). - Semantic (vector): Embeds the search query via Ollama, then performs cosine similarity search against stored document embeddings. Results are deduplicated by doc_id since documents may have multiple chunks. - Hybrid (default): Executes both lexical and semantic searches in parallel, then fuses results using Reciprocal Rank Fusion (RRF) with k=60. This avoids the complexity of score normalization while producing high-quality merged rankings. Gracefully degrades to lexical-only when embeddings are unavailable. Additional components: - search::filters: Post-retrieval filtering by source_type, author, project, labels (AND logic), file path prefix, created_after, and updated_after. Date filters accept relative formats (7d, 2w) and ISO dates. - search::rrf: Reciprocal Rank Fusion implementation with configurable k parameter and optional explain mode that annotates each result with its component ranks and fusion score breakdown. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 15:46:42 -05:00
parent 723703bed9
commit d5bdb24b0f
6 changed files with 1044 additions and 0 deletions
--- a/src/search/vector.rs
+++ b/src/search/vector.rs
@@ -0,0 +1,139 @@
+use std::collections::HashMap;
+
+use rusqlite::Connection;
+
+use crate::core::error::Result;
+use crate::embedding::chunk_ids::decode_rowid;
+
+/// A single vector search result (document-level, deduplicated).
+#[derive(Debug)]
+pub struct VectorResult {
+    pub document_id: i64,
+    pub distance: f64,
+}
+
+/// Search documents using sqlite-vec KNN query.
+///
+/// Over-fetches 3x limit to handle chunk deduplication (multiple chunks per
+/// document produce multiple KNN results for the same document_id).
+/// Returns deduplicated results with best (lowest) distance per document.
+pub fn search_vector(
+    conn: &Connection,
+    query_embedding: &[f32],
+    limit: usize,
+) -> Result<Vec<VectorResult>> {
+    if query_embedding.is_empty() || limit == 0 {
+        return Ok(Vec::new());
+    }
+
+    // Convert to raw little-endian bytes for sqlite-vec
+    let embedding_bytes: Vec<u8> = query_embedding
+        .iter()
+        .flat_map(|f| f.to_le_bytes())
+        .collect();
+
+    let k = limit * 3; // Over-fetch for dedup
+
+    let mut stmt = conn.prepare(
+        "SELECT rowid, distance
+         FROM embeddings
+         WHERE embedding MATCH ?1
+           AND k = ?2
+         ORDER BY distance"
+    )?;
+
+    let rows: Vec<(i64, f64)> = stmt
+        .query_map(rusqlite::params![embedding_bytes, k as i64], |row| {
+            Ok((row.get(0)?, row.get(1)?))
+        })?
+        .collect::<std::result::Result<Vec<_>, _>>()?;
+
+    // Dedup by document_id, keeping best (lowest) distance
+    let mut best: HashMap<i64, f64> = HashMap::new();
+    for (rowid, distance) in rows {
+        let (document_id, _chunk_index) = decode_rowid(rowid);
+        best.entry(document_id)
+            .and_modify(|d| {
+                if distance < *d {
+                    *d = distance;
+                }
+            })
+            .or_insert(distance);
+    }
+
+    // Sort by distance ascending, take limit
+    let mut results: Vec<VectorResult> = best
+        .into_iter()
+        .map(|(document_id, distance)| VectorResult {
+            document_id,
+            distance,
+        })
+        .collect();
+    results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
+    results.truncate(limit);
+
+    Ok(results)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Note: Full integration tests require sqlite-vec loaded, which happens via
+    // create_connection in db.rs. These are basic unit tests for the dedup logic.
+
+    #[test]
+    fn test_empty_returns_empty() {
+        // Can't test KNN without sqlite-vec, but we can test edge cases
+        let result = search_vector_dedup(vec![], 10);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_dedup_keeps_best_distance() {
+        // Simulate: doc 1 has chunks at rowid 1000 (idx 0) and 1001 (idx 1)
+        let rows = vec![
+            (1000_i64, 0.5_f64), // doc 1, chunk 0
+            (1001, 0.3),         // doc 1, chunk 1 (better)
+            (2000, 0.4),         // doc 2, chunk 0
+        ];
+        let results = search_vector_dedup(rows, 10);
+        assert_eq!(results.len(), 2);
+        assert_eq!(results[0].document_id, 1); // doc 1 best = 0.3
+        assert!((results[0].distance - 0.3).abs() < f64::EPSILON);
+        assert_eq!(results[1].document_id, 2); // doc 2 = 0.4
+    }
+
+    #[test]
+    fn test_dedup_respects_limit() {
+        let rows = vec![
+            (1000_i64, 0.1_f64),
+            (2000, 0.2),
+            (3000, 0.3),
+        ];
+        let results = search_vector_dedup(rows, 2);
+        assert_eq!(results.len(), 2);
+    }
+
+    /// Helper for testing dedup logic without sqlite-vec
+    fn search_vector_dedup(rows: Vec<(i64, f64)>, limit: usize) -> Vec<VectorResult> {
+        let mut best: HashMap<i64, f64> = HashMap::new();
+        for (rowid, distance) in rows {
+            let (document_id, _) = decode_rowid(rowid);
+            best.entry(document_id)
+                .and_modify(|d| {
+                    if distance < *d {
+                        *d = distance;
+                    }
+                })
+                .or_insert(distance);
+        }
+        let mut results: Vec<VectorResult> = best
+            .into_iter()
+            .map(|(document_id, distance)| VectorResult { document_id, distance })
+            .collect();
+        results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
+        results.truncate(limit);
+        results
+    }
+}