feat(search): Add hybrid search engine with FTS5, vector, and RRF fusion

Implements the search module providing three search modes:

- Lexical (FTS5): Full-text search using SQLite FTS5 with safe query sanitization. User queries are automatically tokenized and wrapped in proper FTS5 syntax. Supports a "raw" mode for power users who want direct FTS5 query syntax (NEAR, column filters, etc.).
- Semantic (vector): Embeds the search query via Ollama, then performs cosine similarity search against stored document embeddings. Results are deduplicated by doc_id since documents may have multiple chunks.
- Hybrid (default): Executes both lexical and semantic searches in parallel, then fuses results using Reciprocal Rank Fusion (RRF) with k=60. This avoids the complexity of score normalization while producing high-quality merged rankings. Gracefully degrades to lexical-only when embeddings are unavailable.

Additional components:

- search::filters: Post-retrieval filtering by source_type, author, project, labels (AND logic), file path prefix, created_after, and updated_after. Date filters accept relative formats (7d, 2w) and ISO dates.
- search::rrf: Reciprocal Rank Fusion implementation with configurable k parameter and optional explain mode that annotates each result with its component ranks and fusion score breakdown.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 15:46:42 -05:00
parent 723703bed9
commit d5bdb24b0f
6 changed files with 1044 additions and 0 deletions
--- a/src/search/rrf.rs
+++ b/src/search/rrf.rs
@@ -0,0 +1,178 @@
+use std::collections::HashMap;
+
+const RRF_K: f64 = 60.0; // rank-smoothing constant k in the 1/(k + rank) RRF term
+
+/// One Reciprocal Rank Fusion hit: raw/normalized scores and per-list ranks for --explain.
+#[derive(Debug, Clone, PartialEq)]
+pub struct RrfResult {
+    pub document_id: i64,
+    /// Raw RRF score: sum of 1/(k + rank) across all lists.
+    pub rrf_score: f64,
+    /// Normalized to [0, 1] where the best result is 1.0.
+    pub normalized_score: f64,
+    /// 1-indexed rank in the vector results list, if present.
+    pub vector_rank: Option<usize>,
+    /// 1-indexed rank in the FTS results list, if present.
+    pub fts_rank: Option<usize>,
+}
+
+/// Combine vector and FTS retrieval results using Reciprocal Rank Fusion.
+///
+/// Inputs are `(document_id, score/distance)` tuples already sorted best-first by each
+/// retriever; only positions matter. Ranks are 1-indexed (first result = rank 1).
+/// Score = sum of 1/(k + rank) per list containing the document; an id repeated within
+/// one list counts once, at its first position. Output is sorted by descending score,
+/// ties broken by ascending `document_id`, so the ordering is deterministic.
+pub fn rank_rrf(
+    vector_results: &[(i64, f64)],
+    fts_results: &[(i64, f64)],
+) -> Vec<RrfResult> {
+    // (rrf_score, vector_rank, fts_rank)
+    let mut scores: HashMap<i64, (f64, Option<usize>, Option<usize>)> = HashMap::new();
+
+    for (i, &(doc_id, _)) in vector_results.iter().enumerate() {
+        let rank = i + 1; // 1-indexed
+        let entry = scores.entry(doc_id).or_insert((0.0, None, None));
+        // Score only the first occurrence of an id (here and in the FTS loop below).
+        if entry.1.is_none() {
+            entry.0 += 1.0 / (RRF_K + rank as f64);
+            entry.1 = Some(rank);
+        }
+    }
+
+    for (i, &(doc_id, _)) in fts_results.iter().enumerate() {
+        let rank = i + 1; // 1-indexed
+        let entry = scores.entry(doc_id).or_insert((0.0, None, None));
+        if entry.2.is_none() {
+            entry.0 += 1.0 / (RRF_K + rank as f64);
+            entry.2 = Some(rank);
+        }
+    }
+
+    let mut results: Vec<RrfResult> = scores
+        .into_iter()
+        .map(|(document_id, (rrf_score, vector_rank, fts_rank))| RrfResult {
+            document_id,
+            rrf_score,
+            normalized_score: 0.0, // filled in below
+            vector_rank,
+            fts_rank,
+        })
+        .collect();
+
+    // Sort descending by rrf_score. HashMap iteration order is arbitrary, so equal
+    // scores are ordered by ascending document_id to keep the output reproducible.
+    results.sort_by(|a, b| {
+        let by_score = b.rrf_score.partial_cmp(&a.rrf_score).unwrap_or(std::cmp::Ordering::Equal);
+        by_score.then_with(|| a.document_id.cmp(&b.document_id))
+    });
+
+    // Normalize: best = 1.0 (skipped if the top score is not positive)
+    if let Some(max_score) = results.first().map(|r| r.rrf_score).filter(|&s| s > 0.0) {
+        for result in &mut results {
+            result.normalized_score = result.rrf_score / max_score;
+        }
+    }
+
+    results
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Fetch the fused result for `id`; panics if the document is absent.
+    fn by_id(results: &[RrfResult], id: i64) -> &RrfResult {
+        results.iter().find(|r| r.document_id == id).expect("document not in results")
+    }
+
+    #[test]
+    fn test_dual_list_ranks_higher() {
+        let vector = vec![(1, 0.1), (2, 0.2)];
+        let fts = vec![(1, 5.0), (3, 3.0)];
+        let results = rank_rrf(&vector, &fts);
+
+        // Doc 1 appears in both lists, so it ranks first and outscores docs 2 and 3
+        assert_eq!(results[0].document_id, 1);
+        assert!(results[0].rrf_score > by_id(&results, 2).rrf_score);
+        assert!(results[0].rrf_score > by_id(&results, 3).rrf_score);
+    }
+
+    #[test]
+    fn test_single_list_included() {
+        let vector = vec![(1, 0.1)];
+        let fts = vec![(2, 5.0)];
+        let results = rank_rrf(&vector, &fts);
+
+        assert_eq!(results.len(), 2);
+        // Each doc is rank 1 in exactly one list, so both are kept with equal scores
+        assert_eq!(by_id(&results, 1).rrf_score, by_id(&results, 2).rrf_score);
+    }
+
+    #[test]
+    fn test_normalization() {
+        let vector = vec![(1, 0.1), (2, 0.2)];
+        let fts = vec![(1, 5.0), (3, 3.0)];
+        let results = rank_rrf(&vector, &fts);
+
+        // Best result should have normalized_score = 1.0
+        assert!((results[0].normalized_score - 1.0).abs() < f64::EPSILON);
+
+        // All scores in [0, 1]
+        for r in &results {
+            assert!((0.0..=1.0).contains(&r.normalized_score));
+        }
+
+        // Results come back ordered by non-increasing raw score
+        for pair in results.windows(2) {
+            assert!(pair[0].rrf_score >= pair[1].rrf_score);
+        }
+    }
+
+    #[test]
+    fn test_empty_inputs() {
+        assert!(rank_rrf(&[], &[]).is_empty());
+    }
+
+    #[test]
+    fn test_ranks_are_1_indexed() {
+        let vector = vec![(10, 0.1), (20, 0.2)];
+        let fts = vec![(10, 5.0), (30, 3.0)];
+        let results = rank_rrf(&vector, &fts);
+
+        // (document_id, expected vector_rank, expected fts_rank)
+        let expected: [(i64, Option<usize>, Option<usize>); 3] =
+            [(10, Some(1), Some(1)), (20, Some(2), None), (30, None, Some(2))];
+        for &(id, vector_rank, fts_rank) in &expected {
+            let doc = by_id(&results, id);
+            assert_eq!(doc.vector_rank, vector_rank);
+            assert_eq!(doc.fts_rank, fts_rank);
+        }
+    }
+
+    #[test]
+    fn test_raw_and_normalized_scores() {
+        let vector = vec![(1, 0.1)];
+        let fts = vec![(1, 5.0)];
+        let results = rank_rrf(&vector, &fts);
+        assert_eq!(results.len(), 1);
+        let r = &results[0];
+        // RRF score = 1/(60+1) + 1/(60+1) = 2/61
+        let expected = 2.0 / 61.0;
+        assert!((r.rrf_score - expected).abs() < 1e-10);
+        assert!((r.normalized_score - 1.0).abs() < f64::EPSILON);
+    }
+
+    #[test]
+    fn test_one_empty_list() {
+        let vector = vec![(1, 0.1), (2, 0.2)];
+        let results = rank_rrf(&vector, &[]);
+
+        assert_eq!(results.len(), 2);
+        // With a single list the fused order follows that list's order
+        assert_eq!(results[0].document_id, 1);
+        assert_eq!(results[1].document_id, 2);
+        // The top result should still have normalized_score = 1.0
+        assert!((results[0].normalized_score - 1.0).abs() < f64::EPSILON);
+    }
+}