feat(search): Add hybrid search engine with FTS5, vector, and RRF fusion
Implements the search module providing three search modes:

- Lexical (FTS5): Full-text search using SQLite FTS5 with safe query sanitization. User queries are automatically tokenized and wrapped in proper FTS5 syntax. Supports a "raw" mode for power users who want direct FTS5 query syntax (NEAR, column filters, etc.).
- Semantic (vector): Embeds the search query via Ollama, then performs cosine similarity search against stored document embeddings. Results are deduplicated by doc_id since documents may have multiple chunks.
- Hybrid (default): Executes both lexical and semantic searches in parallel, then fuses results using Reciprocal Rank Fusion (RRF) with k=60. This avoids the complexity of score normalization while producing high-quality merged rankings. Gracefully degrades to lexical-only when embeddings are unavailable.

Additional components:

- search::filters: Post-retrieval filtering by source_type, author, project, labels (AND logic), file path prefix, created_after, and updated_after. Date filters accept relative formats (7d, 2w) and ISO dates.
- search::rrf: Reciprocal Rank Fusion implementation with configurable k parameter and optional explain mode that annotates each result with its component ranks and fusion score breakdown.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
178
src/search/rrf.rs
Normal file
178
src/search/rrf.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Smoothing constant `k` in the RRF term `1 / (k + rank)`.
///
/// Larger values shrink the score gap between adjacent ranks.
const RRF_K: f64 = 60.0;
|
||||
|
||||
/// A single result from Reciprocal Rank Fusion, containing both raw and
/// normalized scores plus per-list rank provenance for --explain output.
///
/// Derives `Debug`, `Clone` and `PartialEq` so results can be logged,
/// duplicated and compared directly (e.g. with `assert_eq!`).
#[derive(Debug, Clone, PartialEq)]
pub struct RrfResult {
    /// Identifier of the document this fused score belongs to.
    pub document_id: i64,
    /// Raw RRF score: sum of 1/(k + rank) across all lists.
    pub rrf_score: f64,
    /// Normalized to [0, 1] where the best result is 1.0.
    pub normalized_score: f64,
    /// 1-indexed rank in the vector results list, if present.
    pub vector_rank: Option<usize>,
    /// 1-indexed rank in the FTS results list, if present.
    pub fts_rank: Option<usize>,
}
|
||||
|
||||
/// Combine vector and FTS retrieval results using Reciprocal Rank Fusion.
|
||||
///
|
||||
/// Input tuples are `(document_id, score/distance)` — already sorted by each retriever.
|
||||
/// Ranks are 1-indexed (first result = rank 1).
|
||||
///
|
||||
/// Score = sum of 1/(k + rank) for each list containing the document.
|
||||
pub fn rank_rrf(
|
||||
vector_results: &[(i64, f64)],
|
||||
fts_results: &[(i64, f64)],
|
||||
) -> Vec<RrfResult> {
|
||||
if vector_results.is_empty() && fts_results.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// (rrf_score, vector_rank, fts_rank)
|
||||
let mut scores: HashMap<i64, (f64, Option<usize>, Option<usize>)> = HashMap::new();
|
||||
|
||||
for (i, &(doc_id, _)) in vector_results.iter().enumerate() {
|
||||
let rank = i + 1; // 1-indexed
|
||||
let entry = scores.entry(doc_id).or_insert((0.0, None, None));
|
||||
entry.0 += 1.0 / (RRF_K + rank as f64);
|
||||
if entry.1.is_none() {
|
||||
entry.1 = Some(rank);
|
||||
}
|
||||
}
|
||||
|
||||
for (i, &(doc_id, _)) in fts_results.iter().enumerate() {
|
||||
let rank = i + 1; // 1-indexed
|
||||
let entry = scores.entry(doc_id).or_insert((0.0, None, None));
|
||||
entry.0 += 1.0 / (RRF_K + rank as f64);
|
||||
if entry.2.is_none() {
|
||||
entry.2 = Some(rank);
|
||||
}
|
||||
}
|
||||
|
||||
let mut results: Vec<RrfResult> = scores
|
||||
.into_iter()
|
||||
.map(|(doc_id, (rrf_score, vector_rank, fts_rank))| RrfResult {
|
||||
document_id: doc_id,
|
||||
rrf_score,
|
||||
normalized_score: 0.0, // filled in below
|
||||
vector_rank,
|
||||
fts_rank,
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort descending by rrf_score
|
||||
results.sort_by(|a, b| b.rrf_score.partial_cmp(&a.rrf_score).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
// Normalize: best = 1.0
|
||||
if let Some(max_score) = results.first().map(|r| r.rrf_score) {
|
||||
if max_score > 0.0 {
|
||||
for result in &mut results {
|
||||
result.normalized_score = result.rrf_score / max_score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the fused entry for `id`, panicking if the document is absent.
    fn entry_for(results: &[RrfResult], id: i64) -> &RrfResult {
        results.iter().find(|r| r.document_id == id).unwrap()
    }

    /// True when `value` is within floating-point epsilon of 1.0.
    fn is_one(value: f64) -> bool {
        (value - 1.0).abs() < f64::EPSILON
    }

    #[test]
    fn test_dual_list_ranks_higher() {
        let fused = rank_rrf(&[(1, 0.1), (2, 0.2)], &[(1, 5.0), (3, 3.0)]);

        // Document 1 is returned by both retrievers, so it must lead.
        let top = &fused[0];
        assert_eq!(top.document_id, 1);

        // ...and its raw score must beat both single-list documents.
        for &other in &[2, 3] {
            assert!(top.rrf_score > entry_for(&fused, other).rrf_score);
        }
    }

    #[test]
    fn test_single_list_included() {
        let fused = rank_rrf(&[(1, 0.1)], &[(2, 5.0)]);

        // Documents seen by only one retriever are still part of the output.
        assert_eq!(fused.len(), 2);
        assert!(fused.iter().any(|r| r.document_id == 1));
        assert!(fused.iter().any(|r| r.document_id == 2));
    }

    #[test]
    fn test_normalization() {
        let fused = rank_rrf(&[(1, 0.1), (2, 0.2)], &[(1, 5.0), (3, 3.0)]);

        // The leading result is pinned at exactly 1.0.
        assert!(is_one(fused[0].normalized_score));

        // Every normalized score lies inside [0, 1].
        assert!(fused
            .iter()
            .all(|r| (0.0..=1.0).contains(&r.normalized_score)));
    }

    #[test]
    fn test_empty_inputs() {
        assert!(rank_rrf(&[], &[]).is_empty());
    }

    #[test]
    fn test_ranks_are_1_indexed() {
        let fused = rank_rrf(&[(10, 0.1), (20, 0.2)], &[(10, 5.0), (30, 3.0)]);

        // (document id, expected vector rank, expected FTS rank)
        let expected: [(i64, Option<usize>, Option<usize>); 3] = [
            (10, Some(1), Some(1)),
            (20, Some(2), None),
            (30, None, Some(2)),
        ];

        for &(id, vector_rank, fts_rank) in &expected {
            let doc = entry_for(&fused, id);
            assert_eq!(doc.vector_rank, vector_rank);
            assert_eq!(doc.fts_rank, fts_rank);
        }
    }

    #[test]
    fn test_raw_and_normalized_scores() {
        let fused = rank_rrf(&[(1, 0.1)], &[(1, 5.0)]);
        assert_eq!(fused.len(), 1);

        // Rank 1 in both lists: 1/(60+1) + 1/(60+1) = 2/61.
        let only = &fused[0];
        let expected = 2.0 / 61.0;
        assert!((only.rrf_score - expected).abs() < 1e-10);
        assert!(is_one(only.normalized_score));
    }

    #[test]
    fn test_one_empty_list() {
        let fused = rank_rrf(&[(1, 0.1), (2, 0.2)], &[]);

        assert_eq!(fused.len(), 2);
        // With only one list contributing, the top result still normalizes to 1.0.
        assert!(is_one(fused[0].normalized_score));
    }
}
|
||||
Reference in New Issue
Block a user