feat(search): Add hybrid search engine with FTS5, vector, and RRF fusion
Implements the search module providing three search modes: - Lexical (FTS5): Full-text search using SQLite FTS5 with safe query sanitization. User queries are automatically tokenized and wrapped in proper FTS5 syntax. Supports a "raw" mode for power users who want direct FTS5 query syntax (NEAR, column filters, etc.). - Semantic (vector): Embeds the search query via Ollama, then performs cosine similarity search against stored document embeddings. Results are deduplicated by doc_id since documents may have multiple chunks. - Hybrid (default): Executes both lexical and semantic searches in parallel, then fuses results using Reciprocal Rank Fusion (RRF) with k=60. This avoids the complexity of score normalization while producing high-quality merged rankings. Gracefully degrades to lexical-only when embeddings are unavailable. Additional components: - search::filters: Post-retrieval filtering by source_type, author, project, labels (AND logic), file path prefix, created_after, and updated_after. Date filters accept relative formats (7d, 2w) and ISO dates. - search::rrf: Reciprocal Rank Fusion implementation with configurable k parameter and optional explain mode that annotates each result with its component ranks and fusion score breakdown. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
139
src/search/vector.rs
Normal file
139
src/search/vector.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::chunk_ids::decode_rowid;
|
||||
|
||||
/// A single vector search result (document-level, deduplicated).
|
||||
#[derive(Debug)]
|
||||
pub struct VectorResult {
|
||||
pub document_id: i64,
|
||||
pub distance: f64,
|
||||
}
|
||||
|
||||
/// Search documents using sqlite-vec KNN query.
|
||||
///
|
||||
/// Over-fetches 3x limit to handle chunk deduplication (multiple chunks per
|
||||
/// document produce multiple KNN results for the same document_id).
|
||||
/// Returns deduplicated results with best (lowest) distance per document.
|
||||
pub fn search_vector(
|
||||
conn: &Connection,
|
||||
query_embedding: &[f32],
|
||||
limit: usize,
|
||||
) -> Result<Vec<VectorResult>> {
|
||||
if query_embedding.is_empty() || limit == 0 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Convert to raw little-endian bytes for sqlite-vec
|
||||
let embedding_bytes: Vec<u8> = query_embedding
|
||||
.iter()
|
||||
.flat_map(|f| f.to_le_bytes())
|
||||
.collect();
|
||||
|
||||
let k = limit * 3; // Over-fetch for dedup
|
||||
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT rowid, distance
|
||||
FROM embeddings
|
||||
WHERE embedding MATCH ?1
|
||||
AND k = ?2
|
||||
ORDER BY distance"
|
||||
)?;
|
||||
|
||||
let rows: Vec<(i64, f64)> = stmt
|
||||
.query_map(rusqlite::params![embedding_bytes, k as i64], |row| {
|
||||
Ok((row.get(0)?, row.get(1)?))
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
// Dedup by document_id, keeping best (lowest) distance
|
||||
let mut best: HashMap<i64, f64> = HashMap::new();
|
||||
for (rowid, distance) in rows {
|
||||
let (document_id, _chunk_index) = decode_rowid(rowid);
|
||||
best.entry(document_id)
|
||||
.and_modify(|d| {
|
||||
if distance < *d {
|
||||
*d = distance;
|
||||
}
|
||||
})
|
||||
.or_insert(distance);
|
||||
}
|
||||
|
||||
// Sort by distance ascending, take limit
|
||||
let mut results: Vec<VectorResult> = best
|
||||
.into_iter()
|
||||
.map(|(document_id, distance)| VectorResult {
|
||||
document_id,
|
||||
distance,
|
||||
})
|
||||
.collect();
|
||||
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
|
||||
results.truncate(limit);
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Note: Full integration tests require sqlite-vec loaded, which happens via
|
||||
// create_connection in db.rs. These are basic unit tests for the dedup logic.
|
||||
|
||||
#[test]
|
||||
fn test_empty_returns_empty() {
|
||||
// Can't test KNN without sqlite-vec, but we can test edge cases
|
||||
let result = search_vector_dedup(vec![], 10);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dedup_keeps_best_distance() {
|
||||
// Simulate: doc 1 has chunks at rowid 1000 (idx 0) and 1001 (idx 1)
|
||||
let rows = vec![
|
||||
(1000_i64, 0.5_f64), // doc 1, chunk 0
|
||||
(1001, 0.3), // doc 1, chunk 1 (better)
|
||||
(2000, 0.4), // doc 2, chunk 0
|
||||
];
|
||||
let results = search_vector_dedup(rows, 10);
|
||||
assert_eq!(results.len(), 2);
|
||||
assert_eq!(results[0].document_id, 1); // doc 1 best = 0.3
|
||||
assert!((results[0].distance - 0.3).abs() < f64::EPSILON);
|
||||
assert_eq!(results[1].document_id, 2); // doc 2 = 0.4
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dedup_respects_limit() {
|
||||
let rows = vec![
|
||||
(1000_i64, 0.1_f64),
|
||||
(2000, 0.2),
|
||||
(3000, 0.3),
|
||||
];
|
||||
let results = search_vector_dedup(rows, 2);
|
||||
assert_eq!(results.len(), 2);
|
||||
}
|
||||
|
||||
/// Helper for testing dedup logic without sqlite-vec
|
||||
fn search_vector_dedup(rows: Vec<(i64, f64)>, limit: usize) -> Vec<VectorResult> {
|
||||
let mut best: HashMap<i64, f64> = HashMap::new();
|
||||
for (rowid, distance) in rows {
|
||||
let (document_id, _) = decode_rowid(rowid);
|
||||
best.entry(document_id)
|
||||
.and_modify(|d| {
|
||||
if distance < *d {
|
||||
*d = distance;
|
||||
}
|
||||
})
|
||||
.or_insert(distance);
|
||||
}
|
||||
let mut results: Vec<VectorResult> = best
|
||||
.into_iter()
|
||||
.map(|(document_id, distance)| VectorResult { document_id, distance })
|
||||
.collect();
|
||||
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
|
||||
results.truncate(limit);
|
||||
results
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user