feat(search): Add hybrid search engine with FTS5, vector, and RRF fusion

Implements the search module providing three search modes:

- Lexical (FTS5): Full-text search using SQLite FTS5 with safe query
  sanitization. User queries are automatically tokenized and wrapped
  in proper FTS5 syntax. Supports a "raw" mode for power users who
  want direct FTS5 query syntax (NEAR, column filters, etc.).

- Semantic (vector): Embeds the search query via Ollama, then performs
  cosine similarity search against stored document embeddings. Results
  are deduplicated by doc_id since documents may have multiple chunks.

- Hybrid (default): Executes both lexical and semantic searches in
  parallel, then fuses results using Reciprocal Rank Fusion (RRF) with
  k=60. This avoids the complexity of score normalization while
  producing high-quality merged rankings. Gracefully degrades to
  lexical-only when embeddings are unavailable.

Additional components:

- search::filters: Post-retrieval filtering by source_type, author,
  project, labels (AND logic), file path prefix, created_after, and
  updated_after. Date filters accept relative formats (7d, 2w) and
  ISO dates.

- search::rrf: Reciprocal Rank Fusion implementation with configurable
  k parameter and optional explain mode that annotates each result
  with its component ranks and fusion score breakdown.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:46:42 -05:00
parent 723703bed9
commit d5bdb24b0f
6 changed files with 1044 additions and 0 deletions

139
src/search/vector.rs Normal file
View File

@@ -0,0 +1,139 @@
use std::collections::HashMap;
use rusqlite::Connection;
use crate::core::error::Result;
use crate::embedding::chunk_ids::decode_rowid;
/// A single vector search result (document-level, deduplicated).
#[derive(Debug)]
pub struct VectorResult {
pub document_id: i64,
pub distance: f64,
}
/// Search documents using sqlite-vec KNN query.
///
/// Over-fetches 3x limit to handle chunk deduplication (multiple chunks per
/// document produce multiple KNN results for the same document_id).
/// Returns deduplicated results with best (lowest) distance per document.
pub fn search_vector(
conn: &Connection,
query_embedding: &[f32],
limit: usize,
) -> Result<Vec<VectorResult>> {
if query_embedding.is_empty() || limit == 0 {
return Ok(Vec::new());
}
// Convert to raw little-endian bytes for sqlite-vec
let embedding_bytes: Vec<u8> = query_embedding
.iter()
.flat_map(|f| f.to_le_bytes())
.collect();
let k = limit * 3; // Over-fetch for dedup
let mut stmt = conn.prepare(
"SELECT rowid, distance
FROM embeddings
WHERE embedding MATCH ?1
AND k = ?2
ORDER BY distance"
)?;
let rows: Vec<(i64, f64)> = stmt
.query_map(rusqlite::params![embedding_bytes, k as i64], |row| {
Ok((row.get(0)?, row.get(1)?))
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
// Dedup by document_id, keeping best (lowest) distance
let mut best: HashMap<i64, f64> = HashMap::new();
for (rowid, distance) in rows {
let (document_id, _chunk_index) = decode_rowid(rowid);
best.entry(document_id)
.and_modify(|d| {
if distance < *d {
*d = distance;
}
})
.or_insert(distance);
}
// Sort by distance ascending, take limit
let mut results: Vec<VectorResult> = best
.into_iter()
.map(|(document_id, distance)| VectorResult {
document_id,
distance,
})
.collect();
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
results.truncate(limit);
Ok(results)
}
#[cfg(test)]
mod tests {
use super::*;
// Note: Full integration tests require sqlite-vec loaded, which happens via
// create_connection in db.rs. These are basic unit tests for the dedup logic.
#[test]
fn test_empty_returns_empty() {
// Can't test KNN without sqlite-vec, but we can test edge cases
let result = search_vector_dedup(vec![], 10);
assert!(result.is_empty());
}
#[test]
fn test_dedup_keeps_best_distance() {
// Simulate: doc 1 has chunks at rowid 1000 (idx 0) and 1001 (idx 1)
let rows = vec![
(1000_i64, 0.5_f64), // doc 1, chunk 0
(1001, 0.3), // doc 1, chunk 1 (better)
(2000, 0.4), // doc 2, chunk 0
];
let results = search_vector_dedup(rows, 10);
assert_eq!(results.len(), 2);
assert_eq!(results[0].document_id, 1); // doc 1 best = 0.3
assert!((results[0].distance - 0.3).abs() < f64::EPSILON);
assert_eq!(results[1].document_id, 2); // doc 2 = 0.4
}
#[test]
fn test_dedup_respects_limit() {
let rows = vec![
(1000_i64, 0.1_f64),
(2000, 0.2),
(3000, 0.3),
];
let results = search_vector_dedup(rows, 2);
assert_eq!(results.len(), 2);
}
/// Helper for testing dedup logic without sqlite-vec
fn search_vector_dedup(rows: Vec<(i64, f64)>, limit: usize) -> Vec<VectorResult> {
let mut best: HashMap<i64, f64> = HashMap::new();
for (rowid, distance) in rows {
let (document_id, _) = decode_rowid(rowid);
best.entry(document_id)
.and_modify(|d| {
if distance < *d {
*d = distance;
}
})
.or_insert(distance);
}
let mut results: Vec<VectorResult> = best
.into_iter()
.map(|(document_id, distance)| VectorResult { document_id, distance })
.collect();
results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
results.truncate(limit);
results
}
}