feat(search): Add hybrid search engine with FTS5, vector, and RRF fusion
Implements the search module providing three search modes:

- Lexical (FTS5): Full-text search using SQLite FTS5 with safe query sanitization. User queries are automatically tokenized and wrapped in proper FTS5 syntax. Supports a "raw" mode for power users who want direct FTS5 query syntax (NEAR, column filters, etc.).
- Semantic (vector): Embeds the search query via Ollama, then performs cosine similarity search against stored document embeddings. Results are deduplicated by doc_id since documents may have multiple chunks.
- Hybrid (default): Executes both lexical and semantic searches in parallel, then fuses results using Reciprocal Rank Fusion (RRF) with k=60. This avoids the complexity of score normalization while producing high-quality merged rankings. Gracefully degrades to lexical-only when embeddings are unavailable.

Additional components:

- search::filters: Post-retrieval filtering by source_type, author, project, labels (AND logic), file path prefix, created_after, and updated_after. Date filters accept relative formats (7d, 2w) and ISO dates.
- search::rrf: Reciprocal Rank Fusion implementation with configurable k parameter and optional explain mode that annotates each result with its component ranks and fusion score breakdown.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
228
src/search/fts.rs
Normal file
228
src/search/fts.rs
Normal file
@@ -0,0 +1,228 @@
|
||||
use crate::core::error::Result;
|
||||
use rusqlite::Connection;
|
||||
|
||||
/// Selects how a user-supplied query string is turned into an FTS5 MATCH expression.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FtsQueryMode {
    /// Sanitizing mode: every whitespace-separated token is emitted as a quoted
    /// FTS5 string, and a trailing `*` is kept as a prefix operator when the rest
    /// of the token consists only of alphanumerics and underscores.
    Safe,
    /// Pass-through mode: the query text is handed to FTS5 exactly as written,
    /// for callers who want to use FTS5 query syntax directly.
    Raw,
}
|
||||
|
||||
/// A single row returned by an FTS5 search.
///
/// Derives `Clone` and `PartialEq` in addition to `Debug` so results can be
/// duplicated and compared directly (for example in tests). `Eq` is not derived
/// because the score is a float.
#[derive(Debug, Clone, PartialEq)]
pub struct FtsResult {
    /// Identifier of the matching document.
    pub document_id: i64,
    /// BM25 relevance score for the match; lower values indicate a better match.
    pub bm25_score: f64,
    /// Text excerpt for the match.
    pub snippet: String,
}
|
||||
|
||||
/// Convert raw user input into a safe FTS5 query.
|
||||
///
|
||||
/// Safe mode:
|
||||
/// - Splits on whitespace
|
||||
/// - Wraps each token in double quotes (escaping internal quotes)
|
||||
/// - Preserves trailing `*` on alphanumeric-only tokens (prefix search)
|
||||
///
|
||||
/// Raw mode: passes through unchanged.
|
||||
pub fn to_fts_query(raw: &str, mode: FtsQueryMode) -> String {
|
||||
match mode {
|
||||
FtsQueryMode::Raw => raw.to_string(),
|
||||
FtsQueryMode::Safe => {
|
||||
let trimmed = raw.trim();
|
||||
if trimmed.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
let tokens: Vec<String> = trimmed
|
||||
.split_whitespace()
|
||||
.map(|token| {
|
||||
// Check if token ends with * and the rest is alphanumeric
|
||||
if token.ends_with('*') {
|
||||
let stem = &token[..token.len() - 1];
|
||||
if !stem.is_empty() && stem.chars().all(|c| c.is_alphanumeric() || c == '_') {
|
||||
// Preserve prefix search: "stem"*
|
||||
let escaped = stem.replace('"', "\"\"");
|
||||
return format!("\"{}\"*", escaped);
|
||||
}
|
||||
}
|
||||
// Default: wrap in quotes, escape internal quotes
|
||||
let escaped = token.replace('"', "\"\"");
|
||||
format!("\"{}\"", escaped)
|
||||
})
|
||||
.collect();
|
||||
|
||||
tokens.join(" ")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute an FTS5 search query.
|
||||
///
|
||||
/// Returns results ranked by BM25 score (lower = better match) with
|
||||
/// contextual snippets highlighting matches.
|
||||
pub fn search_fts(
|
||||
conn: &Connection,
|
||||
query: &str,
|
||||
limit: usize,
|
||||
mode: FtsQueryMode,
|
||||
) -> Result<Vec<FtsResult>> {
|
||||
let fts_query = to_fts_query(query, mode);
|
||||
if fts_query.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let sql = r#"
|
||||
SELECT d.id, bm25(documents_fts) AS score,
|
||||
snippet(documents_fts, 1, '<mark>', '</mark>', '...', 64) AS snip
|
||||
FROM documents_fts
|
||||
JOIN documents d ON d.id = documents_fts.rowid
|
||||
WHERE documents_fts MATCH ?1
|
||||
ORDER BY score
|
||||
LIMIT ?2
|
||||
"#;
|
||||
|
||||
let mut stmt = conn.prepare(sql)?;
|
||||
let results = stmt
|
||||
.query_map(rusqlite::params![fts_query, limit as i64], |row| {
|
||||
Ok(FtsResult {
|
||||
document_id: row.get(0)?,
|
||||
bm25_score: row.get(1)?,
|
||||
snippet: row.get(2)?,
|
||||
})
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Build a plain-text snippet from `content_text` for results without an FTS snippet.
///
/// Content of at most `max_chars` characters (Unicode scalar values, not bytes)
/// is returned unchanged. Longer content is cut to its first `max_chars`
/// characters, backed up to the nearest preceding whitespace so that no word is
/// split, stripped of trailing whitespace, and suffixed with `"..."`.
///
/// Edge cases:
/// - If the cut already falls between two words (the next character is
///   whitespace), the last word is kept rather than discarded.
/// - If the window contains no whitespace, the hard cut is used as is.
/// - If backing up to a word boundary would leave only whitespace, the hard cut
///   is used instead of returning a bare `"..."`.
pub fn generate_fallback_snippet(content_text: &str, max_chars: usize) -> String {
    // A single pass finds the first character beyond the limit; if there is
    // none, the content already fits and is returned whole.
    let (byte_end, next_char) = match content_text.char_indices().nth(max_chars) {
        Some(hit) => hit,
        None => return content_text.to_string(),
    };
    // `byte_end` comes from `char_indices`, so it is always a char boundary.
    let window = &content_text[..byte_end];

    let at_word_boundary = if next_char.is_whitespace() {
        // The cut sits exactly at the end of a word: nothing to back up over.
        window
    } else {
        window
            .rfind(char::is_whitespace)
            .map_or(window, |boundary| &window[..boundary])
    };

    let trimmed = at_word_boundary.trim_end();
    let kept = if trimmed.is_empty() {
        window.trim_end()
    } else {
        trimmed
    };

    format!("{}...", kept)
}
|
||||
|
||||
/// Get the best snippet: prefer FTS snippet, fall back to truncated content.
|
||||
pub fn get_result_snippet(fts_snippet: Option<&str>, content_text: &str) -> String {
|
||||
match fts_snippet {
|
||||
Some(s) if !s.is_empty() => s.to_string(),
|
||||
_ => generate_fallback_snippet(content_text, 200),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for query sanitization and snippet selection. The database
    //! path (`search_fts`) is not exercised here.

    use super::*;

    #[test]
    fn test_safe_query_basic() {
        let result = to_fts_query("auth error", FtsQueryMode::Safe);
        assert_eq!(result, "\"auth\" \"error\"");
    }

    #[test]
    fn test_safe_query_prefix() {
        let result = to_fts_query("auth*", FtsQueryMode::Safe);
        assert_eq!(result, "\"auth\"*");
    }

    #[test]
    fn test_safe_query_special_chars() {
        let result = to_fts_query("C++", FtsQueryMode::Safe);
        assert_eq!(result, "\"C++\"");
    }

    #[test]
    fn test_safe_query_dash() {
        let result = to_fts_query("-DWITH_SSL", FtsQueryMode::Safe);
        assert_eq!(result, "\"-DWITH_SSL\"");
    }

    #[test]
    fn test_safe_query_quotes() {
        let result = to_fts_query("he said \"hello\"", FtsQueryMode::Safe);
        assert_eq!(result, "\"he\" \"said\" \"\"\"hello\"\"\"");
    }

    #[test]
    fn test_raw_mode_passthrough() {
        let result = to_fts_query("auth OR error", FtsQueryMode::Raw);
        assert_eq!(result, "auth OR error");
    }

    #[test]
    fn test_empty_query() {
        let result = to_fts_query("", FtsQueryMode::Safe);
        assert_eq!(result, "");

        let result = to_fts_query(" ", FtsQueryMode::Safe);
        assert_eq!(result, "");
    }

    #[test]
    fn test_prefix_only_alphanumeric() {
        // Non-alphanumeric stem: C++* must NOT be treated as a prefix search.
        let result = to_fts_query("C++*", FtsQueryMode::Safe);
        assert_eq!(result, "\"C++*\"");

        // Pure alphanumeric stem: auth* is a prefix search.
        let result = to_fts_query("auth*", FtsQueryMode::Safe);
        assert_eq!(result, "\"auth\"*");
    }

    #[test]
    fn test_prefix_with_underscore() {
        let result = to_fts_query("jwt_token*", FtsQueryMode::Safe);
        assert_eq!(result, "\"jwt_token\"*");
    }

    #[test]
    fn test_fallback_snippet_short() {
        let result = generate_fallback_snippet("Short content", 200);
        assert_eq!(result, "Short content");
    }

    #[test]
    fn test_fallback_snippet_exact_length() {
        // Content exactly at the limit is not truncated.
        let result = generate_fallback_snippet("abcde", 5);
        assert_eq!(result, "abcde");
    }

    #[test]
    fn test_fallback_snippet_word_boundary() {
        let content = "This is a moderately long piece of text that should be truncated at a word boundary for readability purposes";
        let result = generate_fallback_snippet(content, 50);
        // The 50-character window ends inside "should", so the cut backs up to
        // the end of "that".
        assert_eq!(result, "This is a moderately long piece of text that...");
        // Never longer than the limit plus the three-character ellipsis.
        assert!(result.chars().count() <= 53);
    }

    #[test]
    fn test_fallback_snippet_no_space() {
        // Without any word boundary the text is cut hard at the limit.
        let result = generate_fallback_snippet("abcdefghij", 4);
        assert_eq!(result, "abcd...");
    }

    #[test]
    fn test_fallback_snippet_multibyte() {
        // The limit counts characters, not bytes; slicing must not split a
        // multi-byte character.
        let content = "\u{e9}\u{e9}\u{e9}\u{e9}\u{e9} \u{e9}\u{e9}\u{e9}\u{e9}\u{e9} \u{e9}\u{e9}\u{e9}\u{e9}\u{e9}";
        let result = generate_fallback_snippet(content, 8);
        assert_eq!(result, "\u{e9}\u{e9}\u{e9}\u{e9}\u{e9}...");
    }

    #[test]
    fn test_get_result_snippet_prefers_fts() {
        let result = get_result_snippet(Some("FTS <mark>match</mark>"), "full content text");
        assert_eq!(result, "FTS <mark>match</mark>");
    }

    #[test]
    fn test_get_result_snippet_fallback() {
        let result = get_result_snippet(None, "full content text");
        assert_eq!(result, "full content text");
    }

    #[test]
    fn test_get_result_snippet_empty_fts() {
        let result = get_result_snippet(Some(""), "full content text");
        assert_eq!(result, "full content text");
    }
}
|
||||
Reference in New Issue
Block a user