test: Add test suites for embedding, FTS, hybrid search, and golden queries

Four new test modules covering the search infrastructure:

- tests/embedding.rs: Unit tests for the embedding pipeline including
  chunk ID encoding/decoding, change detection, and document chunking
  with overlap verification.

- tests/fts_search.rs: Integration tests for FTS5 search including
  safe query sanitization, multi-term queries, prefix matching, and
  the raw FTS mode for power users.

- tests/hybrid_search.rs: End-to-end tests for hybrid search mode
  including RRF fusion correctness, graceful degradation when
  embeddings are unavailable, and filter application.

- tests/golden_query_tests.rs: Golden query tests using fixtures
  from tests/fixtures/golden_queries.json to verify search quality
  against known-good query/result pairs. Ensures ranking stability
  across implementation changes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:47:19 -05:00
parent daf5a73019
commit d235f2b4dd
5 changed files with 931 additions and 0 deletions

206
tests/hybrid_search.rs Normal file
View File

@@ -0,0 +1,206 @@
//! Integration tests for hybrid search combining FTS + vector.
//!
//! Exercises lexical and hybrid search end to end, plus RRF ranking and
//! filter application, and verifies graceful degradation when embeddings
//! are unavailable. Semantic mode is only covered by an enum-variant check.
use lore::core::db::create_connection;
use lore::search::{FtsQueryMode, SearchFilters, SearchMode, search_fts, search_hybrid};
use rusqlite::Connection;
use std::path::PathBuf;
use tempfile::TempDir;
/// Creates a migrated SQLite database inside a fresh temporary directory.
///
/// Opens `test.db` through `create_connection`, applies migrations 001
/// through 009 from `$CARGO_MANIFEST_DIR/migrations` in ascending order, and
/// seeds one project row (`id = 1`, `group/project`) for documents to
/// reference.
///
/// The [`TempDir`] guard is returned together with the connection so callers
/// can keep it alive for the duration of the test; `tempfile` removes the
/// directory (and the database file inside it) when the guard is dropped.
///
/// # Panics
///
/// Panics if the temporary directory or connection cannot be created, if a
/// migration file is missing or unreadable, or if any SQL statement fails.
fn create_test_db() -> (TempDir, Connection) {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let conn = create_connection(&db_path).unwrap();

    // List the migrations directory once and sort by file name: `read_dir`
    // gives no ordering guarantee, so sorting makes the file chosen for each
    // version deterministic, and the directory is not re-read per version.
    let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
    let mut migrations: Vec<(String, PathBuf)> = std::fs::read_dir(&migrations_dir)
        .unwrap()
        .filter_map(|entry| entry.ok())
        .map(|entry| {
            (
                entry.file_name().to_string_lossy().into_owned(),
                entry.path(),
            )
        })
        .collect();
    migrations.sort();

    for version in 1..=9 {
        // Migration files are matched by their zero-padded numeric prefix.
        let prefix = format!("{:03}", version);
        let (_, path) = migrations
            .iter()
            .find(|(name, _)| name.starts_with(&prefix))
            .unwrap_or_else(|| panic!("Migration {} not found", version));
        let sql = std::fs::read_to_string(path).unwrap();
        conn.execute_batch(&sql)
            .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
    }

    // Every test document is inserted with project_id = 1, so that project
    // row has to exist up front.
    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
        [],
    )
    .unwrap();

    (tmp, conn)
}
/// Inserts a single row into `documents`, attached to project 1.
///
/// `id` is bound once and reused as the `source_id` and as the suffix of the
/// generated `content_hash` (`hash_<id>`) and `url`
/// (`https://example.com/<id>`); the author is always `testuser`.
///
/// # Panics
///
/// Panics if the insert fails.
fn insert_document(conn: &Connection, id: i64, source_type: &str, title: &str, content: &str) {
    // The continuation line of this literal is kept flush-left on purpose so
    // that no indentation whitespace ends up inside the SQL text.
    let sql = "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url, author_username)
VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://example.com/' || ?1, 'testuser')";

    conn.execute(sql, rusqlite::params![id, source_type, title, content])
        .unwrap();
}
/// Lexical mode with no Ollama client finds the matching document first and
/// emits no warning that mentions Ollama.
#[test]
fn lexical_mode_uses_fts_only() {
    let (_tmp, conn) = create_test_db();
    insert_document(&conn, 1, "issue", "Authentication bug", "OAuth token refresh fails silently.");
    insert_document(&conn, 2, "issue", "Database migration", "Migration script crashes on PostgreSQL.");

    let filters = SearchFilters {
        limit: 10,
        ..SearchFilters::default()
    };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let search = search_hybrid(
        &conn,
        None,
        "authentication",
        SearchMode::Lexical,
        &filters,
        FtsQueryMode::Safe,
    );
    let (hits, warnings) = runtime.block_on(search).unwrap();

    assert!(!hits.is_empty(), "Lexical search should find results");
    assert_eq!(hits[0].document_id, 1);

    // Lexical mode should not produce Ollama-related warnings
    let mentions_ollama = warnings.iter().any(|w| w.contains("Ollama"));
    assert!(!mentions_ollama, "Lexical mode should not warn about Ollama");
}
/// FTS search works on a database that only has migrations 001 through 008
/// applied, i.e. before the migration that `create_test_db` additionally
/// applies (009).
///
/// The connection is opened directly with `Connection::open_in_memory`
/// rather than through `create_connection`, so nothing beyond plain SQLite is
/// set up for it.
#[test]
fn lexical_mode_no_embeddings_required() {
    // Use in-memory DB without sqlite-vec for pure FTS
    let conn = Connection::open_in_memory().unwrap();
    conn.pragma_update(None, "foreign_keys", "ON").unwrap();

    // List the migrations directory once and sort by file name: `read_dir`
    // gives no ordering guarantee, so sorting makes the file chosen for each
    // version deterministic, and the directory is not re-read per version.
    let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
    let mut migrations: Vec<(String, PathBuf)> = std::fs::read_dir(&migrations_dir)
        .unwrap()
        .filter_map(|entry| entry.ok())
        .map(|entry| {
            (
                entry.file_name().to_string_lossy().into_owned(),
                entry.path(),
            )
        })
        .collect();
    migrations.sort();

    // Only apply through migration 008 (FTS5, no embeddings)
    for version in 1..=8 {
        let prefix = format!("{:03}", version);
        // Fail with the migration number instead of a bare index panic when
        // a file is missing, and report which migration failed to apply.
        let (_, path) = migrations
            .iter()
            .find(|(name, _)| name.starts_with(&prefix))
            .unwrap_or_else(|| panic!("Migration {} not found", version));
        let sql = std::fs::read_to_string(path).unwrap();
        conn.execute_batch(&sql)
            .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
    }

    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
        [],
    )
    .unwrap();

    // The continuation line of this literal is kept flush-left on purpose so
    // that no indentation whitespace ends up inside the SQL text.
    conn.execute(
        "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
VALUES (1, 'issue', 1, 1, 'Test issue', 'Content about testing and verification.', 'h1', 'https://example.com/1')",
        [],
    )
    .unwrap();

    let results = search_fts(&conn, "testing", 10, FtsQueryMode::Safe).unwrap();
    assert!(!results.is_empty(), "FTS should work without embeddings tables");
}
/// Hybrid mode without an Ollama client still returns results and reports at
/// least one warning that hints at the degradation.
#[test]
fn hybrid_mode_degrades_to_fts_without_client() {
    // Lowercase substrings, any one of which counts as a degradation hint
    // when found in a (lowercased) warning.
    const DEGRADATION_HINTS: [&str; 5] = ["vector", "ollama", "client", "fallback", "fts"];

    let (_tmp, conn) = create_test_db();
    insert_document(&conn, 1, "issue", "Performance issue", "Application is slow under load.");

    let filters = SearchFilters {
        limit: 10,
        ..SearchFilters::default()
    };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let search = search_hybrid(
        &conn,
        None, // No Ollama client
        "performance slow",
        SearchMode::Hybrid,
        &filters,
        FtsQueryMode::Safe,
    );
    let (hits, warnings) = runtime.block_on(search).unwrap();

    assert!(!hits.is_empty(), "Should fall back to FTS results");

    // Should warn about missing Ollama client
    let has_degradation_warning = warnings.iter().any(|w| {
        let lowered = w.to_lowercase();
        DEGRADATION_HINTS.iter().any(|hint| lowered.contains(*hint))
    });
    assert!(
        has_degradation_warning,
        "Should produce a degradation warning, got: {:?}",
        warnings
    );
}
/// `rank_rrf` returns one entry per document and gives each a positive score
/// when both documents appear in both input lists.
///
/// Only the result count and score sign are checked; the relative order of
/// the two documents is not asserted.
#[test]
fn rrf_ranking_combines_signals() {
    use lore::search::rank_rrf;

    // The two signals list the documents in opposite order.
    let by_vector = vec![(1_i64, 0.1), (2, 0.5)]; // doc 1 closer
    let by_fts = vec![(2_i64, -5.0), (1, -3.0)]; // doc 2 higher BM25

    let fused = rank_rrf(&by_vector, &by_fts);
    assert_eq!(fused.len(), 2, "Should return both documents");

    // Both docs appear in both signals, so both get RRF scores
    assert!(
        fused.iter().all(|entry| entry.rrf_score > 0.0),
        "RRF score should be positive"
    );
}
/// `apply_filters` with a source-type filter of `Issue` keeps only the issue
/// document out of one issue and one merge request.
#[test]
fn filters_by_source_type() {
    use lore::documents::SourceType;
    use lore::search::apply_filters;

    let (_tmp, conn) = create_test_db();
    insert_document(&conn, 1, "issue", "Bug report", "Authentication bug in login flow.");
    insert_document(&conn, 2, "merge_request", "Fix auth", "Fixed authentication issue.");

    let issues_only = SearchFilters {
        source_type: Some(SourceType::Issue),
        limit: 10,
        ..SearchFilters::default()
    };

    // Offer both document ids; only id 1 (the issue) should survive.
    let candidate_ids = vec![1, 2];
    let kept = apply_filters(&conn, &candidate_ids, &issues_only).unwrap();

    assert_eq!(kept.len(), 1, "Filter should remove non-issue documents");
    assert_eq!(kept[0], 1, "Only issue document should remain");
}
/// The three `SearchMode` variants exist and compare as pairwise unequal.
#[test]
fn search_mode_variants_exist() {
    // Verify all enum variants compile and are distinct
    assert_ne!(SearchMode::Hybrid, SearchMode::Lexical);
    assert_ne!(SearchMode::Hybrid, SearchMode::Semantic);
    assert_ne!(SearchMode::Lexical, SearchMode::Semantic);
}