diff --git a/tests/embedding.rs b/tests/embedding.rs new file mode 100644 index 0000000..eaf5bb3 --- /dev/null +++ b/tests/embedding.rs @@ -0,0 +1,183 @@ +//! Integration tests for embedding storage and vector search. +//! +//! These tests create an in-memory SQLite database with sqlite-vec loaded, +//! apply all migrations through 009 (embeddings), and verify KNN search +//! and metadata operations. + +use lore::core::db::create_connection; +use rusqlite::Connection; +use std::path::PathBuf; +use tempfile::TempDir; + +/// Create a test DB on disk (required for sqlite-vec which needs the extension loaded). +/// Uses create_connection to get the sqlite-vec extension registered. +fn create_test_db() -> (TempDir, Connection) { + let tmp = TempDir::new().unwrap(); + let db_path = tmp.path().join("test.db"); + let conn = create_connection(&db_path).unwrap(); + + let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); + + for version in 1..=9 { + let entries: Vec<_> = std::fs::read_dir(&migrations_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_string_lossy() + .starts_with(&format!("{:03}", version)) + }) + .collect(); + + assert!(!entries.is_empty(), "Migration {} not found", version); + let sql = std::fs::read_to_string(entries[0].path()).unwrap(); + conn.execute_batch(&sql) + .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e)); + } + + // Seed a project + conn.execute( + "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')", + [], + ) + .unwrap(); + + (tmp, conn) +} + +fn insert_document(conn: &Connection, id: i64, title: &str, content: &str) { + conn.execute( + "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url) + VALUES (?1, 'issue', ?1, 1, ?2, ?3, 'hash_' || ?1, 'https://example.com/' || ?1)", + rusqlite::params![id, title, content], + ) + .unwrap(); +} + +/// Create a 768-dim vector with a 
specific dimension set to 1.0 (unit vector along axis). +fn axis_vector(dim: usize) -> Vec<f32> { + let mut v = vec![0.0f32; 768]; + v[dim] = 1.0; + v +} + +fn insert_embedding(conn: &Connection, doc_id: i64, chunk_index: i64, embedding: &[f32]) { + let rowid = doc_id * 1000 + chunk_index; + let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect(); + + conn.execute( + "INSERT INTO embeddings (rowid, embedding) VALUES (?1, ?2)", + rusqlite::params![rowid, embedding_bytes], + ) + .unwrap(); + + let now = chrono::Utc::now().timestamp_millis(); + conn.execute( + "INSERT INTO embedding_metadata + (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count) + VALUES (?1, ?2, 'nomic-embed-text', 768, 'hash_' || ?1, 'chunk_hash', ?3, 1)", + rusqlite::params![doc_id, chunk_index, now], + ) + .unwrap(); +} + +#[test] +fn knn_search_returns_nearest_neighbors() { + let (_tmp, conn) = create_test_db(); + + insert_document(&conn, 1, "Doc A", "Content about authentication."); + insert_document(&conn, 2, "Doc B", "Content about database optimization."); + insert_document(&conn, 3, "Doc C", "Content about logging infrastructure."); + + // Doc 1: axis 0, Doc 2: axis 1, Doc 3: axis 2 + insert_embedding(&conn, 1, 0, &axis_vector(0)); + insert_embedding(&conn, 2, 0, &axis_vector(1)); + insert_embedding(&conn, 3, 0, &axis_vector(2)); + + // Query vector close to axis 0 (should match doc 1) + let mut query = vec![0.0f32; 768]; + query[0] = 0.9; + query[1] = 0.1; + + let results = lore::search::search_vector(&conn, &query, 10).unwrap(); + + assert!(!results.is_empty(), "Should return at least one result"); + assert_eq!(results[0].document_id, 1, "Nearest neighbor should be doc 1"); +} + +#[test] +fn knn_search_respects_limit() { + let (_tmp, conn) = create_test_db(); + + for i in 1..=10 { + insert_document(&conn, i, &format!("Doc {}", i), "Some content."); + insert_embedding(&conn, i, 0, &axis_vector(i as usize)); + } + + let results 
= lore::search::search_vector(&conn, &axis_vector(0), 3).unwrap(); + assert!(results.len() <= 3, "Results should be capped at limit"); +} + +#[test] +fn knn_search_deduplicates_chunks() { + let (_tmp, conn) = create_test_db(); + + insert_document(&conn, 1, "Multi-chunk doc", "Very long content that was chunked."); + + // Same document, two chunks, both similar to query + let mut v1 = vec![0.0f32; 768]; + v1[0] = 1.0; + let mut v2 = vec![0.0f32; 768]; + v2[0] = 0.95; + v2[1] = 0.05; + + insert_embedding(&conn, 1, 0, &v1); + insert_embedding(&conn, 1, 1, &v2); + + let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap(); + + // Should deduplicate: same document_id appears at most once + let unique_docs: std::collections::HashSet<i64> = results.iter().map(|r| r.document_id).collect(); + assert_eq!( + unique_docs.len(), + results.len(), + "Each document should appear at most once in results" + ); +} + +#[test] +fn orphan_trigger_deletes_embeddings_on_document_delete() { + let (_tmp, conn) = create_test_db(); + + insert_document(&conn, 1, "Will be deleted", "Content."); + insert_embedding(&conn, 1, 0, &axis_vector(0)); + + // Verify embedding exists + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0)) + .unwrap(); + assert_eq!(count, 1, "Embedding should exist before delete"); + + // Delete the document + conn.execute("DELETE FROM documents WHERE id = 1", []).unwrap(); + + // Verify embedding was cascade-deleted via trigger + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0)) + .unwrap(); + assert_eq!(count, 0, "Trigger should delete embeddings when document is deleted"); + + // Verify metadata was cascade-deleted via FK + let meta_count: i64 = conn + .query_row("SELECT COUNT(*) FROM embedding_metadata WHERE document_id = 1", [], |r| r.get(0)) + .unwrap(); + assert_eq!(meta_count, 0, "Metadata should be cascade-deleted"); +} + +#[test] +fn 
empty_database_returns_no_results() { + let (_tmp, conn) = create_test_db(); + + let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap(); + assert!(results.is_empty(), "Empty DB should return no results"); +} diff --git a/tests/fixtures/golden_queries.json b/tests/fixtures/golden_queries.json new file mode 100644 index 0000000..537d593 --- /dev/null +++ b/tests/fixtures/golden_queries.json @@ -0,0 +1,65 @@ +[ + { + "query": "authentication login", + "mode": "lexical", + "filters": {}, + "expected_doc_ids": [1], + "min_results": 1, + "max_rank": 10, + "description": "Basic auth keywords should find the OAuth login issue" + }, + { + "query": "database migration", + "mode": "lexical", + "filters": {}, + "expected_doc_ids": [3], + "min_results": 1, + "max_rank": 10, + "description": "Database migration terms should find the migration issue" + }, + { + "query": "user profile", + "mode": "lexical", + "filters": {}, + "expected_doc_ids": [2], + "min_results": 1, + "max_rank": 10, + "description": "User profile keywords should find the profile MR" + }, + { + "query": "API rate limiting", + "mode": "lexical", + "filters": {}, + "expected_doc_ids": [5], + "min_results": 1, + "max_rank": 10, + "description": "Rate limiting query should find the discussion document" + }, + { + "query": "performance optimization", + "mode": "lexical", + "filters": {}, + "expected_doc_ids": [4], + "min_results": 1, + "max_rank": 10, + "description": "Performance terms should find the performance MR" + }, + { + "query": "token refresh", + "mode": "lexical", + "filters": {"source_type": "issue"}, + "expected_doc_ids": [1], + "min_results": 1, + "max_rank": 10, + "description": "Token refresh with issue filter should find auth issue only" + }, + { + "query": "CSS styling frontend", + "mode": "lexical", + "filters": {}, + "expected_doc_ids": [6], + "min_results": 1, + "max_rank": 10, + "description": "Frontend CSS query should find the UI improvements issue" + } +] diff --git 
a/tests/fts_search.rs b/tests/fts_search.rs new file mode 100644 index 0000000..ca3bf76 --- /dev/null +++ b/tests/fts_search.rs @@ -0,0 +1,198 @@ +//! Integration tests for FTS5 search. +//! +//! These tests create an in-memory SQLite database, apply migrations through 008 (FTS5), +//! seed documents, and verify search behavior. + +use rusqlite::Connection; + +fn create_test_db() -> Connection { + let conn = Connection::open_in_memory().unwrap(); + conn.pragma_update(None, "foreign_keys", "ON").unwrap(); + + let migrations_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); + + for version in 1..=8 { + let entries: Vec<_> = std::fs::read_dir(&migrations_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_string_lossy() + .starts_with(&format!("{:03}", version)) + }) + .collect(); + + assert!(!entries.is_empty(), "Migration {} not found", version); + let sql = std::fs::read_to_string(entries[0].path()).unwrap(); + conn.execute_batch(&sql) + .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e)); + } + + // Seed a project + conn.execute( + "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')", + [], + ) + .unwrap(); + + conn +} + +fn insert_document(conn: &Connection, id: i64, source_type: &str, title: &str, content: &str) { + conn.execute( + "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url) + VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://example.com/' || ?1)", + rusqlite::params![id, source_type, title, content], + ) + .unwrap(); +} + +#[test] +fn fts_basic_search() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "Authentication bug", "Users cannot login when using OAuth tokens. 
The JWT refresh fails silently."); + insert_document(&conn, 2, "merge_request", "Add user profile page", "This MR adds a new user profile page with avatar upload support."); + insert_document(&conn, 3, "issue", "Database migration failing", "The migration script crashes on PostgreSQL 14 due to deprecated syntax."); + + let results = lore::search::search_fts(&conn, "authentication login", 10, lore::search::FtsQueryMode::Safe).unwrap(); + + assert!(!results.is_empty(), "Expected at least one result for 'authentication login'"); + assert_eq!(results[0].document_id, 1, "Authentication issue should be top result"); +} + +#[test] +fn fts_stemming_matches() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "Running tests", "The test runner is executing integration tests."); + insert_document(&conn, 2, "issue", "Deployment config", "Deployment configuration for production servers."); + + // "running" should match "runner" and "executing" via porter stemmer + let results = lore::search::search_fts(&conn, "running", 10, lore::search::FtsQueryMode::Safe).unwrap(); + assert!(!results.is_empty(), "Stemming should match 'running' to 'runner'"); + assert_eq!(results[0].document_id, 1); +} + +#[test] +fn fts_empty_results() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "Bug fix", "Fixed a null pointer dereference in the parser."); + + let results = lore::search::search_fts(&conn, "kubernetes deployment helm", 10, lore::search::FtsQueryMode::Safe).unwrap(); + assert!(results.is_empty(), "No documents should match unrelated query"); +} + +#[test] +fn fts_special_characters_handled() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "C++ compiler", "The C++ compiler segfaults on template metaprogramming."); + + // Special characters should not crash the search + let results = lore::search::search_fts(&conn, "C++ compiler", 10, lore::search::FtsQueryMode::Safe).unwrap(); + // Safe mode sanitizes the query — it 
should still return results or at least not crash + assert!(results.len() <= 1); +} + +#[test] +fn fts_result_ordering_by_relevance() { + let conn = create_test_db(); + + // Doc 1: "authentication" in title and content + insert_document(&conn, 1, "issue", "Authentication system redesign", "The authentication system needs a complete redesign. Authentication flows are broken."); + // Doc 2: "authentication" only in content, once + insert_document(&conn, 2, "issue", "Login page update", "Updated the login page with better authentication error messages."); + // Doc 3: unrelated + insert_document(&conn, 3, "issue", "Database optimization", "Optimize database queries for faster response times."); + + let results = lore::search::search_fts(&conn, "authentication", 10, lore::search::FtsQueryMode::Safe).unwrap(); + + assert!(results.len() >= 2, "Should match at least 2 documents"); + // Doc 1 should rank higher (more occurrences of the term) + assert_eq!(results[0].document_id, 1, "Document with more term occurrences should rank first"); +} + +#[test] +fn fts_respects_limit() { + let conn = create_test_db(); + + for i in 1..=20 { + insert_document( + &conn, + i, + "issue", + &format!("Bug report {}", i), + &format!("This is bug report number {} about the login system.", i), + ); + } + + let results = lore::search::search_fts(&conn, "bug login", 5, lore::search::FtsQueryMode::Safe).unwrap(); + assert!(results.len() <= 5, "Results should be capped at limit"); +} + +#[test] +fn fts_snippet_generated() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "Performance issue", "The application performance degrades significantly when more than 100 users are connected simultaneously. 
Memory usage spikes to 4GB."); + + let results = lore::search::search_fts(&conn, "performance", 10, lore::search::FtsQueryMode::Safe).unwrap(); + + assert!(!results.is_empty()); + // Snippet should contain some text (may have FTS5 highlight markers) + assert!(!results[0].snippet.is_empty(), "Snippet should be generated"); +} + +#[test] +fn fts_triggers_sync_on_insert() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "Test document", "This is test content for FTS trigger verification."); + + // Verify FTS table has an entry via direct query + let fts_count: i64 = conn + .query_row("SELECT COUNT(*) FROM documents_fts WHERE documents_fts MATCH 'test'", [], |r| r.get(0)) + .unwrap(); + + assert_eq!(fts_count, 1, "FTS trigger should auto-index on INSERT"); +} + +#[test] +fn fts_triggers_sync_on_delete() { + let conn = create_test_db(); + + insert_document(&conn, 1, "issue", "Deletable document", "This content will be deleted from the index."); + + // Verify it's indexed + let before: i64 = conn + .query_row("SELECT COUNT(*) FROM documents_fts WHERE documents_fts MATCH 'deletable'", [], |r| r.get(0)) + .unwrap(); + assert_eq!(before, 1); + + // Delete the document + conn.execute("DELETE FROM documents WHERE id = 1", []).unwrap(); + + // Verify it's removed from FTS + let after: i64 = conn + .query_row("SELECT COUNT(*) FROM documents_fts WHERE documents_fts MATCH 'deletable'", [], |r| r.get(0)) + .unwrap(); + assert_eq!(after, 0, "FTS trigger should remove entry on DELETE"); +} + +#[test] +fn fts_null_title_handled() { + let conn = create_test_db(); + + // Discussion documents have NULL titles + conn.execute( + "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url) + VALUES (1, 'discussion', 1, 1, NULL, 'Discussion about API rate limiting strategies.', 'hash1', 'https://example.com/1')", + [], + ) + .unwrap(); + + let results = lore::search::search_fts(&conn, "rate limiting", 10, 
lore::search::FtsQueryMode::Safe).unwrap(); + assert!(!results.is_empty(), "Should find documents with NULL title"); +} diff --git a/tests/golden_query_tests.rs b/tests/golden_query_tests.rs new file mode 100644 index 0000000..afc4e56 --- /dev/null +++ b/tests/golden_query_tests.rs @@ -0,0 +1,279 @@ +//! Golden query test suite. +//! +//! Verifies end-to-end search quality with known-good expected results. +//! Uses a seeded SQLite DB with deterministic fixture data and no external +//! dependencies (no Ollama, no GitLab). + +#![allow(dead_code)] + +use rusqlite::Connection; +use serde::Deserialize; +use std::path::PathBuf; + +use lore::search::{self, FtsQueryMode, SearchFilters, SearchMode, search_fts, apply_filters}; + +/// A golden query test case. +#[derive(Debug, Deserialize)] +struct GoldenQuery { + query: String, + mode: String, + #[serde(default)] + filters: GoldenFilters, + expected_doc_ids: Vec<i64>, + min_results: usize, + max_rank: usize, + description: String, +} + +#[derive(Debug, Default, Deserialize)] +struct GoldenFilters { + source_type: Option<String>, + author: Option<String>, + project: Option<String>, + #[serde(default)] + labels: Vec<String>, +} + +fn load_golden_queries() -> Vec<GoldenQuery> { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures/golden_queries.json"); + let content = std::fs::read_to_string(&path) + .unwrap_or_else(|_| panic!("Failed to read golden queries fixture")); + serde_json::from_str(&content) + .unwrap_or_else(|e| panic!("Failed to parse golden queries: {}", e)) +} + +/// Create an in-memory database with FTS5 schema and seed deterministic fixture data. 
+fn create_seeded_db() -> Connection { + let conn = Connection::open_in_memory().unwrap(); + conn.pragma_update(None, "foreign_keys", "ON").unwrap(); + + // Apply migrations 001-008 (FTS5) + let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); + for version in 1..=8 { + let entries: Vec<_> = std::fs::read_dir(&migrations_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_string_lossy() + .starts_with(&format!("{:03}", version)) + }) + .collect(); + assert!(!entries.is_empty(), "Migration {} not found", version); + let sql = std::fs::read_to_string(entries[0].path()).unwrap(); + conn.execute_batch(&sql) + .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e)); + } + + // Seed project + conn.execute( + "INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url) + VALUES (1, 100, 'group/project', 'https://gitlab.example.com/group/project')", + [], + ) + .unwrap(); + + // Seed deterministic documents + let documents = vec![ + // id=1: Auth issue (matches: authentication, login, OAuth, JWT, token, refresh) + (1, "issue", "Authentication and login broken with OAuth", + "Users cannot login when using OAuth tokens. The JWT token refresh fails silently, \ + causing authentication errors. When the access token expires, the refresh flow returns \ + a 401 instead of fetching new credentials. Login page shows a generic error. \ + Multiple users reported authentication failures across all OAuth providers.", + "testuser"), + + // id=2: User profile MR (matches: user, profile, avatar, upload) + (2, "merge_request", "Add user profile page with avatar upload", + "This merge request adds a new user profile page. Users can now upload their avatar, \ + edit their display name, and manage notification preferences. 
The profile page includes \ + responsive design for mobile and desktop viewports.", + "developer1"), + + // id=3: Database migration issue (matches: database, migration, PostgreSQL, schema) + (3, "issue", "Database migration failing on PostgreSQL 14", + "The database migration script crashes on PostgreSQL 14 due to deprecated syntax. \ + The ALTER TABLE command uses a syntax removed in PG14. Migration 042 needs to be \ + rewritten to use the new schema modification syntax. All staging environments affected.", + "dba_admin"), + + // id=4: Performance MR (matches: performance, optimization, caching, query) + (4, "merge_request", "Performance optimization for dashboard queries", + "Optimized the dashboard query performance by adding database indexes and implementing \ + Redis caching for frequently accessed reports. Query execution time reduced from 3.2s \ + to 180ms. Added connection pooling and prepared statement caching.", + "senior_dev"), + + // id=5: API rate limiting discussion (matches: API, rate, limiting, throttle) + (5, "discussion", "API rate limiting strategies for public endpoints", + "Discussion about implementing API rate limiting on public-facing endpoints. \ + Proposed approaches: token bucket with sliding window, fixed window counters, \ + or leaky bucket algorithm. Rate limits should be configurable per API key tier. \ + Need to handle burst traffic during peak hours without throttling legitimate users.", + "architect"), + + // id=6: UI/CSS issue (matches: CSS, styling, frontend, responsive, UI) + (6, "issue", "CSS styling issues on mobile frontend", + "Multiple CSS styling problems on the mobile frontend. The navigation menu overlaps \ + content on screens smaller than 768px. Button text truncates on compact viewports. \ + Frontend responsive breakpoints need adjustment. 
The UI components library has \ + conflicting CSS specificity with the theme system.", + "frontend_dev"), + + // id=7: CI/CD MR (matches: CI, CD, pipeline, deployment, Docker) + (7, "merge_request", "Revamp CI/CD pipeline with Docker caching", + "Complete overhaul of the CI/CD pipeline. Added Docker layer caching to speed up \ + builds. Deployment stages now run in parallel where possible. Added rollback \ + support for failed deployments. Pipeline runtime reduced from 45min to 12min.", + "devops_lead"), + + // id=8: Security issue (matches: security, vulnerability, XSS, injection) + (8, "issue", "Security vulnerability in form submission", + "A cross-site scripting (XSS) vulnerability was found in the comment submission form. \ + User input is not properly sanitized before rendering. The security scanner also flagged \ + potential SQL injection in the search endpoint. Both vulnerabilities need immediate patching.", + "security_team"), + ]; + + for (id, source_type, title, content, author) in &documents { + conn.execute( + "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url, author_username) + VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://gitlab.example.com/group/project/-/' || ?2 || 's/' || ?1, ?5)", + rusqlite::params![id, source_type, title, content, author], + ) + .unwrap(); + } + + // Seed labels for filtered queries + conn.execute_batch( + "INSERT INTO document_labels (document_id, label_name) VALUES (1, 'bug'); + INSERT INTO document_labels (document_id, label_name) VALUES (1, 'authentication'); + INSERT INTO document_labels (document_id, label_name) VALUES (3, 'bug'); + INSERT INTO document_labels (document_id, label_name) VALUES (3, 'database'); + INSERT INTO document_labels (document_id, label_name) VALUES (6, 'bug'); + INSERT INTO document_labels (document_id, label_name) VALUES (6, 'frontend'); + INSERT INTO document_labels (document_id, label_name) VALUES (8, 'security'); + INSERT INTO 
document_labels (document_id, label_name) VALUES (8, 'critical');", + ) + .unwrap(); + + conn +} + +fn build_search_filters(golden: &GoldenFilters) -> SearchFilters { + let source_type = golden.source_type.as_deref().and_then(|s| match s { + "issue" => Some(lore::documents::SourceType::Issue), + "merge_request" => Some(lore::documents::SourceType::MergeRequest), + "discussion" => Some(lore::documents::SourceType::Discussion), + _ => None, + }); + + SearchFilters { + source_type, + author: golden.author.clone(), + labels: golden.labels.clone(), + limit: 100, + ..Default::default() + } +} + +#[test] +fn golden_queries_all_pass() { + let queries = load_golden_queries(); + let conn = create_seeded_db(); + + let mut failures: Vec<String> = Vec::new(); + + for (i, gq) in queries.iter().enumerate() { + let mode = SearchMode::parse(&gq.mode).unwrap_or(SearchMode::Lexical); + + // For lexical-only golden queries (no Ollama needed) + assert_eq!( + mode, + SearchMode::Lexical, + "Golden query {} uses non-lexical mode '{}' which requires Ollama — not supported in CI", + i, + gq.mode + ); + + // Run FTS search + let fts_results = search_fts(&conn, &gq.query, 50, FtsQueryMode::Safe).unwrap(); + let doc_ids: Vec<i64> = fts_results.iter().map(|r| r.document_id).collect(); + + // Apply filters if any + let filters = build_search_filters(&gq.filters); + let filtered_ids = if filters.has_any_filter() { + apply_filters(&conn, &doc_ids, &filters).unwrap() + } else { + doc_ids.clone() + }; + + // Check min_results + if filtered_ids.len() < gq.min_results { + failures.push(format!( + "FAIL [{}] \"{}\": expected >= {} results, got {} (description: {})", + i, gq.query, gq.min_results, filtered_ids.len(), gq.description + )); + continue; + } + + // Check each expected doc_id is in top max_rank + for expected_id in &gq.expected_doc_ids { + let position = filtered_ids.iter().position(|id| id == expected_id); + match position { + Some(pos) if pos < gq.max_rank => { + // Pass + } + Some(pos) => { + 
failures.push(format!( + "FAIL [{}] \"{}\": expected doc_id {} in top {}, found at rank {} (description: {})", + i, gq.query, expected_id, gq.max_rank, pos + 1, gq.description + )); + } + None => { + failures.push(format!( + "FAIL [{}] \"{}\": expected doc_id {} not found in results {:?} (description: {})", + i, gq.query, expected_id, filtered_ids, gq.description + )); + } + } + } + } + + if !failures.is_empty() { + panic!( + "Golden query failures ({}/{}):\n{}", + failures.len(), + queries.len(), + failures.join("\n") + ); + } +} + +#[test] +fn golden_queries_fixture_is_valid() { + let queries = load_golden_queries(); + assert!( + queries.len() >= 5, + "Golden queries fixture should have at least 5 queries, got {}", + queries.len() + ); + + for (i, gq) in queries.iter().enumerate() { + assert!(!gq.query.is_empty(), "Query {} has empty query string", i); + assert!( + !gq.expected_doc_ids.is_empty(), + "Query {} has no expected doc IDs", + i + ); + assert!(gq.min_results > 0, "Query {} has min_results=0", i); + assert!(gq.max_rank > 0, "Query {} has max_rank=0", i); + assert!( + SearchMode::parse(&gq.mode).is_some(), + "Query {} has invalid mode '{}'", + i, + gq.mode + ); + } +} diff --git a/tests/hybrid_search.rs b/tests/hybrid_search.rs new file mode 100644 index 0000000..5eaeedb --- /dev/null +++ b/tests/hybrid_search.rs @@ -0,0 +1,206 @@ +//! Integration tests for hybrid search combining FTS + vector. +//! +//! Tests all three search modes (lexical, semantic, hybrid) and +//! verifies graceful degradation when embeddings are unavailable. 
+ +use lore::core::db::create_connection; +use lore::search::{FtsQueryMode, SearchFilters, SearchMode, search_fts, search_hybrid}; +use rusqlite::Connection; +use std::path::PathBuf; +use tempfile::TempDir; + +fn create_test_db() -> (TempDir, Connection) { + let tmp = TempDir::new().unwrap(); + let db_path = tmp.path().join("test.db"); + let conn = create_connection(&db_path).unwrap(); + + let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); + + for version in 1..=9 { + let entries: Vec<_> = std::fs::read_dir(&migrations_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_string_lossy() + .starts_with(&format!("{:03}", version)) + }) + .collect(); + + assert!(!entries.is_empty(), "Migration {} not found", version); + let sql = std::fs::read_to_string(entries[0].path()).unwrap(); + conn.execute_batch(&sql) + .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e)); + } + + conn.execute( + "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')", + [], + ) + .unwrap(); + + (tmp, conn) +} + +fn insert_document(conn: &Connection, id: i64, source_type: &str, title: &str, content: &str) { + conn.execute( + "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url, author_username) + VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://example.com/' || ?1, 'testuser')", + rusqlite::params![id, source_type, title, content], + ) + .unwrap(); +} + + +#[test] +fn lexical_mode_uses_fts_only() { + let (_tmp, conn) = create_test_db(); + + insert_document(&conn, 1, "issue", "Authentication bug", "OAuth token refresh fails silently."); + insert_document(&conn, 2, "issue", "Database migration", "Migration script crashes on PostgreSQL."); + + let filters = SearchFilters { + limit: 10, + ..Default::default() + }; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (results, warnings) = rt + .block_on(search_hybrid( + 
&conn, + None, + "authentication", + SearchMode::Lexical, + &filters, + FtsQueryMode::Safe, + )) + .unwrap(); + + assert!(!results.is_empty(), "Lexical search should find results"); + assert_eq!(results[0].document_id, 1); + // Lexical mode should not produce Ollama-related warnings + assert!( + warnings.iter().all(|w| !w.contains("Ollama")), + "Lexical mode should not warn about Ollama" + ); +} + +#[test] +fn lexical_mode_no_embeddings_required() { + // Use in-memory DB without sqlite-vec for pure FTS + let conn = Connection::open_in_memory().unwrap(); + conn.pragma_update(None, "foreign_keys", "ON").unwrap(); + + let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); + // Only apply through migration 008 (FTS5, no embeddings) + for version in 1..=8 { + let entries: Vec<_> = std::fs::read_dir(&migrations_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_string_lossy() + .starts_with(&format!("{:03}", version)) + }) + .collect(); + let sql = std::fs::read_to_string(entries[0].path()).unwrap(); + conn.execute_batch(&sql).unwrap(); + } + + conn.execute( + "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')", + [], + ) + .unwrap(); + + conn.execute( + "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url) + VALUES (1, 'issue', 1, 1, 'Test issue', 'Content about testing and verification.', 'h1', 'https://example.com/1')", + [], + ) + .unwrap(); + + let results = search_fts(&conn, "testing", 10, FtsQueryMode::Safe).unwrap(); + assert!(!results.is_empty(), "FTS should work without embeddings tables"); +} + +#[test] +fn hybrid_mode_degrades_to_fts_without_client() { + let (_tmp, conn) = create_test_db(); + + insert_document(&conn, 1, "issue", "Performance issue", "Application is slow under load."); + + let filters = SearchFilters { + limit: 10, + ..Default::default() + }; + + let rt = 
tokio::runtime::Runtime::new().unwrap(); + let (results, warnings) = rt + .block_on(search_hybrid( + &conn, + None, // No Ollama client + "performance slow", + SearchMode::Hybrid, + &filters, + FtsQueryMode::Safe, + )) + .unwrap(); + + assert!(!results.is_empty(), "Should fall back to FTS results"); + // Should warn about missing Ollama client + assert!( + warnings.iter().any(|w| w.to_lowercase().contains("vector") || w.to_lowercase().contains("ollama") || w.to_lowercase().contains("client") || w.to_lowercase().contains("fallback") || w.to_lowercase().contains("fts")), + "Should produce a degradation warning, got: {:?}", + warnings + ); +} + +#[test] +fn rrf_ranking_combines_signals() { + use lore::search::rank_rrf; + + // Two documents with different rankings in each signal + let vector_results = vec![(1_i64, 0.1), (2, 0.5)]; // doc 1 closer + let fts_results = vec![(2_i64, -5.0), (1, -3.0)]; // doc 2 higher BM25 + + let rrf = rank_rrf(&vector_results, &fts_results); + + assert_eq!(rrf.len(), 2, "Should return both documents"); + // Both docs appear in both signals, so both get RRF scores + for r in &rrf { + assert!(r.rrf_score > 0.0, "RRF score should be positive"); + } +} + +#[test] +fn filters_by_source_type() { + let (_tmp, conn) = create_test_db(); + + insert_document(&conn, 1, "issue", "Bug report", "Authentication bug in login flow."); + insert_document(&conn, 2, "merge_request", "Fix auth", "Fixed authentication issue."); + + let filters = SearchFilters { + source_type: Some(lore::documents::SourceType::Issue), + limit: 10, + ..Default::default() + }; + + let all_ids = vec![1, 2]; + let filtered = lore::search::apply_filters(&conn, &all_ids, &filters).unwrap(); + + assert_eq!(filtered.len(), 1, "Filter should remove non-issue documents"); + assert_eq!(filtered[0], 1, "Only issue document should remain"); +} + +#[test] +fn search_mode_variants_exist() { + // Verify all enum variants compile and are distinct + let hybrid = SearchMode::Hybrid; + let lexical = 
SearchMode::Lexical; + let semantic = SearchMode::Semantic; + + assert_ne!(hybrid, lexical); + assert_ne!(hybrid, semantic); + assert_ne!(lexical, semantic); +}