//! Integration tests for embedding storage and vector search. //! //! These tests create an in-memory SQLite database with sqlite-vec loaded, //! apply all migrations through 009 (embeddings), and verify KNN search //! and metadata operations. use lore::core::db::create_connection; use rusqlite::Connection; use std::path::PathBuf; use tempfile::TempDir; /// Create a test DB on disk (required for sqlite-vec which needs the extension loaded). /// Uses create_connection to get the sqlite-vec extension registered. fn create_test_db() -> (TempDir, Connection) { let tmp = TempDir::new().unwrap(); let db_path = tmp.path().join("test.db"); let conn = create_connection(&db_path).unwrap(); let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); for version in 1..=9 { let entries: Vec<_> = std::fs::read_dir(&migrations_dir) .unwrap() .filter_map(|e| e.ok()) .filter(|e| { e.file_name() .to_string_lossy() .starts_with(&format!("{:03}", version)) }) .collect(); assert!(!entries.is_empty(), "Migration {} not found", version); let sql = std::fs::read_to_string(entries[0].path()).unwrap(); conn.execute_batch(&sql) .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e)); } // Seed a project conn.execute( "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')", [], ) .unwrap(); (tmp, conn) } fn insert_document(conn: &Connection, id: i64, title: &str, content: &str) { conn.execute( "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url) VALUES (?1, 'issue', ?1, 1, ?2, ?3, 'hash_' || ?1, 'https://example.com/' || ?1)", rusqlite::params![id, title, content], ) .unwrap(); } /// Create a 768-dim vector with a specific dimension set to 1.0 (unit vector along axis). fn axis_vector(dim: usize) -> Vec { let mut v = vec![0.0f32; 768]; v[dim] = 1.0; v } fn insert_embedding(conn: &Connection, doc_id: i64, chunk_index: i64, embedding: &[f32]) { let rowid = doc_id * 1000 + chunk_index; let embedding_bytes: Vec = embedding.iter().flat_map(|f| f.to_le_bytes()).collect(); conn.execute( "INSERT INTO embeddings (rowid, embedding) VALUES (?1, ?2)", rusqlite::params![rowid, embedding_bytes], ) .unwrap(); let now = chrono::Utc::now().timestamp_millis(); conn.execute( "INSERT INTO embedding_metadata (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count) VALUES (?1, ?2, 'nomic-embed-text', 768, 'hash_' || ?1, 'chunk_hash', ?3, 1)", rusqlite::params![doc_id, chunk_index, now], ) .unwrap(); } #[test] fn knn_search_returns_nearest_neighbors() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Doc A", "Content about authentication."); insert_document(&conn, 2, "Doc B", "Content about database optimization."); insert_document(&conn, 3, "Doc C", "Content about logging infrastructure."); // Doc 1: axis 0, Doc 2: axis 1, Doc 3: axis 2 insert_embedding(&conn, 1, 0, &axis_vector(0)); insert_embedding(&conn, 2, 0, &axis_vector(1)); insert_embedding(&conn, 3, 0, &axis_vector(2)); // Query vector close to axis 0 (should match doc 1) let mut query = vec![0.0f32; 768]; query[0] = 0.9; query[1] = 0.1; let results = lore::search::search_vector(&conn, &query, 10).unwrap(); assert!(!results.is_empty(), "Should return at least one result"); assert_eq!(results[0].document_id, 1, "Nearest neighbor should be doc 1"); } #[test] fn knn_search_respects_limit() { let (_tmp, conn) = create_test_db(); for i in 1..=10 { insert_document(&conn, i, &format!("Doc {}", i), "Some content."); insert_embedding(&conn, i, 0, &axis_vector(i as usize)); } let results = lore::search::search_vector(&conn, &axis_vector(0), 3).unwrap(); assert!(results.len() <= 3, "Results should be capped at limit"); } #[test] fn knn_search_deduplicates_chunks() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Multi-chunk doc", "Very long content that was chunked."); // Same document, two chunks, both similar to query let mut v1 = vec![0.0f32; 768]; v1[0] = 1.0; let mut v2 = vec![0.0f32; 768]; v2[0] = 0.95; v2[1] = 0.05; insert_embedding(&conn, 1, 0, &v1); insert_embedding(&conn, 1, 1, &v2); let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap(); // Should deduplicate: same document_id appears at most once let unique_docs: std::collections::HashSet = results.iter().map(|r| r.document_id).collect(); assert_eq!( unique_docs.len(), results.len(), "Each document should appear at most once in results" ); } #[test] fn orphan_trigger_deletes_embeddings_on_document_delete() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Will be deleted", "Content."); insert_embedding(&conn, 1, 0, &axis_vector(0)); // Verify embedding exists let count: i64 = conn .query_row("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0)) .unwrap(); assert_eq!(count, 1, "Embedding should exist before delete"); // Delete the document conn.execute("DELETE FROM documents WHERE id = 1", []).unwrap(); // Verify embedding was cascade-deleted via trigger let count: i64 = conn .query_row("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0)) .unwrap(); assert_eq!(count, 0, "Trigger should delete embeddings when document is deleted"); // Verify metadata was cascade-deleted via FK let meta_count: i64 = conn .query_row("SELECT COUNT(*) FROM embedding_metadata WHERE document_id = 1", [], |r| r.get(0)) .unwrap(); assert_eq!(meta_count, 0, "Metadata should be cascade-deleted"); } #[test] fn empty_database_returns_no_results() { let (_tmp, conn) = create_test_db(); let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap(); assert!(results.is_empty(), "Empty DB should return no results"); }