use lore::core::db::create_connection; use rusqlite::Connection; use std::path::PathBuf; use tempfile::TempDir; fn create_test_db() -> (TempDir, Connection) { let tmp = TempDir::new().unwrap(); let db_path = tmp.path().join("test.db"); let conn = create_connection(&db_path).unwrap(); let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations"); for version in 1..=10 { let entries: Vec<_> = std::fs::read_dir(&migrations_dir) .unwrap() .filter_map(|e| e.ok()) .filter(|e| { e.file_name() .to_string_lossy() .starts_with(&format!("{:03}", version)) }) .collect(); assert!(!entries.is_empty(), "Migration {} not found", version); let sql = std::fs::read_to_string(entries[0].path()).unwrap(); conn.execute_batch(&sql) .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e)); } conn.execute( "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')", [], ) .unwrap(); (tmp, conn) } fn insert_document(conn: &Connection, id: i64, title: &str, content: &str) { conn.execute( "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url) VALUES (?1, 'issue', ?1, 1, ?2, ?3, 'hash_' || ?1, 'https://example.com/' || ?1)", rusqlite::params![id, title, content], ) .unwrap(); } fn axis_vector(dim: usize) -> Vec { let mut v = vec![0.0f32; 768]; v[dim] = 1.0; v } fn insert_embedding(conn: &Connection, doc_id: i64, chunk_index: i64, embedding: &[f32]) { let rowid = doc_id * 1000 + chunk_index; let embedding_bytes: Vec = embedding.iter().flat_map(|f| f.to_le_bytes()).collect(); conn.execute( "INSERT INTO embeddings (rowid, embedding) VALUES (?1, ?2)", rusqlite::params![rowid, embedding_bytes], ) .unwrap(); let now = chrono::Utc::now().timestamp_millis(); conn.execute( "INSERT INTO embedding_metadata (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count) VALUES (?1, ?2, 'nomic-embed-text', 768, 'hash_' || ?1, 'chunk_hash', ?3, 1)", rusqlite::params![doc_id, chunk_index, now], ) .unwrap(); } #[test] fn knn_search_returns_nearest_neighbors() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Doc A", "Content about authentication."); insert_document(&conn, 2, "Doc B", "Content about database optimization."); insert_document(&conn, 3, "Doc C", "Content about logging infrastructure."); insert_embedding(&conn, 1, 0, &axis_vector(0)); insert_embedding(&conn, 2, 0, &axis_vector(1)); insert_embedding(&conn, 3, 0, &axis_vector(2)); let mut query = vec![0.0f32; 768]; query[0] = 0.9; query[1] = 0.1; let results = lore::search::search_vector(&conn, &query, 10).unwrap(); assert!(!results.is_empty(), "Should return at least one result"); assert_eq!( results[0].document_id, 1, "Nearest neighbor should be doc 1" ); } #[test] fn knn_search_respects_limit() { let (_tmp, conn) = create_test_db(); for i in 1..=10 { insert_document(&conn, i, &format!("Doc {}", i), "Some content."); insert_embedding(&conn, i, 0, &axis_vector(i as usize)); } let results = lore::search::search_vector(&conn, &axis_vector(0), 3).unwrap(); assert!(results.len() <= 3, "Results should be capped at limit"); } #[test] fn knn_search_deduplicates_chunks() { let (_tmp, conn) = create_test_db(); insert_document( &conn, 1, "Multi-chunk doc", "Very long content that was chunked.", ); let mut v1 = vec![0.0f32; 768]; v1[0] = 1.0; let mut v2 = vec![0.0f32; 768]; v2[0] = 0.95; v2[1] = 0.05; insert_embedding(&conn, 1, 0, &v1); insert_embedding(&conn, 1, 1, &v2); let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap(); let unique_docs: std::collections::HashSet = results.iter().map(|r| r.document_id).collect(); assert_eq!( unique_docs.len(), results.len(), "Each document should appear at most once in results" ); } #[test] fn orphan_trigger_deletes_embeddings_on_document_delete() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Will be deleted", "Content."); insert_embedding(&conn, 1, 0, &axis_vector(0)); let count: i64 = conn .query_row( "SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0), ) .unwrap(); assert_eq!(count, 1, "Embedding should exist before delete"); conn.execute("DELETE FROM documents WHERE id = 1", []) .unwrap(); let count: i64 = conn .query_row( "SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0), ) .unwrap(); assert_eq!( count, 0, "Trigger should delete embeddings when document is deleted" ); let meta_count: i64 = conn .query_row( "SELECT COUNT(*) FROM embedding_metadata WHERE document_id = 1", [], |r| r.get(0), ) .unwrap(); assert_eq!(meta_count, 0, "Metadata should be cascade-deleted"); } #[test] fn empty_database_returns_no_results() { let (_tmp, conn) = create_test_db(); let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap(); assert!(results.is_empty(), "Empty DB should return no results"); } #[test] fn overflow_doc_with_error_sentinel_not_re_detected_as_pending() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Overflow doc", "Some content"); let now = chrono::Utc::now().timestamp_millis(); conn.execute( "INSERT INTO embedding_metadata (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count, last_error, last_attempt_at, chunk_max_bytes) VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'overflow-sentinel', ?1, 1, 'Document produces too many chunks', ?1, ?2)", rusqlite::params![now, lore::embedding::CHUNK_MAX_BYTES as i64], ) .unwrap(); let pending = lore::embedding::find_pending_documents(&conn, 100, 0, "nomic-embed-text").unwrap(); assert!( pending.is_empty(), "Document with overflow error sentinel should not be re-detected as pending, got {} pending", pending.len() ); let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap(); assert_eq!( count, 0, "Count should be 0 for document with overflow sentinel" ); } #[test] fn count_and_find_pending_agree() { let (_tmp, conn) = create_test_db(); let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap(); let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap(); assert_eq!( count as usize, found.len(), "Empty DB: count and find should agree" ); insert_document(&conn, 1, "New doc", "Content"); let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap(); let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap(); assert_eq!( count as usize, found.len(), "New doc: count and find should agree" ); assert_eq!(count, 1); let now = chrono::Utc::now().timestamp_millis(); conn.execute( "INSERT INTO embedding_metadata (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count, chunk_max_bytes) VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'ch', ?1, 1, ?2)", rusqlite::params![now, lore::embedding::CHUNK_MAX_BYTES as i64], ) .unwrap(); let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap(); let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap(); assert_eq!( count as usize, found.len(), "Complete doc: count and find should agree" ); assert_eq!(count, 0); conn.execute( "UPDATE embedding_metadata SET chunk_max_bytes = 999 WHERE document_id = 1", [], ) .unwrap(); let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap(); let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap(); assert_eq!( count as usize, found.len(), "Config drift: count and find should agree" ); assert_eq!(count, 1); } #[test] fn full_embed_delete_is_atomic() { let (_tmp, conn) = create_test_db(); insert_document(&conn, 1, "Doc", "Content"); insert_embedding(&conn, 1, 0, &axis_vector(0)); let meta_count: i64 = conn .query_row("SELECT COUNT(*) FROM embedding_metadata", [], |r| r.get(0)) .unwrap(); let embed_count: i64 = conn .query_row("SELECT COUNT(*) FROM embeddings", [], |r| r.get(0)) .unwrap(); assert_eq!(meta_count, 1); assert_eq!(embed_count, 1); conn.execute_batch( "BEGIN; DELETE FROM embedding_metadata; DELETE FROM embeddings; COMMIT;", ) .unwrap(); let meta_count: i64 = conn .query_row("SELECT COUNT(*) FROM embedding_metadata", [], |r| r.get(0)) .unwrap(); let embed_count: i64 = conn .query_row("SELECT COUNT(*) FROM embeddings", [], |r| r.get(0)) .unwrap(); assert_eq!(meta_count, 0, "Metadata should be cleared"); assert_eq!(embed_count, 0, "Embeddings should be cleared"); }