diff --git a/tests/embedding.rs b/tests/embedding.rs
index eaf5bb3..c438f06 100644
--- a/tests/embedding.rs
+++ b/tests/embedding.rs
@@ -1,7 +1,7 @@
 //! Integration tests for embedding storage and vector search.
 //!
 //! These tests create an in-memory SQLite database with sqlite-vec loaded,
-//! apply all migrations through 009 (embeddings), and verify KNN search
+//! apply all migrations through 010 (chunk config), and verify KNN search
 //! and metadata operations.
 
 use lore::core::db::create_connection;
@@ -18,7 +18,7 @@ fn create_test_db() -> (TempDir, Connection) {
 
     let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
 
-    for version in 1..=9 {
+    for version in 1..=10 {
         let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
             .unwrap()
             .filter_map(|e| e.ok())
@@ -181,3 +181,122 @@ fn empty_database_returns_no_results() {
     let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();
     assert!(results.is_empty(), "Empty DB should return no results");
 }
+
+// --- Bug-fix regression tests ---
+
+#[test]
+fn overflow_doc_with_error_sentinel_not_re_detected_as_pending() {
+    // Bug 2: Documents skipped for chunk overflow must record a sentinel error
+    // in embedding_metadata so they are not re-detected as pending on subsequent
+    // pipeline runs (which would cause an infinite re-processing loop).
+    let (_tmp, conn) = create_test_db();
+
+    insert_document(&conn, 1, "Overflow doc", "Some content");
+
+    // Simulate what the pipeline does when a document exceeds CHUNK_ROWID_MULTIPLIER:
+    // it records an error sentinel at chunk_index=0.
+    let now = chrono::Utc::now().timestamp_millis();
+    conn.execute(
+        "INSERT INTO embedding_metadata
+         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
+          created_at, attempt_count, last_error, last_attempt_at, chunk_max_bytes)
+         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'overflow-sentinel', ?1, 1, 'Document produces too many chunks', ?1, ?2)",
+        rusqlite::params![now, lore::embedding::CHUNK_MAX_BYTES as i64],
+    )
+    .unwrap();
+
+    // Now find_pending_documents should NOT return this document
+    let pending = lore::embedding::find_pending_documents(&conn, 100, 0, "nomic-embed-text").unwrap();
+    assert!(
+        pending.is_empty(),
+        "Document with overflow error sentinel should not be re-detected as pending, got {} pending",
+        pending.len()
+    );
+
+    // count_pending_documents should also return 0
+    let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
+    assert_eq!(count, 0, "Count should be 0 for document with overflow sentinel");
+}
+
+#[test]
+fn count_and_find_pending_agree() {
+    // Bug 1: count_pending_documents and find_pending_documents must use
+    // logically equivalent WHERE clauses to produce consistent results.
+    let (_tmp, conn) = create_test_db();
+
+    // Case 1: No documents at all
+    let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
+    let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
+    assert_eq!(count as usize, found.len(), "Empty DB: count and find should agree");
+
+    // Case 2: New document (no metadata)
+    insert_document(&conn, 1, "New doc", "Content");
+    let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
+    let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
+    assert_eq!(count as usize, found.len(), "New doc: count and find should agree");
+    assert_eq!(count, 1);
+
+    // Case 3: Document with matching metadata (not pending)
+    let now = chrono::Utc::now().timestamp_millis();
+    conn.execute(
+        "INSERT INTO embedding_metadata
+         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
+          created_at, attempt_count, chunk_max_bytes)
+         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'ch', ?1, 1, ?2)",
+        rusqlite::params![now, lore::embedding::CHUNK_MAX_BYTES as i64],
+    )
+    .unwrap();
+    let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
+    let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
+    assert_eq!(count as usize, found.len(), "Complete doc: count and find should agree");
+    assert_eq!(count, 0);
+
+    // Case 4: Config drift (chunk_max_bytes mismatch)
+    conn.execute(
+        "UPDATE embedding_metadata SET chunk_max_bytes = 999 WHERE document_id = 1",
+        [],
+    )
+    .unwrap();
+    let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
+    let found = lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
+    assert_eq!(count as usize, found.len(), "Config drift: count and find should agree");
+    assert_eq!(count, 1);
+}
+
+#[test]
+fn full_embed_delete_is_atomic() {
+    // Bug 7: The --full flag's two DELETE statements should be atomic.
+    // This test verifies that both tables are cleared together.
+    let (_tmp, conn) = create_test_db();
+
+    insert_document(&conn, 1, "Doc", "Content");
+    insert_embedding(&conn, 1, 0, &axis_vector(0));
+
+    // Verify data exists
+    let meta_count: i64 = conn
+        .query_row("SELECT COUNT(*) FROM embedding_metadata", [], |r| r.get(0))
+        .unwrap();
+    let embed_count: i64 = conn
+        .query_row("SELECT COUNT(*) FROM embeddings", [], |r| r.get(0))
+        .unwrap();
+    assert_eq!(meta_count, 1);
+    assert_eq!(embed_count, 1);
+
+    // Execute the atomic delete (same as embed.rs --full)
+    conn.execute_batch(
+        "BEGIN;
+         DELETE FROM embedding_metadata;
+         DELETE FROM embeddings;
+         COMMIT;",
+    )
+    .unwrap();
+
+    let meta_count: i64 = conn
+        .query_row("SELECT COUNT(*) FROM embedding_metadata", [], |r| r.get(0))
+        .unwrap();
+    let embed_count: i64 = conn
+        .query_row("SELECT COUNT(*) FROM embeddings", [], |r| r.get(0))
+        .unwrap();
+    assert_eq!(meta_count, 0, "Metadata should be cleared");
+    assert_eq!(embed_count, 0, "Embeddings should be cleared");
+}