test(embedding): Add regression tests for pipeline hardening bugs
Three targeted regression tests covering bugs fixed in the embedding pipeline hardening: - overflow_doc_with_error_sentinel_not_re_detected_as_pending: verifies that documents skipped for producing too many chunks have their sentinel error recorded in embedding_metadata and are NOT returned by find_pending_documents or count_pending_documents on subsequent runs (prevents infinite re-processing loop). - count_and_find_pending_agree: exercises four states (empty DB, new document, fully-embedded document, config-drifted document) and asserts that count_pending_documents and find_pending_documents produce consistent results across all of them. - full_embed_delete_is_atomic: confirms the --full flag's two DELETE statements (embedding_metadata + embeddings) execute atomically within a transaction. Also updates test DB creation to apply migration 010. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
//! Integration tests for embedding storage and vector search.
|
||||
//!
|
||||
//! These tests create an in-memory SQLite database with sqlite-vec loaded,
|
||||
//! apply all migrations through 009 (embeddings), and verify KNN search
|
||||
//! apply all migrations through 010 (chunk config), and verify KNN search
|
||||
//! and metadata operations.
|
||||
|
||||
use lore::core::db::create_connection;
|
||||
@@ -18,7 +18,7 @@ fn create_test_db() -> (TempDir, Connection) {
|
||||
|
||||
let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
|
||||
|
||||
for version in 1..=9 {
|
||||
for version in 1..=10 {
|
||||
let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
@@ -181,3 +181,122 @@ fn empty_database_returns_no_results() {
|
||||
let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();
|
||||
assert!(results.is_empty(), "Empty DB should return no results");
|
||||
}
|
||||
|
||||
// --- Bug-fix regression tests ---
|
||||
|
||||
#[test]
fn overflow_doc_with_error_sentinel_not_re_detected_as_pending() {
    // Bug 2: a document skipped because it would produce too many chunks
    // must leave a sentinel error row in embedding_metadata; otherwise the
    // pipeline would re-detect it as pending on every run and loop forever.
    let (_tmp, conn) = create_test_db();
    insert_document(&conn, 1, "Overflow doc", "Some content");

    // Mimic what the pipeline records when a document exceeds
    // CHUNK_ROWID_MULTIPLIER: one sentinel row at chunk_index = 0 carrying
    // the error text.
    let ts = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, last_error, last_attempt_at, chunk_max_bytes)
         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'overflow-sentinel', ?1, 1, 'Document produces too many chunks', ?1, ?2)",
        rusqlite::params![ts, lore::embedding::CHUNK_MAX_BYTES as i64],
    )
    .unwrap();

    // The sentinel must hide the document from find_pending_documents...
    let still_pending =
        lore::embedding::find_pending_documents(&conn, 100, 0, "nomic-embed-text").unwrap();
    assert!(
        still_pending.is_empty(),
        "Document with overflow error sentinel should not be re-detected as pending, got {} pending",
        still_pending.len()
    );

    // ...and from count_pending_documents as well.
    let pending_count =
        lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
    assert_eq!(pending_count, 0, "Count should be 0 for document with overflow sentinel");
}
|
||||
|
||||
#[test]
fn count_and_find_pending_agree() {
    // Bug 1: count_pending_documents and find_pending_documents must use
    // logically equivalent WHERE clauses to produce consistent results.
    // Exercised states: empty DB, brand-new document, fully-embedded
    // document, and a document whose chunk config has drifted.
    let (_tmp, conn) = create_test_db();

    // Helper: assert both APIs agree for the current DB state, then hand the
    // count back so each case can also pin its exact expected value.
    let assert_agree = |label: &str| {
        let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
        let found =
            lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
        assert_eq!(count as usize, found.len(), "{}", label);
        count
    };

    // Case 1: No documents at all
    assert_agree("Empty DB: count and find should agree");

    // Case 2: New document (no metadata)
    insert_document(&conn, 1, "New doc", "Content");
    let pending = assert_agree("New doc: count and find should agree");
    assert_eq!(pending, 1);

    // Case 3: Document with matching metadata (not pending)
    let now = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, chunk_max_bytes)
         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'ch', ?1, 1, ?2)",
        rusqlite::params![now, lore::embedding::CHUNK_MAX_BYTES as i64],
    )
    .unwrap();
    let pending = assert_agree("Complete doc: count and find should agree");
    assert_eq!(pending, 0);

    // Case 4: Config drift (chunk_max_bytes mismatch)
    conn.execute(
        "UPDATE embedding_metadata SET chunk_max_bytes = 999 WHERE document_id = 1",
        [],
    )
    .unwrap();
    let pending = assert_agree("Config drift: count and find should agree");
    assert_eq!(pending, 1);
}
|
||||
|
||||
#[test]
fn full_embed_delete_is_atomic() {
    // Bug 7: `embed --full` clears embedding_metadata and embeddings with two
    // DELETE statements; they must run inside one transaction so the tables
    // are never observable half-cleared.
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Doc", "Content");
    insert_embedding(&conn, 1, 0, &axis_vector(0));

    // Helper: row count for a table (names are test-controlled literals).
    let rows_in = |table: &str| -> i64 {
        conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |r| r.get(0))
            .unwrap()
    };

    // Precondition: one row in each table before the wipe.
    assert_eq!(rows_in("embedding_metadata"), 1);
    assert_eq!(rows_in("embeddings"), 1);

    // Same statement batch embed.rs runs for --full.
    conn.execute_batch(
        "BEGIN;
         DELETE FROM embedding_metadata;
         DELETE FROM embeddings;
         COMMIT;",
    )
    .unwrap();

    assert_eq!(rows_in("embedding_metadata"), 0, "Metadata should be cleared");
    assert_eq!(rows_in("embeddings"), 0, "Embeddings should be cleared");
}
|
||||
|
||||
Reference in New Issue
Block a user