Applies the same doc comment cleanup to test files: - Removes test module headers (//! lines) - Removes obvious test function comments - Retains comments explaining non-obvious test scenarios Test names should be descriptive enough to convey intent without additional comments. Complex test setup or assertions that need explanation retain their comments. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
315 lines
9.8 KiB
Rust
315 lines
9.8 KiB
Rust
use lore::core::db::create_connection;
|
|
use rusqlite::Connection;
|
|
use std::path::PathBuf;
|
|
use tempfile::TempDir;
|
|
|
|
fn create_test_db() -> (TempDir, Connection) {
|
|
let tmp = TempDir::new().unwrap();
|
|
let db_path = tmp.path().join("test.db");
|
|
let conn = create_connection(&db_path).unwrap();
|
|
|
|
let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
|
|
|
|
for version in 1..=10 {
|
|
let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
|
|
.unwrap()
|
|
.filter_map(|e| e.ok())
|
|
.filter(|e| {
|
|
e.file_name()
|
|
.to_string_lossy()
|
|
.starts_with(&format!("{:03}", version))
|
|
})
|
|
.collect();
|
|
|
|
assert!(!entries.is_empty(), "Migration {} not found", version);
|
|
let sql = std::fs::read_to_string(entries[0].path()).unwrap();
|
|
conn.execute_batch(&sql)
|
|
.unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
|
|
}
|
|
|
|
conn.execute(
|
|
"INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
|
|
[],
|
|
)
|
|
.unwrap();
|
|
|
|
(tmp, conn)
|
|
}
|
|
|
|
fn insert_document(conn: &Connection, id: i64, title: &str, content: &str) {
|
|
conn.execute(
|
|
"INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
|
|
VALUES (?1, 'issue', ?1, 1, ?2, ?3, 'hash_' || ?1, 'https://example.com/' || ?1)",
|
|
rusqlite::params![id, title, content],
|
|
)
|
|
.unwrap();
|
|
}
|
|
|
|
/// Returns a 768-dimensional one-hot vector: 1.0 at `dim`, 0.0 everywhere
/// else. The width matches the `dims = 768` declared in embedding_metadata
/// rows elsewhere in this file.
///
/// # Panics
/// Panics with a descriptive message if `dim >= 768` (previously this was an
/// opaque slice index-out-of-bounds panic).
fn axis_vector(dim: usize) -> Vec<f32> {
    assert!(dim < 768, "axis_vector: dim {} out of range (max 767)", dim);
    let mut v = vec![0.0f32; 768];
    v[dim] = 1.0;
    v
}
|
|
|
|
fn insert_embedding(conn: &Connection, doc_id: i64, chunk_index: i64, embedding: &[f32]) {
|
|
let rowid = doc_id * 1000 + chunk_index;
|
|
let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect();
|
|
|
|
conn.execute(
|
|
"INSERT INTO embeddings (rowid, embedding) VALUES (?1, ?2)",
|
|
rusqlite::params![rowid, embedding_bytes],
|
|
)
|
|
.unwrap();
|
|
|
|
let now = chrono::Utc::now().timestamp_millis();
|
|
conn.execute(
|
|
"INSERT INTO embedding_metadata
|
|
(document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count)
|
|
VALUES (?1, ?2, 'nomic-embed-text', 768, 'hash_' || ?1, 'chunk_hash', ?3, 1)",
|
|
rusqlite::params![doc_id, chunk_index, now],
|
|
)
|
|
.unwrap();
|
|
}
|
|
|
|
#[test]
fn knn_search_returns_nearest_neighbors() {
    let (_tmp, conn) = create_test_db();

    // Three documents embedded on orthogonal axes so the distance ranking
    // is unambiguous.
    for (id, title, body) in [
        (1, "Doc A", "Content about authentication."),
        (2, "Doc B", "Content about database optimization."),
        (3, "Doc C", "Content about logging infrastructure."),
    ] {
        insert_document(&conn, id, title, body);
        insert_embedding(&conn, id, 0, &axis_vector((id - 1) as usize));
    }

    // Query vector points mostly along axis 0, so doc 1 must rank first.
    let mut probe = vec![0.0f32; 768];
    probe[0] = 0.9;
    probe[1] = 0.1;

    let hits = lore::search::search_vector(&conn, &probe, 10).unwrap();

    assert!(!hits.is_empty(), "Should return at least one result");
    assert_eq!(hits[0].document_id, 1, "Nearest neighbor should be doc 1");
}
|
|
|
|
#[test]
fn knn_search_respects_limit() {
    let (_tmp, conn) = create_test_db();

    // Ten candidate documents, each embedded on its own axis.
    for i in 1..=10 {
        insert_document(&conn, i, &format!("Doc {}", i), "Some content.");
        insert_embedding(&conn, i, 0, &axis_vector(i as usize));
    }

    // With limit 3 and ten candidates available, at most 3 may come back.
    let hits = lore::search::search_vector(&conn, &axis_vector(0), 3).unwrap();
    assert!(hits.len() <= 3, "Results should be capped at limit");
}
|
|
|
|
#[test]
fn knn_search_deduplicates_chunks() {
    let (_tmp, conn) = create_test_db();

    insert_document(
        &conn,
        1,
        "Multi-chunk doc",
        "Very long content that was chunked.",
    );

    // Two chunks of the SAME document with nearly identical vectors — both
    // would land in the top-k if results were not collapsed per document.
    let mut chunk_a = vec![0.0f32; 768];
    chunk_a[0] = 1.0;
    let mut chunk_b = vec![0.0f32; 768];
    chunk_b[0] = 0.95;
    chunk_b[1] = 0.05;

    insert_embedding(&conn, 1, 0, &chunk_a);
    insert_embedding(&conn, 1, 1, &chunk_b);

    let hits = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();

    // Collapsing document ids into a set must not shrink the result list.
    let distinct_docs: std::collections::HashSet<i64> =
        hits.iter().map(|r| r.document_id).collect();
    assert_eq!(
        distinct_docs.len(),
        hits.len(),
        "Each document should appear at most once in results"
    );
}
|
|
|
|
#[test]
fn orphan_trigger_deletes_embeddings_on_document_delete() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Will be deleted", "Content.");
    insert_embedding(&conn, 1, 0, &axis_vector(0));

    // rowid 1000 == (doc 1, chunk 0) under the rowid packing scheme.
    let embedding_rows = || -> i64 {
        conn.query_row(
            "SELECT COUNT(*) FROM embeddings WHERE rowid = 1000",
            [],
            |r| r.get(0),
        )
        .unwrap()
    };

    assert_eq!(embedding_rows(), 1, "Embedding should exist before delete");

    conn.execute("DELETE FROM documents WHERE id = 1", [])
        .unwrap();

    assert_eq!(
        embedding_rows(),
        0,
        "Trigger should delete embeddings when document is deleted"
    );

    // The metadata row keyed by document_id must be gone as well.
    let meta_count: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM embedding_metadata WHERE document_id = 1",
            [],
            |r| r.get(0),
        )
        .unwrap();
    assert_eq!(meta_count, 0, "Metadata should be cascade-deleted");
}
|
|
|
|
#[test]
fn empty_database_returns_no_results() {
    // Fresh database with no documents or embeddings inserted at all.
    let (_tmp, conn) = create_test_db();

    let hits = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();
    assert!(hits.is_empty(), "Empty DB should return no results");
}
|
|
|
|
#[test]
fn overflow_doc_with_error_sentinel_not_re_detected_as_pending() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Overflow doc", "Some content");

    // Simulate the embedder having given up on this document: a metadata row
    // whose chunk_hash is the overflow sentinel and whose last_error is set.
    let stamp = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, last_error, last_attempt_at, chunk_max_bytes)
         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'overflow-sentinel', ?1, 1, 'Document produces too many chunks', ?1, ?2)",
        rusqlite::params![stamp, lore::embedding::CHUNK_MAX_BYTES as i64],
    )
    .unwrap();

    // The sentinel row must satisfy the pending detection, not retrigger it.
    let pending =
        lore::embedding::find_pending_documents(&conn, 100, 0, "nomic-embed-text").unwrap();
    assert!(
        pending.is_empty(),
        "Document with overflow error sentinel should not be re-detected as pending, got {} pending",
        pending.len()
    );

    let pending_count =
        lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
    assert_eq!(
        pending_count, 0,
        "Count should be 0 for document with overflow sentinel"
    );
}
|
|
|
|
/// Verifies that `count_pending_documents` and `find_pending_documents` stay
/// in agreement through four database states: empty, a brand-new document, a
/// fully-embedded document, and a document whose chunking config has drifted.
#[test]
fn count_and_find_pending_agree() {
    let (_tmp, conn) = create_test_db();

    // Asserts count and find agree at the current DB state (labelled for the
    // failure message) and returns the agreed count for a per-stage check.
    // Extracted because the original repeated this triple four times.
    let pending_agreeing = |stage: &str| {
        let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
        let found =
            lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
        assert_eq!(
            count as usize,
            found.len(),
            "{}: count and find should agree",
            stage
        );
        count
    };

    // Stage 1: nothing in the DB.
    pending_agreeing("Empty DB");

    // Stage 2: a document with no embedding metadata is pending.
    insert_document(&conn, 1, "New doc", "Content");
    assert_eq!(pending_agreeing("New doc"), 1);

    // Stage 3: a complete metadata row (no error, current chunk_max_bytes)
    // marks the document as done.
    let now = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, chunk_max_bytes)
         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'ch', ?1, 1, ?2)",
        rusqlite::params![now, lore::embedding::CHUNK_MAX_BYTES as i64],
    )
    .unwrap();
    assert_eq!(pending_agreeing("Complete doc"), 0);

    // Stage 4: a stale chunk_max_bytes (config drift) makes the document
    // pending again.
    conn.execute(
        "UPDATE embedding_metadata SET chunk_max_bytes = 999 WHERE document_id = 1",
        [],
    )
    .unwrap();
    assert_eq!(pending_agreeing("Config drift"), 1);
}
|
|
|
|
#[test]
fn full_embed_delete_is_atomic() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Doc", "Content");
    insert_embedding(&conn, 1, 0, &axis_vector(0));

    // Small helper: run a COUNT(*) query and return the single i64.
    let count_rows = |sql: &str| -> i64 { conn.query_row(sql, [], |r| r.get(0)).unwrap() };

    // Both tables populated before the wipe.
    assert_eq!(count_rows("SELECT COUNT(*) FROM embedding_metadata"), 1);
    assert_eq!(count_rows("SELECT COUNT(*) FROM embeddings"), 1);

    // Clear both tables inside a single explicit transaction.
    conn.execute_batch(
        "BEGIN;
         DELETE FROM embedding_metadata;
         DELETE FROM embeddings;
         COMMIT;",
    )
    .unwrap();

    assert_eq!(
        count_rows("SELECT COUNT(*) FROM embedding_metadata"),
        0,
        "Metadata should be cleared"
    );
    assert_eq!(
        count_rows("SELECT COUNT(*) FROM embeddings"),
        0,
        "Embeddings should be cleared"
    );
}
|