// gitlore/tests/embedding.rs

//! Integration tests for embedding storage and vector search.
//!
//! These tests create a temporary on-disk SQLite database with sqlite-vec loaded,
//! apply all migrations through 010 (chunk config), and verify KNN search
//! and metadata operations.

use lore::core::db::create_connection;
use rusqlite::Connection;
use std::path::PathBuf;
use tempfile::TempDir;

/// Build a fully migrated test database stored in a fresh temporary directory.
///
/// The database lives on disk and is opened through `create_connection` so that
/// the sqlite-vec extension is registered on the connection. Migrations `001`
/// through `010` are read from `<CARGO_MANIFEST_DIR>/migrations` and applied in
/// ascending order, then a single project row (id 1) is seeded so that inserted
/// documents have a project to reference.
///
/// Returns the `TempDir` together with the connection. Callers should keep the
/// `TempDir` bound while the connection is in use, because `TempDir` removes
/// its directory when dropped.
///
/// # Panics
/// Panics if the temp directory or connection cannot be created, if a migration
/// file for any version is missing or unreadable, or if any SQL statement fails.
fn create_test_db() -> (TempDir, Connection) {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let conn = create_connection(&db_path).unwrap();

    let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");

    // List the directory once and sort it. `read_dir` yields entries in a
    // platform-dependent order, so sorting makes the file picked for each
    // version deterministic even if two files share a numeric prefix.
    let mut migration_files: Vec<_> = std::fs::read_dir(&migrations_dir)
        .unwrap()
        .filter_map(|entry| entry.ok())
        .collect();
    migration_files.sort_by_key(|entry| entry.file_name());

    for version in 1..=10 {
        let prefix = format!("{:03}", version);
        let migration = migration_files
            .iter()
            .find(|entry| entry.file_name().to_string_lossy().starts_with(&prefix))
            .unwrap_or_else(|| panic!("Migration {} not found", version));

        let sql = std::fs::read_to_string(migration.path()).unwrap();
        conn.execute_batch(&sql)
            .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
    }

    // Seed the single project that test documents are attached to.
    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
        [],
    )
    .unwrap();

    (tmp, conn)
}

/// Insert one `issue` document row for project 1.
///
/// `id` is used for both the document id and `source_id`, and is appended to
/// the fixed prefixes that form `content_hash` (`hash_<id>`) and `url`
/// (`https://example.com/<id>`).
///
/// # Panics
/// Panics if the insert fails.
fn insert_document(conn: &Connection, id: i64, title: &str, content: &str) {
    let sql = "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
         VALUES (?1, 'issue', ?1, 1, ?2, ?3, 'hash_' || ?1, 'https://example.com/' || ?1)";
    let outcome = conn.execute(sql, rusqlite::params![id, title, content]);
    outcome.unwrap();
}

/// Build a 768-element vector that is all zeros except for a `1.0` at index `dim`.
///
/// # Panics
/// Panics if `dim` is not a valid index (i.e. `dim >= 768`).
fn axis_vector(dim: usize) -> Vec<f32> {
    const DIMS: usize = 768;

    let mut unit = Vec::with_capacity(DIMS);
    unit.resize(DIMS, 0.0f32);
    unit[dim] = 1.0;
    unit
}

/// Store one chunk embedding together with its metadata row.
///
/// The vector is written to `embeddings` as the little-endian bytes of each
/// `f32`, under rowid `doc_id * 1000 + chunk_index`. A matching
/// `embedding_metadata` row is then inserted with model `nomic-embed-text`,
/// dims 768, document hash `hash_<doc_id>`, a fixed chunk hash, the current UTC
/// time in milliseconds as `created_at`, and an attempt count of 1.
///
/// # Panics
/// Panics if either insert fails.
fn insert_embedding(conn: &Connection, doc_id: i64, chunk_index: i64, embedding: &[f32]) {
    let vector_rowid = doc_id * 1000 + chunk_index;

    // Serialize the floats back to back in little-endian byte order.
    let mut blob: Vec<u8> = Vec::with_capacity(std::mem::size_of_val(embedding));
    for component in embedding {
        blob.extend_from_slice(&component.to_le_bytes());
    }

    let vector_insert = conn.execute(
        "INSERT INTO embeddings (rowid, embedding) VALUES (?1, ?2)",
        rusqlite::params![vector_rowid, blob],
    );
    vector_insert.unwrap();

    let created_at = chrono::Utc::now().timestamp_millis();
    let metadata_insert = conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count)
         VALUES (?1, ?2, 'nomic-embed-text', 768, 'hash_' || ?1, 'chunk_hash', ?3, 1)",
        rusqlite::params![doc_id, chunk_index, created_at],
    );
    metadata_insert.unwrap();
}

/// A query vector dominated by axis 0 must rank the document embedded on
/// axis 0 first.
#[test]
fn knn_search_returns_nearest_neighbors() {
    let (_tmp, conn) = create_test_db();

    let documents = [
        (1, "Doc A", "Content about authentication."),
        (2, "Doc B", "Content about database optimization."),
        (3, "Doc C", "Content about logging infrastructure."),
    ];
    for &(id, title, content) in documents.iter() {
        insert_document(&conn, id, title, content);
    }

    // Each document gets its own axis: doc 1 -> axis 0, doc 2 -> axis 1, doc 3 -> axis 2.
    let axes = [(1, 0), (2, 1), (3, 2)];
    for &(doc_id, axis) in axes.iter() {
        insert_embedding(&conn, doc_id, 0, &axis_vector(axis));
    }

    // Mostly axis 0 with a small axis-1 component, so doc 1 is the closest match.
    let mut query = vec![0.0f32; 768];
    query[..2].copy_from_slice(&[0.9, 0.1]);

    let hits = lore::search::search_vector(&conn, &query, 10).unwrap();

    assert!(!hits.is_empty(), "Should return at least one result");
    let nearest = &hits[0];
    assert_eq!(
        nearest.document_id, 1,
        "Nearest neighbor should be doc 1"
    );
}

/// With ten embedded documents available, a search limited to 3 must not
/// return more than 3 results.
#[test]
fn knn_search_respects_limit() {
    let (_tmp, conn) = create_test_db();

    // Documents 1..=10, each embedded on the axis matching its id.
    for doc_id in 1..=10 {
        let title = format!("Doc {}", doc_id);
        insert_document(&conn, doc_id, &title, "Some content.");

        let embedding = axis_vector(doc_id as usize);
        insert_embedding(&conn, doc_id, 0, &embedding);
    }

    let query = axis_vector(0);
    let hits = lore::search::search_vector(&conn, &query, 3).unwrap();
    assert!(hits.len() <= 3, "Results should be capped at limit");
}

/// A document with several matching chunks must appear only once in the
/// search results.
#[test]
fn knn_search_deduplicates_chunks() {
    let (_tmp, conn) = create_test_db();

    insert_document(
        &conn,
        1,
        "Multi-chunk doc",
        "Very long content that was chunked.",
    );

    // Same document, two chunks, both similar to the query.
    let mut v1 = vec![0.0f32; 768];
    v1[0] = 1.0;
    let mut v2 = vec![0.0f32; 768];
    v2[0] = 0.95;
    v2[1] = 0.05;

    insert_embedding(&conn, 1, 0, &v1);
    insert_embedding(&conn, 1, 1, &v2);

    let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();

    // Without this guard the uniqueness check below passes vacuously when the
    // search returns nothing, and the test would prove nothing about dedup.
    assert!(
        !results.is_empty(),
        "Search should return the multi-chunk document"
    );

    // Should deduplicate: the same document_id appears at most once.
    let unique_docs: std::collections::HashSet<i64> =
        results.iter().map(|r| r.document_id).collect();
    assert_eq!(
        unique_docs.len(),
        results.len(),
        "Each document should appear at most once in results"
    );
}

/// Deleting a document must leave neither its embedding row nor its
/// metadata row behind.
#[test]
fn orphan_trigger_deletes_embeddings_on_document_delete() {
    let (_tmp, conn) = create_test_db();

    // Runs a single-value COUNT query and returns the result.
    let count_rows = |sql: &str| -> i64 { conn.query_row(sql, [], |row| row.get(0)).unwrap() };

    insert_document(&conn, 1, "Will be deleted", "Content.");
    insert_embedding(&conn, 1, 0, &axis_vector(0));

    // rowid 1000 is the embedding for document 1, chunk 0 (1 * 1000 + 0).
    let before = count_rows("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000");
    assert_eq!(before, 1, "Embedding should exist before delete");

    // Remove the owning document.
    conn.execute("DELETE FROM documents WHERE id = 1", [])
        .unwrap();

    // The embedding row is expected to be removed by the orphan trigger.
    let after = count_rows("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000");
    assert_eq!(
        after, 0,
        "Trigger should delete embeddings when document is deleted"
    );

    // The metadata row is expected to go away through the foreign-key cascade.
    let remaining_metadata =
        count_rows("SELECT COUNT(*) FROM embedding_metadata WHERE document_id = 1");
    assert_eq!(remaining_metadata, 0, "Metadata should be cascade-deleted");
}

/// Searching a database that holds no embeddings must succeed and return
/// an empty result list.
#[test]
fn empty_database_returns_no_results() {
    let (_tmp, conn) = create_test_db();

    let query = axis_vector(0);
    let hits = lore::search::search_vector(&conn, &query, 10).unwrap();

    assert!(hits.is_empty(), "Empty DB should return no results");
}

// --- Bug-fix regression tests ---

/// Regression test (bug 2): a document skipped for chunk overflow carries an
/// error sentinel in `embedding_metadata` and must not be reported as pending
/// again, otherwise every pipeline run would re-process it forever.
#[test]
fn overflow_doc_with_error_sentinel_not_re_detected_as_pending() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Overflow doc", "Some content");

    // Mimic the pipeline's handling of a document that exceeds
    // CHUNK_ROWID_MULTIPLIER: an error sentinel row stored at chunk_index = 0.
    let recorded_at = chrono::Utc::now().timestamp_millis();
    let sentinel_insert = conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, last_error, last_attempt_at, chunk_max_bytes)
         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'overflow-sentinel', ?1, 1, 'Document produces too many chunks', ?1, ?2)",
        rusqlite::params![recorded_at, lore::embedding::CHUNK_MAX_BYTES as i64],
    );
    sentinel_insert.unwrap();

    // The listing query must skip the sentinel-marked document...
    let pending_docs =
        lore::embedding::find_pending_documents(&conn, 100, 0, "nomic-embed-text").unwrap();
    assert!(
        pending_docs.is_empty(),
        "Document with overflow error sentinel should not be re-detected as pending, got {} pending",
        pending_docs.len()
    );

    // ...and the counting query must agree with it.
    let pending_total =
        lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
    assert_eq!(
        pending_total, 0,
        "Count should be 0 for document with overflow sentinel"
    );
}

/// Regression test (bug 1): `count_pending_documents` and
/// `find_pending_documents` must report the same number of documents in every
/// scenario, i.e. their WHERE clauses have to be logically equivalent.
#[test]
fn count_and_find_pending_agree() {
    let (_tmp, conn) = create_test_db();

    // Runs both queries, checks that they agree, and hands back the count so
    // the caller can additionally pin the expected value.
    let pending_count = |scenario: &str| {
        let count = lore::embedding::count_pending_documents(&conn, "nomic-embed-text").unwrap();
        let found =
            lore::embedding::find_pending_documents(&conn, 1000, 0, "nomic-embed-text").unwrap();
        assert_eq!(count as usize, found.len(), "{}", scenario);
        count
    };

    // Scenario 1: no documents at all.
    pending_count("Empty DB: count and find should agree");

    // Scenario 2: a new document without any metadata is pending.
    insert_document(&conn, 1, "New doc", "Content");
    let count = pending_count("New doc: count and find should agree");
    assert_eq!(count, 1);

    // Scenario 3: metadata matching the current configuration means nothing is pending.
    let created_at = chrono::Utc::now().timestamp_millis();
    let metadata_insert = conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, chunk_max_bytes)
         VALUES (1, 0, 'nomic-embed-text', 768, 'hash_1', 'ch', ?1, 1, ?2)",
        rusqlite::params![created_at, lore::embedding::CHUNK_MAX_BYTES as i64],
    );
    metadata_insert.unwrap();
    let count = pending_count("Complete doc: count and find should agree");
    assert_eq!(count, 0);

    // Scenario 4: config drift (chunk_max_bytes mismatch) makes the document pending again.
    let drift_update = conn.execute(
        "UPDATE embedding_metadata SET chunk_max_bytes = 999 WHERE document_id = 1",
        [],
    );
    drift_update.unwrap();
    let count = pending_count("Config drift: count and find should agree");
    assert_eq!(count, 1);
}

/// Regression test (bug 7): the two DELETE statements behind the `--full`
/// flag run inside one transaction; afterwards both tables must be empty.
#[test]
fn full_embed_delete_is_atomic() {
    let (_tmp, conn) = create_test_db();

    // Runs a single-value COUNT query and returns the result.
    let count_rows = |sql: &str| -> i64 { conn.query_row(sql, [], |row| row.get(0)).unwrap() };

    insert_document(&conn, 1, "Doc", "Content");
    insert_embedding(&conn, 1, 0, &axis_vector(0));

    // Both tables hold exactly one row before the wipe.
    let metadata_before = count_rows("SELECT COUNT(*) FROM embedding_metadata");
    let embeddings_before = count_rows("SELECT COUNT(*) FROM embeddings");
    assert_eq!(metadata_before, 1);
    assert_eq!(embeddings_before, 1);

    // Same statement sequence as embed.rs --full: both deletes in one transaction.
    let wipe = conn.execute_batch(
        "BEGIN;
         DELETE FROM embedding_metadata;
         DELETE FROM embeddings;
         COMMIT;",
    );
    wipe.unwrap();

    let metadata_after = count_rows("SELECT COUNT(*) FROM embedding_metadata");
    let embeddings_after = count_rows("SELECT COUNT(*) FROM embeddings");
    assert_eq!(metadata_after, 0, "Metadata should be cleared");
    assert_eq!(embeddings_after, 0, "Embeddings should be cleared");
}