use rusqlite::Connection; use crate::core::error::Result; use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS}; #[derive(Debug)] pub struct PendingDocument { pub document_id: i64, pub content_text: String, pub content_hash: String, } pub fn find_pending_documents( conn: &Connection, page_size: usize, last_id: i64, model_name: &str, ) -> Result> { // Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern. // This allows SQLite to scan embedding_metadata once instead of three times. // Semantically identical: returns documents needing (re-)embedding when: // - No embedding exists (em.document_id IS NULL) // - Content hash changed (em.document_hash != d.content_hash) // - Config mismatch (model/dims/chunk_max_bytes) let sql = r#" SELECT d.id, d.content_text, d.content_hash FROM documents d LEFT JOIN embedding_metadata em ON em.document_id = d.id AND em.chunk_index = 0 WHERE d.id > ?1 AND ( em.document_id IS NULL OR em.document_hash != d.content_hash OR em.chunk_max_bytes IS NULL OR em.chunk_max_bytes != ?3 OR em.model != ?4 OR em.dims != ?5 ) ORDER BY d.id LIMIT ?2 "#; let mut stmt = conn.prepare(sql)?; let rows = stmt .query_map( rusqlite::params![ last_id, page_size as i64, CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64, ], |row| { Ok(PendingDocument { document_id: row.get(0)?, content_text: row.get(1)?, content_hash: row.get(2)?, }) }, )? .collect::, _>>()?; Ok(rows) } pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result { // Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern let count: i64 = conn.query_row( r#" SELECT COUNT(*) FROM documents d LEFT JOIN embedding_metadata em ON em.document_id = d.id AND em.chunk_index = 0 WHERE em.document_id IS NULL OR em.document_hash != d.content_hash OR em.chunk_max_bytes IS NULL OR em.chunk_max_bytes != ?1 OR em.model != ?2 OR em.dims != ?3 "#, rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64], |row| row.get(0), )?; Ok(count) } #[cfg(test)] mod tests { use std::path::Path; use super::*; use crate::core::db::{create_connection, run_migrations}; use crate::embedding::pipeline::record_embedding_error; const MODEL: &str = "nomic-embed-text"; fn setup_db() -> Connection { let conn = create_connection(Path::new(":memory:")).unwrap(); run_migrations(&conn).unwrap(); conn } fn insert_test_project(conn: &Connection) -> i64 { conn.execute( "INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url) VALUES (1, 'group/test', 'https://gitlab.example.com/group/test')", [], ) .unwrap(); conn.last_insert_rowid() } fn insert_test_document(conn: &Connection, project_id: i64, content: &str) -> i64 { conn.execute( "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash) VALUES ('issue', 1, ?1, ?2, 'hash123')", rusqlite::params![project_id, content], ) .unwrap(); conn.last_insert_rowid() } #[test] fn retry_failed_delete_makes_doc_pending_again() { let conn = setup_db(); let proj_id = insert_test_project(&conn); let doc_id = insert_test_document(&conn, proj_id, "some text content"); // Doc starts as pending let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap(); assert_eq!(pending.len(), 1, "Doc should be pending initially"); // Record an error — doc should no longer be pending record_embedding_error( &conn, doc_id, 0, "hash123", "chunkhash", MODEL, "test error", ) .unwrap(); let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap(); assert!( pending.is_empty(), "Doc with error metadata should not be pending" ); // DELETE error rows (mimicking --retry-failed) — doc should become pending again conn.execute_batch( "DELETE FROM embeddings WHERE rowid / 1000 IN ( SELECT DISTINCT document_id FROM embedding_metadata WHERE last_error IS NOT NULL ); DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;", ) .unwrap(); let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap(); assert_eq!(pending.len(), 1, "Doc should be pending again after DELETE"); assert_eq!(pending[0].document_id, doc_id); } #[test] fn empty_doc_with_error_not_pending() { let conn = setup_db(); let proj_id = insert_test_project(&conn); let doc_id = insert_test_document(&conn, proj_id, ""); // Empty doc starts as pending let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap(); assert_eq!(pending.len(), 1, "Empty doc should be pending initially"); // Record an error for the empty doc record_embedding_error( &conn, doc_id, 0, "hash123", "empty", MODEL, "Document has empty content", ) .unwrap(); // Should no longer be pending let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap(); assert!( pending.is_empty(), "Empty doc with error metadata should not be pending" ); } #[test] fn old_update_approach_leaves_doc_invisible() { // This test demonstrates WHY we use DELETE instead of UPDATE. // UPDATE clears last_error but the row still matches config params, // so the doc stays "not pending" — permanently invisible. let conn = setup_db(); let proj_id = insert_test_project(&conn); let doc_id = insert_test_document(&conn, proj_id, "some text content"); // Record an error record_embedding_error( &conn, doc_id, 0, "hash123", "chunkhash", MODEL, "test error", ) .unwrap(); // Old approach: UPDATE to clear error conn.execute( "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0 WHERE last_error IS NOT NULL", [], ) .unwrap(); // Doc is NOT pending — it's permanently invisible! This is the bug. let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap(); assert!( pending.is_empty(), "UPDATE approach leaves doc invisible (this proves the bug)" ); } }