use rusqlite::Connection; use crate::core::error::Result; use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS}; #[derive(Debug)] pub struct PendingDocument { pub document_id: i64, pub content_text: String, pub content_hash: String, } pub fn find_pending_documents( conn: &Connection, page_size: usize, last_id: i64, model_name: &str, ) -> Result> { // Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern. // This allows SQLite to scan embedding_metadata once instead of three times. // Semantically identical: returns documents needing (re-)embedding when: // - No embedding exists (em.document_id IS NULL) // - Content hash changed (em.document_hash != d.content_hash) // - Config mismatch (model/dims/chunk_max_bytes) let sql = r#" SELECT d.id, d.content_text, d.content_hash FROM documents d LEFT JOIN embedding_metadata em ON em.document_id = d.id AND em.chunk_index = 0 WHERE d.id > ?1 AND ( em.document_id IS NULL OR em.document_hash != d.content_hash OR em.chunk_max_bytes IS NULL OR em.chunk_max_bytes != ?3 OR em.model != ?4 OR em.dims != ?5 ) ORDER BY d.id LIMIT ?2 "#; let mut stmt = conn.prepare(sql)?; let rows = stmt .query_map( rusqlite::params![ last_id, page_size as i64, CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64, ], |row| { Ok(PendingDocument { document_id: row.get(0)?, content_text: row.get(1)?, content_hash: row.get(2)?, }) }, )? .collect::, _>>()?; Ok(rows) } pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result { // Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern let count: i64 = conn.query_row( r#" SELECT COUNT(*) FROM documents d LEFT JOIN embedding_metadata em ON em.document_id = d.id AND em.chunk_index = 0 WHERE em.document_id IS NULL OR em.document_hash != d.content_hash OR em.chunk_max_bytes IS NULL OR em.chunk_max_bytes != ?1 OR em.model != ?2 OR em.dims != ?3 "#, rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64], |row| row.get(0), )?; Ok(count) } #[cfg(test)] #[path = "change_detector_tests.rs"] mod tests;