use rusqlite::Connection; use crate::core::error::Result; use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS}; #[derive(Debug)] pub struct PendingDocument { pub document_id: i64, pub content_text: String, pub content_hash: String, } pub fn find_pending_documents( conn: &Connection, page_size: usize, last_id: i64, model_name: &str, ) -> Result> { let sql = r#" SELECT d.id, d.content_text, d.content_hash FROM documents d WHERE d.id > ?1 AND ( NOT EXISTS ( SELECT 1 FROM embedding_metadata em WHERE em.document_id = d.id AND em.chunk_index = 0 ) OR EXISTS ( SELECT 1 FROM embedding_metadata em WHERE em.document_id = d.id AND em.chunk_index = 0 AND em.document_hash != d.content_hash ) OR EXISTS ( SELECT 1 FROM embedding_metadata em WHERE em.document_id = d.id AND em.chunk_index = 0 AND ( em.chunk_max_bytes IS NULL OR em.chunk_max_bytes != ?3 OR em.model != ?4 OR em.dims != ?5 ) ) ) ORDER BY d.id LIMIT ?2 "#; let mut stmt = conn.prepare(sql)?; let rows = stmt .query_map( rusqlite::params![ last_id, page_size as i64, CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64, ], |row| { Ok(PendingDocument { document_id: row.get(0)?, content_text: row.get(1)?, content_hash: row.get(2)?, }) }, )? .collect::, _>>()?; Ok(rows) } pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result { let count: i64 = conn.query_row( r#" SELECT COUNT(*) FROM documents d WHERE ( NOT EXISTS ( SELECT 1 FROM embedding_metadata em WHERE em.document_id = d.id AND em.chunk_index = 0 ) OR EXISTS ( SELECT 1 FROM embedding_metadata em WHERE em.document_id = d.id AND em.chunk_index = 0 AND em.document_hash != d.content_hash ) OR EXISTS ( SELECT 1 FROM embedding_metadata em WHERE em.document_id = d.id AND em.chunk_index = 0 AND ( em.chunk_max_bytes IS NULL OR em.chunk_max_bytes != ?1 OR em.model != ?2 OR em.dims != ?3 ) ) ) "#, rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64], |row| row.get(0), )?; Ok(count) }