Files
gitlore/src/embedding/change_detector.rs

90 lines
2.7 KiB
Rust

use rusqlite::Connection;
use crate::core::error::Result;
use crate::embedding::chunks::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
/// A document row that still needs to be embedded or re-embedded.
///
/// Instances are built by `find_pending_documents` from the `id`,
/// `content_text` and `content_hash` columns of the `documents` table.
#[derive(Debug)]
pub struct PendingDocument {
/// Value of `documents.id` for this row.
pub document_id: i64,
/// Value of `documents.content_text` for this row.
pub content_text: String,
/// Value of `documents.content_hash` for this row; the pending-document
/// queries compare it against `embedding_metadata.document_hash`.
pub content_hash: String,
}
pub fn find_pending_documents(
conn: &Connection,
page_size: usize,
last_id: i64,
model_name: &str,
) -> Result<Vec<PendingDocument>> {
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern.
// This allows SQLite to scan embedding_metadata once instead of three times.
// Semantically identical: returns documents needing (re-)embedding when:
// - No embedding exists (em.document_id IS NULL)
// - Content hash changed (em.document_hash != d.content_hash)
// - Config mismatch (model/dims/chunk_max_bytes)
let sql = r#"
SELECT d.id, d.content_text, d.content_hash
FROM documents d
LEFT JOIN embedding_metadata em
ON em.document_id = d.id AND em.chunk_index = 0
WHERE d.id > ?1
AND (
em.document_id IS NULL
OR em.document_hash != d.content_hash
OR em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?3
OR em.model != ?4
OR em.dims != ?5
)
ORDER BY d.id
LIMIT ?2
"#;
let mut stmt = conn.prepare(sql)?;
let rows = stmt
.query_map(
rusqlite::params![
last_id,
page_size as i64,
CHUNK_MAX_BYTES as i64,
model_name,
EXPECTED_DIMS as i64,
],
|row| {
Ok(PendingDocument {
document_id: row.get(0)?,
content_text: row.get(1)?,
content_hash: row.get(2)?,
})
},
)?
.collect::<std::result::Result<Vec<_>, _>>()?;
Ok(rows)
}
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern
let count: i64 = conn.query_row(
r#"
SELECT COUNT(*)
FROM documents d
LEFT JOIN embedding_metadata em
ON em.document_id = d.id AND em.chunk_index = 0
WHERE em.document_id IS NULL
OR em.document_hash != d.content_hash
OR em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?1
OR em.model != ?2
OR em.dims != ?3
"#,
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|row| row.get(0),
)?;
Ok(count)
}
// Unit tests are compiled only under `cfg(test)` and are loaded from the file
// named in `#[path]` instead of an inline module body.
#[cfg(test)]
#[path = "change_detector_tests.rs"]
mod tests;