gitlore/src/embedding/change_detector.rs

use rusqlite::Connection;

use crate::core::error::Result;
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};

/// A document row selected for (re-)embedding.
///
/// Values are read from the `documents` table by [`find_pending_documents`].
/// `Clone`/`PartialEq`/`Eq` are derived so callers and tests can copy and
/// compare results directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PendingDocument {
    /// Value of `documents.id`.
    pub document_id: i64,
    /// Value of `documents.content_text`.
    pub content_text: String,
    /// Value of `documents.content_hash`; the pending queries compare it
    /// against `embedding_metadata.document_hash`.
    pub content_hash: String,
}

pub fn find_pending_documents(
    conn: &Connection,
    page_size: usize,
    last_id: i64,
    model_name: &str,
) -> Result<Vec<PendingDocument>> {
    // Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern.
    // This allows SQLite to scan embedding_metadata once instead of three times.
    // Semantically identical: returns documents needing (re-)embedding when:
    // - No embedding exists (em.document_id IS NULL)
    // - Content hash changed (em.document_hash != d.content_hash)
    // - Config mismatch (model/dims/chunk_max_bytes)
    let sql = r#"
        SELECT d.id, d.content_text, d.content_hash
        FROM documents d
        LEFT JOIN embedding_metadata em
          ON em.document_id = d.id AND em.chunk_index = 0
        WHERE d.id > ?1
          AND (
            em.document_id IS NULL
            OR em.document_hash != d.content_hash
            OR em.chunk_max_bytes IS NULL
            OR em.chunk_max_bytes != ?3
            OR em.model != ?4
            OR em.dims != ?5
          )
        ORDER BY d.id
        LIMIT ?2
    "#;

    let mut stmt = conn.prepare(sql)?;
    let rows = stmt
        .query_map(
            rusqlite::params![
                last_id,
                page_size as i64,
                CHUNK_MAX_BYTES as i64,
                model_name,
                EXPECTED_DIMS as i64,
            ],
            |row| {
                Ok(PendingDocument {
                    document_id: row.get(0)?,
                    content_text: row.get(1)?,
                    content_hash: row.get(2)?,
                })
            },
        )?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(rows)
}

pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
    // Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern
    let count: i64 = conn.query_row(
        r#"
        SELECT COUNT(*)
        FROM documents d
        LEFT JOIN embedding_metadata em
          ON em.document_id = d.id AND em.chunk_index = 0
        WHERE em.document_id IS NULL
           OR em.document_hash != d.content_hash
           OR em.chunk_max_bytes IS NULL
           OR em.chunk_max_bytes != ?1
           OR em.model != ?2
           OR em.dims != ?3
        "#,
        rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
        |row| row.get(0),
    )?;
    Ok(count)
}

// Unit tests for this module live in the sibling file
// `change_detector_tests.rs` and are compiled only in test builds.
#[cfg(test)]
#[path = "change_detector_tests.rs"]
mod tests;