gitlore/src/embedding/change_detector.rs

use rusqlite::Connection;

use crate::core::error::Result;
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};

/// A row from the `documents` table that still needs to be (re-)embedded.
///
/// Values of this type are produced by [`find_pending_documents`], which
/// fills each field from the correspondingly named column.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PendingDocument {
    /// Value of `documents.id` for this row.
    pub document_id: i64,
    /// Value of `documents.content_text` for this row.
    pub content_text: String,
    /// Value of `documents.content_hash`; the pending queries compare it
    /// against `embedding_metadata.document_hash` to spot changed content.
    pub content_hash: String,
}

/// Fetches one page of documents whose embeddings are missing or out of date.
///
/// A document is selected when its `id` is greater than `last_id` and any of
/// the following holds for its `embedding_metadata` row with `chunk_index = 0`:
///
/// * no such row exists;
/// * the row's `document_hash` differs from the document's `content_hash`;
/// * the row's `chunk_max_bytes` is `NULL` or differs from `CHUNK_MAX_BYTES`,
///   its `model` differs from `model_name`, or its `dims` differs from
///   `EXPECTED_DIMS`.
///
/// Rows are returned in ascending `id` order and at most `page_size` of them
/// are returned, so passing the last returned `document_id` back in as
/// `last_id` yields the following page.
///
/// # Errors
///
/// Returns an error if the statement cannot be prepared or executed, or if a
/// column cannot be read as the expected Rust type.
pub fn find_pending_documents(
    conn: &Connection,
    page_size: usize,
    last_id: i64,
    model_name: &str,
) -> Result<Vec<PendingDocument>> {
    let sql = r#"
        SELECT d.id, d.content_text, d.content_hash
        FROM documents d
        WHERE d.id > ?1
          AND (
            NOT EXISTS (
                SELECT 1 FROM embedding_metadata em
                WHERE em.document_id = d.id AND em.chunk_index = 0
            )
            OR EXISTS (
                SELECT 1 FROM embedding_metadata em
                WHERE em.document_id = d.id AND em.chunk_index = 0
                  AND em.document_hash != d.content_hash
            )
            OR EXISTS (
                SELECT 1 FROM embedding_metadata em
                WHERE em.document_id = d.id AND em.chunk_index = 0
                  AND (
                    em.chunk_max_bytes IS NULL
                    OR em.chunk_max_bytes != ?3
                    OR em.model != ?4
                    OR em.dims != ?5
                  )
            )
          )
        ORDER BY d.id
        LIMIT ?2
    "#;

    // A plain `as` cast would wrap a `usize` above `i64::MAX` to a negative
    // number, which SQLite interprets as "no upper bound". Saturate instead so
    // the bound value is always a well-defined, non-negative LIMIT.
    let limit = <i64 as std::convert::TryFrom<usize>>::try_from(page_size).unwrap_or(i64::MAX);

    let mut stmt = conn.prepare(sql)?;
    let rows = stmt
        .query_map(
            // Positional order must match the ?1..?5 placeholders above.
            rusqlite::params![
                last_id,
                limit,
                CHUNK_MAX_BYTES as i64,
                model_name,
                EXPECTED_DIMS as i64,
            ],
            |row| {
                Ok(PendingDocument {
                    document_id: row.get(0)?,
                    content_text: row.get(1)?,
                    content_hash: row.get(2)?,
                })
            },
        )?
        // Stop at the first row that fails to convert and surface its error.
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(rows)
}

pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
    let count: i64 = conn.query_row(
        r#"
        SELECT COUNT(*)
        FROM documents d
        WHERE (
            NOT EXISTS (
                SELECT 1 FROM embedding_metadata em
                WHERE em.document_id = d.id AND em.chunk_index = 0
            )
            OR EXISTS (
                SELECT 1 FROM embedding_metadata em
                WHERE em.document_id = d.id AND em.chunk_index = 0
                  AND em.document_hash != d.content_hash
            )
            OR EXISTS (
                SELECT 1 FROM embedding_metadata em
                WHERE em.document_id = d.id AND em.chunk_index = 0
                  AND (
                    em.chunk_max_bytes IS NULL
                    OR em.chunk_max_bytes != ?1
                    OR em.model != ?2
                    OR em.dims != ?3
                  )
            )
        )
        "#,
        rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
        |row| row.get(0),
    )?;
    Ok(count)
}