Files
gitlore/src/embedding/change_detector.rs
Taylor Eernisse 7e0e6a91f2 refactor: extract unit tests into separate _tests.rs files
Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files
into dedicated _tests.rs companion files, wired via:

    #[cfg(test)]
    #[path = "module_tests.rs"]
    mod tests;

This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.

Modules extracted:
  core:      db, note_parser, payloads, project, references, sync_run,
             timeline_collect, timeline_expand, timeline_seed
  cli:       list (55 tests), who (75 tests)
  documents: extractor (43 tests), regenerator
  embedding: change_detector, chunking
  gitlab:    graphql (wiremock async tests), transformers/issue
  ingestion: dirty_tracker, discussions, issues, mr_diffs

Also adds conflicts_with("explain_score") to the --detail flag in the
who command to prevent mutually exclusive flags from being combined.

All 629 unit tests pass. No behavior changes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 10:54:02 -05:00

90 lines
2.7 KiB
Rust

use rusqlite::Connection;
use crate::core::error::Result;
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
/// A document row selected for (re-)embedding.
///
/// Instances are built by [`find_pending_documents`] from the `id`,
/// `content_text` and `content_hash` columns of the `documents` table.
#[derive(Debug)]
pub struct PendingDocument {
    /// Value of `documents.id` for this row.
    pub document_id: i64,
    /// Value of `documents.content_text` for this row.
    pub content_text: String,
    /// Value of `documents.content_hash`; the pending queries compare it
    /// against `embedding_metadata.document_hash`.
    pub content_hash: String,
}
pub fn find_pending_documents(
conn: &Connection,
page_size: usize,
last_id: i64,
model_name: &str,
) -> Result<Vec<PendingDocument>> {
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern.
// This allows SQLite to scan embedding_metadata once instead of three times.
// Semantically identical: returns documents needing (re-)embedding when:
// - No embedding exists (em.document_id IS NULL)
// - Content hash changed (em.document_hash != d.content_hash)
// - Config mismatch (model/dims/chunk_max_bytes)
let sql = r#"
SELECT d.id, d.content_text, d.content_hash
FROM documents d
LEFT JOIN embedding_metadata em
ON em.document_id = d.id AND em.chunk_index = 0
WHERE d.id > ?1
AND (
em.document_id IS NULL
OR em.document_hash != d.content_hash
OR em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?3
OR em.model != ?4
OR em.dims != ?5
)
ORDER BY d.id
LIMIT ?2
"#;
let mut stmt = conn.prepare(sql)?;
let rows = stmt
.query_map(
rusqlite::params![
last_id,
page_size as i64,
CHUNK_MAX_BYTES as i64,
model_name,
EXPECTED_DIMS as i64,
],
|row| {
Ok(PendingDocument {
document_id: row.get(0)?,
content_text: row.get(1)?,
content_hash: row.get(2)?,
})
},
)?
.collect::<std::result::Result<Vec<_>, _>>()?;
Ok(rows)
}
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern
let count: i64 = conn.query_row(
r#"
SELECT COUNT(*)
FROM documents d
LEFT JOIN embedding_metadata em
ON em.document_id = d.id AND em.chunk_index = 0
WHERE em.document_id IS NULL
OR em.document_hash != d.content_hash
OR em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?1
OR em.model != ?2
OR em.dims != ?3
"#,
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|row| row.get(0),
)?;
Ok(count)
}
// Unit tests are compiled only for test builds and are loaded from the
// file named in the `#[path]` attribute instead of an inline module body.
#[cfg(test)]
#[path = "change_detector_tests.rs"]
mod tests;