Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files
into dedicated _tests.rs companion files, wired via:
#[cfg(test)]
#[path = "module_tests.rs"]
mod tests;
This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.
Modules extracted:
core: db, note_parser, payloads, project, references, sync_run,
timeline_collect, timeline_expand, timeline_seed
cli: list (55 tests), who (75 tests)
documents: extractor (43 tests), regenerator
embedding: change_detector, chunking
gitlab: graphql (wiremock async tests), transformers/issue
ingestion: dirty_tracker, discussions, issues, mr_diffs
Also adds conflicts_with("explain_score") to the --detail flag in the
who command to prevent mutually exclusive flags from being combined.
All 629 unit tests pass. No behavior changes.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
90 lines
2.7 KiB
Rust
90 lines
2.7 KiB
Rust
use rusqlite::Connection;
|
|
|
|
use crate::core::error::Result;
|
|
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
|
|
|
/// A row from `documents` that has been selected for (re-)embedding.
///
/// Produced by `find_pending_documents`; the three fields map one-to-one onto
/// the `d.id`, `d.content_text` and `d.content_hash` columns of that query.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PendingDocument {
    /// Value of `documents.id`.
    pub document_id: i64,
    /// Value of `documents.content_text`.
    pub content_text: String,
    /// Value of `documents.content_hash`.
    pub content_hash: String,
}
|
|
|
|
pub fn find_pending_documents(
|
|
conn: &Connection,
|
|
page_size: usize,
|
|
last_id: i64,
|
|
model_name: &str,
|
|
) -> Result<Vec<PendingDocument>> {
|
|
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern.
|
|
// This allows SQLite to scan embedding_metadata once instead of three times.
|
|
// Semantically identical: returns documents needing (re-)embedding when:
|
|
// - No embedding exists (em.document_id IS NULL)
|
|
// - Content hash changed (em.document_hash != d.content_hash)
|
|
// - Config mismatch (model/dims/chunk_max_bytes)
|
|
let sql = r#"
|
|
SELECT d.id, d.content_text, d.content_hash
|
|
FROM documents d
|
|
LEFT JOIN embedding_metadata em
|
|
ON em.document_id = d.id AND em.chunk_index = 0
|
|
WHERE d.id > ?1
|
|
AND (
|
|
em.document_id IS NULL
|
|
OR em.document_hash != d.content_hash
|
|
OR em.chunk_max_bytes IS NULL
|
|
OR em.chunk_max_bytes != ?3
|
|
OR em.model != ?4
|
|
OR em.dims != ?5
|
|
)
|
|
ORDER BY d.id
|
|
LIMIT ?2
|
|
"#;
|
|
|
|
let mut stmt = conn.prepare(sql)?;
|
|
let rows = stmt
|
|
.query_map(
|
|
rusqlite::params![
|
|
last_id,
|
|
page_size as i64,
|
|
CHUNK_MAX_BYTES as i64,
|
|
model_name,
|
|
EXPECTED_DIMS as i64,
|
|
],
|
|
|row| {
|
|
Ok(PendingDocument {
|
|
document_id: row.get(0)?,
|
|
content_text: row.get(1)?,
|
|
content_hash: row.get(2)?,
|
|
})
|
|
},
|
|
)?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
Ok(rows)
|
|
}
|
|
|
|
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
|
|
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern
|
|
let count: i64 = conn.query_row(
|
|
r#"
|
|
SELECT COUNT(*)
|
|
FROM documents d
|
|
LEFT JOIN embedding_metadata em
|
|
ON em.document_id = d.id AND em.chunk_index = 0
|
|
WHERE em.document_id IS NULL
|
|
OR em.document_hash != d.content_hash
|
|
OR em.chunk_max_bytes IS NULL
|
|
OR em.chunk_max_bytes != ?1
|
|
OR em.model != ?2
|
|
OR em.dims != ?3
|
|
"#,
|
|
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|
|
|row| row.get(0),
|
|
)?;
|
|
Ok(count)
|
|
}
|
|
|
|
// Unit tests live in the companion file named below rather than inline, which
// keeps this file implementation-only. The module is compiled only for test
// builds and, as a child module, can still reach private items via `use super::*;`.
#[cfg(test)]
#[path = "change_detector_tests.rs"]
mod tests;
|