Removes module-level doc comments (//! lines) and excessive inline doc comments that duplicated information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the "what" is clear from types)
- Implementation context (the "how" is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain "why" (business decisions, non-obvious constraints), not "what" (which the code itself shows). This change reduces noise and maintenance burden while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
103 lines
3.0 KiB
Rust
103 lines
3.0 KiB
Rust
use rusqlite::Connection;
|
|
|
|
use crate::core::error::Result;
|
|
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
|
|
|
/// One row produced by [`find_pending_documents`]: a document selected for
/// (re-)embedding.
#[derive(Debug)]
pub struct PendingDocument {
    /// Value of `documents.id` for the selected row.
    pub document_id: i64,
    /// Value of `documents.content_text` for the selected row.
    pub content_text: String,
    /// Value of `documents.content_hash` for the selected row.
    pub content_hash: String,
}
|
|
|
|
pub fn find_pending_documents(
|
|
conn: &Connection,
|
|
page_size: usize,
|
|
last_id: i64,
|
|
model_name: &str,
|
|
) -> Result<Vec<PendingDocument>> {
|
|
let sql = r#"
|
|
SELECT d.id, d.content_text, d.content_hash
|
|
FROM documents d
|
|
WHERE d.id > ?1
|
|
AND (
|
|
NOT EXISTS (
|
|
SELECT 1 FROM embedding_metadata em
|
|
WHERE em.document_id = d.id AND em.chunk_index = 0
|
|
)
|
|
OR EXISTS (
|
|
SELECT 1 FROM embedding_metadata em
|
|
WHERE em.document_id = d.id AND em.chunk_index = 0
|
|
AND em.document_hash != d.content_hash
|
|
)
|
|
OR EXISTS (
|
|
SELECT 1 FROM embedding_metadata em
|
|
WHERE em.document_id = d.id AND em.chunk_index = 0
|
|
AND (
|
|
em.chunk_max_bytes IS NULL
|
|
OR em.chunk_max_bytes != ?3
|
|
OR em.model != ?4
|
|
OR em.dims != ?5
|
|
)
|
|
)
|
|
)
|
|
ORDER BY d.id
|
|
LIMIT ?2
|
|
"#;
|
|
|
|
let mut stmt = conn.prepare(sql)?;
|
|
let rows = stmt
|
|
.query_map(
|
|
rusqlite::params![
|
|
last_id,
|
|
page_size as i64,
|
|
CHUNK_MAX_BYTES as i64,
|
|
model_name,
|
|
EXPECTED_DIMS as i64,
|
|
],
|
|
|row| {
|
|
Ok(PendingDocument {
|
|
document_id: row.get(0)?,
|
|
content_text: row.get(1)?,
|
|
content_hash: row.get(2)?,
|
|
})
|
|
},
|
|
)?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
Ok(rows)
|
|
}
|
|
|
|
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
|
|
let count: i64 = conn.query_row(
|
|
r#"
|
|
SELECT COUNT(*)
|
|
FROM documents d
|
|
WHERE (
|
|
NOT EXISTS (
|
|
SELECT 1 FROM embedding_metadata em
|
|
WHERE em.document_id = d.id AND em.chunk_index = 0
|
|
)
|
|
OR EXISTS (
|
|
SELECT 1 FROM embedding_metadata em
|
|
WHERE em.document_id = d.id AND em.chunk_index = 0
|
|
AND em.document_hash != d.content_hash
|
|
)
|
|
OR EXISTS (
|
|
SELECT 1 FROM embedding_metadata em
|
|
WHERE em.document_id = d.id AND em.chunk_index = 0
|
|
AND (
|
|
em.chunk_max_bytes IS NULL
|
|
OR em.chunk_max_bytes != ?1
|
|
OR em.model != ?2
|
|
OR em.dims != ?3
|
|
)
|
|
)
|
|
)
|
|
"#,
|
|
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|
|
|row| row.get(0),
|
|
)?;
|
|
Ok(count)
|
|
}
|