Files
gitlore/src/ingestion/dirty_tracker.rs
Taylor Eernisse 65583ed5d6 refactor: Remove redundant doc comments throughout codebase
Removes module-level doc comments (//! lines) and excessive inline doc
comments that were duplicating information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the what is clear from types)
- Implementation context (the how is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain
"why" (business decisions, non-obvious constraints) not "what" (which
the code itself shows). This change reduces noise and maintenance burden
while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 00:04:32 -05:00

275 lines
8.1 KiB
Rust

use rusqlite::Connection;
use crate::core::backoff::compute_next_attempt_at;
use crate::core::error::Result;
use crate::core::time::now_ms;
use crate::documents::SourceType;
/// Upper bound on the number of rows a single `get_dirty_sources` call returns
/// (it is bound to that query's `LIMIT`).
const DIRTY_SOURCES_BATCH_SIZE: usize = 500;
/// Queues a source for processing through an open transaction.
///
/// Behaves exactly like [`mark_dirty`]; the statement is executed on `tx`.
/// This function does not commit — that is left to the caller.
///
/// # Errors
///
/// Returns an error if the upsert statement fails.
pub fn mark_dirty_tx(
    tx: &rusqlite::Transaction<'_>,
    source_type: SourceType,
    source_id: i64,
) -> Result<()> {
    // `Transaction` derefs to `Connection`, so a single implementation serves
    // both entry points and the upsert SQL cannot drift between two copies.
    mark_dirty(tx, source_type, source_id)
}

/// Queues a source for processing, or re-queues it if it is already present.
///
/// Inserts a `dirty_sources` row keyed by `(source_type, source_id)` with
/// `queued_at` set to the current `now_ms()` value. When the row already
/// exists, `queued_at` is refreshed and the retry state is reset
/// (`attempt_count = 0`; `last_attempt_at`, `last_error` and
/// `next_attempt_at` become `NULL`), so a re-queued source is immediately
/// eligible again.
///
/// # Errors
///
/// Returns an error if the upsert statement fails.
pub fn mark_dirty(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
    conn.execute(
        "INSERT INTO dirty_sources (source_type, source_id, queued_at)
VALUES (?1, ?2, ?3)
ON CONFLICT(source_type, source_id) DO UPDATE SET
queued_at = excluded.queued_at,
attempt_count = 0,
last_attempt_at = NULL,
last_error = NULL,
next_attempt_at = NULL",
        rusqlite::params![source_type.as_str(), source_id, now_ms()],
    )?;
    Ok(())
}
/// Returns the next batch of queued sources that are currently eligible.
///
/// A row is eligible when its `next_attempt_at` is `NULL` or not later than
/// the current `now_ms()` value. Rows are ordered by `attempt_count`, then
/// `queued_at` (both ascending), and at most `DIRTY_SOURCES_BATCH_SIZE` rows
/// are returned.
///
/// # Errors
///
/// Returns an error if the query fails, or a `LoreError::Other` if a stored
/// `source_type` string is rejected by `SourceType::parse`.
pub fn get_dirty_sources(conn: &Connection) -> Result<Vec<(SourceType, i64)>> {
    let cutoff = now_ms();
    let mut query = conn.prepare(
        "SELECT source_type, source_id FROM dirty_sources
WHERE next_attempt_at IS NULL OR next_attempt_at <= ?1
ORDER BY attempt_count ASC, queued_at ASC
LIMIT ?2",
    )?;

    // Read every row first so that database errors surface before any
    // source_type parsing is attempted.
    let raw_rows: Vec<(String, i64)> = query
        .query_map(
            rusqlite::params![cutoff, DIRTY_SOURCES_BATCH_SIZE as i64],
            |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)),
        )?
        .collect::<std::result::Result<_, _>>()?;

    // Convert the stored type names; the first unknown name aborts the batch.
    raw_rows
        .into_iter()
        .map(|(type_name, source_id)| {
            SourceType::parse(&type_name)
                .map(|source_type| (source_type, source_id))
                .ok_or_else(|| {
                    crate::core::error::LoreError::Other(format!(
                        "Invalid source_type in dirty_sources: {}",
                        type_name
                    ))
                })
        })
        .collect()
}
/// Removes the queue entry for `(source_type, source_id)`.
///
/// Deleting an entry that does not exist is not an error; the number of
/// affected rows is ignored.
///
/// # Errors
///
/// Returns an error if the `DELETE` statement fails.
pub fn clear_dirty(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
    let type_key = source_type.as_str();
    conn.execute(
        "DELETE FROM dirty_sources WHERE source_type = ?1 AND source_id = ?2",
        rusqlite::params![type_key, source_id],
    )?;
    Ok(())
}
pub fn record_dirty_error(
conn: &Connection,
source_type: SourceType,
source_id: i64,
error: &str,
) -> Result<()> {
let now = now_ms();
let attempt_count: i64 = conn.query_row(
"SELECT attempt_count FROM dirty_sources WHERE source_type = ?1 AND source_id = ?2",
rusqlite::params![source_type.as_str(), source_id],
|row| row.get(0),
)?;
let new_attempt = attempt_count + 1;
let next_at = compute_next_attempt_at(now, new_attempt);
conn.execute(
"UPDATE dirty_sources SET
attempt_count = ?1,
last_attempt_at = ?2,
last_error = ?3,
next_attempt_at = ?4
WHERE source_type = ?5 AND source_id = ?6",
rusqlite::params![
new_attempt,
now,
error,
next_at,
source_type.as_str(),
source_id
],
)?;
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Opens an in-memory database containing only the `dirty_sources` table.
    fn setup_db() -> Connection {
        let db = Connection::open_in_memory().unwrap();
        db.execute_batch("
CREATE TABLE dirty_sources (
source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
source_id INTEGER NOT NULL,
queued_at INTEGER NOT NULL,
attempt_count INTEGER NOT NULL DEFAULT 0,
last_attempt_at INTEGER,
last_error TEXT,
next_attempt_at INTEGER,
PRIMARY KEY(source_type, source_id)
);
CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
").unwrap();
        db
    }

    /// Total number of rows currently in the queue.
    fn queue_len(conn: &Connection) -> i64 {
        conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0))
            .unwrap()
    }

    /// `attempt_count` of the single row with `source_id = 1`.
    fn attempt_count_of_source_1(conn: &Connection) -> i64 {
        conn.query_row(
            "SELECT attempt_count FROM dirty_sources WHERE source_id = 1",
            [],
            |r| r.get(0),
        )
        .unwrap()
    }

    /// Queues one issue per id in `ids`.
    fn mark_issues_dirty(conn: &Connection, ids: std::ops::Range<i64>) {
        for id in ids {
            mark_dirty(conn, SourceType::Issue, id).unwrap();
        }
    }

    #[test]
    fn test_mark_dirty_inserts() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        assert_eq!(queue_len(&conn), 1);
    }

    #[test]
    fn test_mark_dirty_tx_inserts() {
        let mut conn = setup_db();
        {
            // Scope the transaction so its borrow of `conn` ends before the count query.
            let tx = conn.transaction().unwrap();
            mark_dirty_tx(&tx, SourceType::Issue, 1).unwrap();
            tx.commit().unwrap();
        }
        assert_eq!(queue_len(&conn), 1);
    }

    #[test]
    fn test_requeue_resets_backoff() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();

        record_dirty_error(&conn, SourceType::Issue, 1, "test error").unwrap();
        assert_eq!(attempt_count_of_source_1(&conn), 1);

        // Re-queueing must wipe the retry state left by the recorded failure.
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        assert_eq!(attempt_count_of_source_1(&conn), 0);

        let next_at: Option<i64> = conn
            .query_row(
                "SELECT next_attempt_at FROM dirty_sources WHERE source_id = 1",
                [],
                |r| r.get(0),
            )
            .unwrap();
        assert!(next_at.is_none());
    }

    #[test]
    fn test_get_respects_backoff() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        // Push the retry time far into the future so the row is not yet eligible.
        conn.execute(
            "UPDATE dirty_sources SET next_attempt_at = 9999999999999 WHERE source_id = 1",
            [],
        )
        .unwrap();

        assert!(get_dirty_sources(&conn).unwrap().is_empty());
    }

    #[test]
    fn test_get_orders_by_attempt_count() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        conn.execute(
            "UPDATE dirty_sources SET attempt_count = 2 WHERE source_id = 1",
            [],
        )
        .unwrap();
        mark_dirty(&conn, SourceType::Issue, 2).unwrap();

        // The fresh entry (no attempts) must come before the retried one.
        let ids: Vec<i64> = get_dirty_sources(&conn)
            .unwrap()
            .iter()
            .map(|(_, id)| *id)
            .collect();
        assert_eq!(ids, vec![2, 1]);
    }

    #[test]
    fn test_batch_size_500() {
        let conn = setup_db();
        mark_issues_dirty(&conn, 0..600);

        let batch = get_dirty_sources(&conn).unwrap();
        assert_eq!(batch.len(), 500);
    }

    #[test]
    fn test_clear_removes() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        clear_dirty(&conn, SourceType::Issue, 1).unwrap();
        assert_eq!(queue_len(&conn), 0);
    }

    #[test]
    fn test_drain_loop() {
        let conn = setup_db();
        mark_issues_dirty(&conn, 0..1200);

        // Fetch-and-clear until the queue reports nothing left.
        let mut drained = 0;
        let mut batch = get_dirty_sources(&conn).unwrap();
        while !batch.is_empty() {
            for &(source_type, id) in &batch {
                clear_dirty(&conn, source_type, id).unwrap();
            }
            drained += batch.len();
            batch = get_dirty_sources(&conn).unwrap();
        }
        assert_eq!(drained, 1200);
    }
}