feat(documents): Add document generation pipeline with dirty tracking

Implements the documents module that transforms raw ingested entities
(issues, MRs, discussions) into searchable document blobs stored in
the documents table. This is the foundation for both FTS5 lexical
search and vector embedding.

Key components:

- documents::extractor: Renders entities into structured text documents.
  Issues include title, description, labels, milestone, assignees, and
  threaded discussion summaries. MRs additionally include source/target
  branches, reviewers, and approval status. Discussions are rendered
  with full note threading.

- documents::regenerator: Drains the dirty_queue table to regenerate
  only documents whose source entities changed since last sync. Supports
  full rebuild mode (seeds all entities into dirty queue first) and
  project-scoped regeneration.

- documents::truncation: Safety cap at 2MB per document to prevent
  pathological outliers from degrading FTS or embedding performance.

- ingestion::dirty_tracker: Marks entities as dirty inside the
  ingestion transaction so document regeneration stays consistent
  with data changes. Uses INSERT OR IGNORE to deduplicate.

- ingestion::discussion_queue: Queue-based discussion fetching that
  isolates individual discussion failures from the broader ingestion
  pipeline, preventing a single corrupt discussion from blocking
  an entire project sync.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:46:18 -05:00
parent d31d5292f2
commit 20edff4ab1
7 changed files with 2431 additions and 0 deletions

View File

@@ -0,0 +1,258 @@
use rusqlite::Connection;
use crate::core::backoff::compute_next_attempt_at;
use crate::core::error::Result;
use crate::core::time::now_ms;
use crate::documents::SourceType;
const DIRTY_SOURCES_BATCH_SIZE: usize = 500;
/// Mark a source entity as dirty INSIDE an existing transaction.
/// ON CONFLICT resets ALL backoff/error state so fresh updates are immediately eligible.
pub fn mark_dirty_tx(
tx: &rusqlite::Transaction<'_>,
source_type: SourceType,
source_id: i64,
) -> Result<()> {
tx.execute(
"INSERT INTO dirty_sources (source_type, source_id, queued_at)
VALUES (?1, ?2, ?3)
ON CONFLICT(source_type, source_id) DO UPDATE SET
queued_at = excluded.queued_at,
attempt_count = 0,
last_attempt_at = NULL,
last_error = NULL,
next_attempt_at = NULL",
rusqlite::params![source_type.as_str(), source_id, now_ms()],
)?;
Ok(())
}
/// Convenience wrapper for non-transactional contexts.
pub fn mark_dirty(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
conn.execute(
"INSERT INTO dirty_sources (source_type, source_id, queued_at)
VALUES (?1, ?2, ?3)
ON CONFLICT(source_type, source_id) DO UPDATE SET
queued_at = excluded.queued_at,
attempt_count = 0,
last_attempt_at = NULL,
last_error = NULL,
next_attempt_at = NULL",
rusqlite::params![source_type.as_str(), source_id, now_ms()],
)?;
Ok(())
}
/// Get dirty sources ready for processing.
/// Returns entries where next_attempt_at is NULL or <= now.
/// Orders by attempt_count ASC (fresh before failed), then queued_at ASC.
pub fn get_dirty_sources(conn: &Connection) -> Result<Vec<(SourceType, i64)>> {
let now = now_ms();
let mut stmt = conn.prepare(
"SELECT source_type, source_id FROM dirty_sources
WHERE next_attempt_at IS NULL OR next_attempt_at <= ?1
ORDER BY attempt_count ASC, queued_at ASC
LIMIT ?2"
)?;
let rows = stmt
.query_map(rusqlite::params![now, DIRTY_SOURCES_BATCH_SIZE as i64], |row| {
let st_str: String = row.get(0)?;
let source_id: i64 = row.get(1)?;
Ok((st_str, source_id))
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
let mut results = Vec::with_capacity(rows.len());
for (st_str, source_id) in rows {
let source_type = SourceType::parse(&st_str).ok_or_else(|| {
crate::core::error::LoreError::Other(format!(
"Invalid source_type in dirty_sources: {}",
st_str
))
})?;
results.push((source_type, source_id));
}
Ok(results)
}
/// Clear dirty entry after successful processing.
pub fn clear_dirty(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
conn.execute(
"DELETE FROM dirty_sources WHERE source_type = ?1 AND source_id = ?2",
rusqlite::params![source_type.as_str(), source_id],
)?;
Ok(())
}
/// Record an error for a dirty source, incrementing attempt_count and setting backoff.
pub fn record_dirty_error(
conn: &Connection,
source_type: SourceType,
source_id: i64,
error: &str,
) -> Result<()> {
let now = now_ms();
// Get current attempt_count first
let attempt_count: i64 = conn.query_row(
"SELECT attempt_count FROM dirty_sources WHERE source_type = ?1 AND source_id = ?2",
rusqlite::params![source_type.as_str(), source_id],
|row| row.get(0),
)?;
let new_attempt = attempt_count + 1;
let next_at = compute_next_attempt_at(now, new_attempt);
conn.execute(
"UPDATE dirty_sources SET
attempt_count = ?1,
last_attempt_at = ?2,
last_error = ?3,
next_attempt_at = ?4
WHERE source_type = ?5 AND source_id = ?6",
rusqlite::params![new_attempt, now, error, next_at, source_type.as_str(), source_id],
)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
fn setup_db() -> Connection {
let conn = Connection::open_in_memory().unwrap();
conn.execute_batch("
CREATE TABLE dirty_sources (
source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
source_id INTEGER NOT NULL,
queued_at INTEGER NOT NULL,
attempt_count INTEGER NOT NULL DEFAULT 0,
last_attempt_at INTEGER,
last_error TEXT,
next_attempt_at INTEGER,
PRIMARY KEY(source_type, source_id)
);
CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
").unwrap();
conn
}
#[test]
fn test_mark_dirty_inserts() {
let conn = setup_db();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap();
assert_eq!(count, 1);
}
#[test]
fn test_mark_dirty_tx_inserts() {
let mut conn = setup_db();
{
let tx = conn.transaction().unwrap();
mark_dirty_tx(&tx, SourceType::Issue, 1).unwrap();
tx.commit().unwrap();
}
let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap();
assert_eq!(count, 1);
}
#[test]
fn test_requeue_resets_backoff() {
let conn = setup_db();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
// Simulate error state
record_dirty_error(&conn, SourceType::Issue, 1, "test error").unwrap();
let attempt: i64 = conn.query_row(
"SELECT attempt_count FROM dirty_sources WHERE source_id = 1", [], |r| r.get(0)
).unwrap();
assert_eq!(attempt, 1);
// Re-mark should reset
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
let attempt: i64 = conn.query_row(
"SELECT attempt_count FROM dirty_sources WHERE source_id = 1", [], |r| r.get(0)
).unwrap();
assert_eq!(attempt, 0);
let next_at: Option<i64> = conn.query_row(
"SELECT next_attempt_at FROM dirty_sources WHERE source_id = 1", [], |r| r.get(0)
).unwrap();
assert!(next_at.is_none());
}
#[test]
fn test_get_respects_backoff() {
let conn = setup_db();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
// Set next_attempt_at far in the future
conn.execute(
"UPDATE dirty_sources SET next_attempt_at = 9999999999999 WHERE source_id = 1",
[],
).unwrap();
let results = get_dirty_sources(&conn).unwrap();
assert!(results.is_empty());
}
#[test]
fn test_get_orders_by_attempt_count() {
let conn = setup_db();
// Insert issue 1 (failed, attempt_count=2)
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
conn.execute(
"UPDATE dirty_sources SET attempt_count = 2 WHERE source_id = 1",
[],
).unwrap();
// Insert issue 2 (fresh, attempt_count=0)
mark_dirty(&conn, SourceType::Issue, 2).unwrap();
let results = get_dirty_sources(&conn).unwrap();
assert_eq!(results.len(), 2);
assert_eq!(results[0].1, 2); // Fresh first
assert_eq!(results[1].1, 1); // Failed second
}
#[test]
fn test_batch_size_500() {
let conn = setup_db();
for i in 0..600 {
mark_dirty(&conn, SourceType::Issue, i).unwrap();
}
let results = get_dirty_sources(&conn).unwrap();
assert_eq!(results.len(), 500);
}
#[test]
fn test_clear_removes() {
let conn = setup_db();
mark_dirty(&conn, SourceType::Issue, 1).unwrap();
clear_dirty(&conn, SourceType::Issue, 1).unwrap();
let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap();
assert_eq!(count, 0);
}
#[test]
fn test_drain_loop() {
let conn = setup_db();
for i in 0..1200 {
mark_dirty(&conn, SourceType::Issue, i).unwrap();
}
let mut total = 0;
loop {
let batch = get_dirty_sources(&conn).unwrap();
if batch.is_empty() {
break;
}
for (st, id) in &batch {
clear_dirty(&conn, *st, *id).unwrap();
}
total += batch.len();
}
assert_eq!(total, 1200);
}
}

View File

@@ -0,0 +1,265 @@
use rusqlite::Connection;
use crate::core::backoff::compute_next_attempt_at;
use crate::core::error::Result;
use crate::core::time::now_ms;
/// Noteable type for discussion queue.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NoteableType {
Issue,
MergeRequest,
}
impl NoteableType {
pub fn as_str(&self) -> &'static str {
match self {
Self::Issue => "Issue",
Self::MergeRequest => "MergeRequest",
}
}
pub fn parse(s: &str) -> Option<Self> {
match s {
"Issue" => Some(Self::Issue),
"MergeRequest" => Some(Self::MergeRequest),
_ => None,
}
}
}
/// A pending discussion fetch entry.
pub struct PendingFetch {
pub project_id: i64,
pub noteable_type: NoteableType,
pub noteable_iid: i64,
pub attempt_count: i32,
}
/// Queue a discussion fetch. ON CONFLICT resets backoff (consistent with dirty_sources).
pub fn queue_discussion_fetch(
conn: &Connection,
project_id: i64,
noteable_type: NoteableType,
noteable_iid: i64,
) -> Result<()> {
conn.execute(
"INSERT INTO pending_discussion_fetches (project_id, noteable_type, noteable_iid, queued_at)
VALUES (?1, ?2, ?3, ?4)
ON CONFLICT(project_id, noteable_type, noteable_iid) DO UPDATE SET
queued_at = excluded.queued_at,
attempt_count = 0,
last_attempt_at = NULL,
last_error = NULL,
next_attempt_at = NULL",
rusqlite::params![project_id, noteable_type.as_str(), noteable_iid, now_ms()],
)?;
Ok(())
}
/// Get next batch of pending fetches (WHERE next_attempt_at IS NULL OR <= now).
pub fn get_pending_fetches(conn: &Connection, limit: usize) -> Result<Vec<PendingFetch>> {
let now = now_ms();
let mut stmt = conn.prepare(
"SELECT project_id, noteable_type, noteable_iid, attempt_count
FROM pending_discussion_fetches
WHERE next_attempt_at IS NULL OR next_attempt_at <= ?1
ORDER BY queued_at ASC
LIMIT ?2"
)?;
let rows = stmt
.query_map(rusqlite::params![now, limit as i64], |row| {
Ok((
row.get::<_, i64>(0)?,
row.get::<_, String>(1)?,
row.get::<_, i64>(2)?,
row.get::<_, i32>(3)?,
))
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
let mut results = Vec::with_capacity(rows.len());
for (project_id, nt_str, noteable_iid, attempt_count) in rows {
let noteable_type = NoteableType::parse(&nt_str).ok_or_else(|| {
crate::core::error::LoreError::Other(format!(
"Invalid noteable_type in pending_discussion_fetches: {}",
nt_str
))
})?;
results.push(PendingFetch {
project_id,
noteable_type,
noteable_iid,
attempt_count,
});
}
Ok(results)
}
/// Mark fetch complete (remove from queue).
pub fn complete_fetch(
conn: &Connection,
project_id: i64,
noteable_type: NoteableType,
noteable_iid: i64,
) -> Result<()> {
conn.execute(
"DELETE FROM pending_discussion_fetches
WHERE project_id = ?1 AND noteable_type = ?2 AND noteable_iid = ?3",
rusqlite::params![project_id, noteable_type.as_str(), noteable_iid],
)?;
Ok(())
}
/// Record fetch error with backoff.
pub fn record_fetch_error(
conn: &Connection,
project_id: i64,
noteable_type: NoteableType,
noteable_iid: i64,
error: &str,
) -> Result<()> {
let now = now_ms();
let attempt_count: i64 = conn.query_row(
"SELECT attempt_count FROM pending_discussion_fetches
WHERE project_id = ?1 AND noteable_type = ?2 AND noteable_iid = ?3",
rusqlite::params![project_id, noteable_type.as_str(), noteable_iid],
|row| row.get(0),
)?;
let new_attempt = attempt_count + 1;
let next_at = compute_next_attempt_at(now, new_attempt);
conn.execute(
"UPDATE pending_discussion_fetches SET
attempt_count = ?1,
last_attempt_at = ?2,
last_error = ?3,
next_attempt_at = ?4
WHERE project_id = ?5 AND noteable_type = ?6 AND noteable_iid = ?7",
rusqlite::params![new_attempt, now, error, next_at, project_id, noteable_type.as_str(), noteable_iid],
)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
fn setup_db() -> Connection {
let conn = Connection::open_in_memory().unwrap();
conn.execute_batch("
CREATE TABLE projects (
id INTEGER PRIMARY KEY,
gitlab_project_id INTEGER UNIQUE NOT NULL,
path_with_namespace TEXT NOT NULL,
default_branch TEXT,
web_url TEXT,
created_at INTEGER,
updated_at INTEGER,
raw_payload_id INTEGER
);
INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project');
CREATE TABLE pending_discussion_fetches (
project_id INTEGER NOT NULL REFERENCES projects(id),
noteable_type TEXT NOT NULL,
noteable_iid INTEGER NOT NULL,
queued_at INTEGER NOT NULL,
attempt_count INTEGER NOT NULL DEFAULT 0,
last_attempt_at INTEGER,
last_error TEXT,
next_attempt_at INTEGER,
PRIMARY KEY(project_id, noteable_type, noteable_iid)
);
CREATE INDEX idx_pending_discussions_next_attempt ON pending_discussion_fetches(next_attempt_at);
").unwrap();
conn
}
#[test]
fn test_queue_and_get() {
let conn = setup_db();
queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
let fetches = get_pending_fetches(&conn, 100).unwrap();
assert_eq!(fetches.len(), 1);
assert_eq!(fetches[0].project_id, 1);
assert_eq!(fetches[0].noteable_type, NoteableType::Issue);
assert_eq!(fetches[0].noteable_iid, 42);
assert_eq!(fetches[0].attempt_count, 0);
}
#[test]
fn test_requeue_resets_backoff() {
let conn = setup_db();
queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
record_fetch_error(&conn, 1, NoteableType::Issue, 42, "network error").unwrap();
let attempt: i32 = conn.query_row(
"SELECT attempt_count FROM pending_discussion_fetches WHERE noteable_iid = 42",
[], |r| r.get(0),
).unwrap();
assert_eq!(attempt, 1);
// Re-queue should reset
queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
let attempt: i32 = conn.query_row(
"SELECT attempt_count FROM pending_discussion_fetches WHERE noteable_iid = 42",
[], |r| r.get(0),
).unwrap();
assert_eq!(attempt, 0);
}
#[test]
fn test_backoff_respected() {
let conn = setup_db();
queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
conn.execute(
"UPDATE pending_discussion_fetches SET next_attempt_at = 9999999999999 WHERE noteable_iid = 42",
[],
).unwrap();
let fetches = get_pending_fetches(&conn, 100).unwrap();
assert!(fetches.is_empty());
}
#[test]
fn test_complete_removes() {
let conn = setup_db();
queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
complete_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
let count: i64 = conn.query_row(
"SELECT COUNT(*) FROM pending_discussion_fetches", [], |r| r.get(0),
).unwrap();
assert_eq!(count, 0);
}
#[test]
fn test_error_increments_attempts() {
let conn = setup_db();
queue_discussion_fetch(&conn, 1, NoteableType::MergeRequest, 10).unwrap();
record_fetch_error(&conn, 1, NoteableType::MergeRequest, 10, "timeout").unwrap();
let (attempt, error): (i32, Option<String>) = conn.query_row(
"SELECT attempt_count, last_error FROM pending_discussion_fetches WHERE noteable_iid = 10",
[], |r| Ok((r.get(0)?, r.get(1)?)),
).unwrap();
assert_eq!(attempt, 1);
assert_eq!(error, Some("timeout".to_string()));
let next_at: Option<i64> = conn.query_row(
"SELECT next_attempt_at FROM pending_discussion_fetches WHERE noteable_iid = 10",
[], |r| r.get(0),
).unwrap();
assert!(next_at.is_some());
}
#[test]
fn test_noteable_type_parse() {
assert_eq!(NoteableType::parse("Issue"), Some(NoteableType::Issue));
assert_eq!(NoteableType::parse("MergeRequest"), Some(NoteableType::MergeRequest));
assert_eq!(NoteableType::parse("invalid"), None);
}
}

View File

@@ -3,6 +3,8 @@
//! This module handles fetching and storing issues, discussions, and notes
//! from GitLab with cursor-based incremental sync.
pub mod dirty_tracker;
pub mod discussion_queue;
pub mod discussions;
pub mod issues;
pub mod merge_requests;