Files
gitlore/src/documents/regenerator.rs
teernisse 9ec1344945 feat(surgical-sync): add per-IID surgical sync pipeline with preflight validation
Add the ability to sync specific issues or merge requests by IID without
running a full incremental sync. This enables fast, targeted data refresh
for individual entities — useful for agent workflows, debugging, and
real-time investigation of specific issues or MRs.

Architecture:
- New CLI flags: --issue <IID> and --mr <IID> (repeatable, up to 100 total)
  scoped to a single project via -p/--project
- Preflight phase validates all IIDs exist on GitLab before any DB writes,
  with TOCTOU-aware soft verification at ingest time
- 6-stage pipeline: preflight -> fetch -> ingest -> dependents -> docs -> embed
- Each stage is cancellation-aware via ShutdownSignal
- Dedicated SyncRunRecorder extensions track surgical-specific counters
  (issues_fetched, mrs_ingested, docs_regenerated, etc.)

New modules:
- src/ingestion/surgical.rs: Core surgical fetch/ingest/dependent logic
  with preflight_fetch(), ingest_issue_by_iid(), ingest_mr_by_iid(),
  and fetch_dependents_for_{issue,mr}()
- src/cli/commands/sync_surgical.rs: Full CLI orchestrator with progress
  spinners, human/robot output, and cancellation handling
- src/embedding/pipeline.rs: embed_documents_by_ids() for scoped embedding
- src/documents/regenerator.rs: regenerate_dirty_documents_for_sources()
  for scoped document regeneration

Database changes:
- Migration 027: Extends sync_runs with mode, phase, surgical_iids_json,
  per-entity counters, and cancelled_at column
- New indexes: idx_sync_runs_mode_started, idx_sync_runs_status_phase_started

GitLab client:
- get_issue_by_iid() and get_mr_by_iid() single-entity fetch methods

Error handling:
- New SurgicalPreflightFailed error variant with entity_type, iid, project,
  and reason fields. Shares exit code 6 with GitLabNotFound.

Includes comprehensive test coverage:
- 645 lines of surgical ingestion tests (wiremock-based)
- 184 lines of scoped embedding tests
- 85 lines of scoped regeneration tests
- 113 lines of GitLab client single-entity tests
- 236 lines of sync_run surgical column/counter tests
- Unit tests for SyncOptions, error codes, and CLI validation
2026-02-18 16:28:21 -05:00

328 lines
11 KiB
Rust

use rusqlite::Connection;
use rusqlite::OptionalExtension;
use tracing::{debug, instrument, warn};
use crate::core::error::Result;
use crate::documents::{
DocumentData, ParentMetadataCache, SourceType, extract_discussion_document,
extract_issue_document, extract_mr_document, extract_note_document_cached,
};
use crate::ingestion::dirty_tracker::{clear_dirty, get_dirty_sources, record_dirty_error};
/// Counters summarizing one full dirty-document regeneration pass.
#[derive(Debug, Default)]
pub struct RegenerateResult {
    /// Sources whose document content actually changed (including deletions).
    pub regenerated: usize,
    /// Sources whose stored document was already up to date (all hashes matched).
    pub unchanged: usize,
    /// Sources whose regeneration failed; the error text was recorded via
    /// `record_dirty_error` and the source was left dirty.
    pub errored: usize,
}
/// Regenerate documents for every dirty source until the dirty queue drains.
///
/// Repeatedly pulls the current batch of dirty sources, rebuilds each one's
/// document, and clears the dirty flag on success. A failure is logged and
/// recorded against the source instead of aborting the whole pass. The
/// optional `progress_callback` is invoked after every item with
/// `(processed, estimated_total)`; the total is a running estimate refreshed
/// from the `dirty_sources` table at the start of each batch.
#[instrument(
    skip(conn, progress_callback),
    fields(items_processed, items_skipped, errors)
)]
pub fn regenerate_dirty_documents(
    conn: &Connection,
    progress_callback: Option<&dyn Fn(usize, usize)>,
) -> Result<RegenerateResult> {
    let mut totals = RegenerateResult::default();
    let mut projected_total: usize = 0;
    let mut parent_cache = ParentMetadataCache::new();

    loop {
        let batch = get_dirty_sources(conn)?;
        if batch.is_empty() {
            break;
        }

        // Refresh the progress estimate: everything finished so far plus
        // whatever is currently queued. `max` keeps the bar from shrinking.
        let queued: usize = conn
            .query_row("SELECT COUNT(*) FROM dirty_sources", [], |row| row.get(0))
            .unwrap_or(0_i64) as usize;
        let done_so_far = totals.regenerated + totals.unchanged + totals.errored;
        projected_total = projected_total.max(done_so_far + queued);

        for &(kind, id) in &batch {
            match regenerate_one(conn, kind, id, &mut parent_cache) {
                Ok(true) => {
                    totals.regenerated += 1;
                    clear_dirty(conn, kind, id)?;
                }
                Ok(false) => {
                    totals.unchanged += 1;
                    clear_dirty(conn, kind, id)?;
                }
                Err(e) => {
                    warn!(
                        source_type = %kind,
                        source_id = id,
                        error = %e,
                        "Failed to regenerate document"
                    );
                    record_dirty_error(conn, kind, id, &e.to_string())?;
                    totals.errored += 1;
                }
            }
            if let Some(report) = progress_callback {
                let done = totals.regenerated + totals.unchanged + totals.errored;
                report(done, projected_total);
            }
        }
    }

    debug!(
        regenerated = totals.regenerated,
        unchanged = totals.unchanged,
        errored = totals.errored,
        "Document regeneration complete"
    );
    tracing::Span::current().record("items_processed", totals.regenerated);
    tracing::Span::current().record("items_skipped", totals.unchanged);
    tracing::Span::current().record("errors", totals.errored);
    Ok(totals)
}
/// Result of a scoped (surgical) regeneration over an explicit source list.
#[derive(Debug, Default)]
pub struct RegenerateForSourcesResult {
    /// Sources whose document content actually changed (including deletions).
    pub regenerated: usize,
    /// Sources whose stored document was already up to date.
    pub unchanged: usize,
    /// Sources whose regeneration failed; the error was recorded.
    pub errored: usize,
    /// Ids of the `documents` rows that exist for successfully processed
    /// sources — collected so callers can scope follow-up work (e.g.
    /// embedding) to exactly these documents.
    pub document_ids: Vec<i64>,
}
/// Regenerate documents for an explicit set of sources (surgical-sync path).
///
/// Unlike [`regenerate_dirty_documents`], this does not drain the dirty
/// queue: it touches only the `(source_type, source_id)` pairs given in
/// `source_keys`, clearing each one's dirty flag on success and collecting
/// the ids of whatever documents exist for them afterwards. Failures are
/// logged and recorded per source; processing continues with the rest.
pub fn regenerate_dirty_documents_for_sources(
    conn: &Connection,
    source_keys: &[(SourceType, i64)],
) -> Result<RegenerateForSourcesResult> {
    let mut outcome = RegenerateForSourcesResult::default();
    let mut parent_cache = ParentMetadataCache::new();

    for &(kind, id) in source_keys {
        match regenerate_one(conn, kind, id, &mut parent_cache) {
            Err(e) => {
                warn!(
                    source_type = %kind,
                    source_id = id,
                    error = %e,
                    "Failed to regenerate document for source"
                );
                record_dirty_error(conn, kind, id, &e.to_string())?;
                outcome.errored += 1;
            }
            Ok(changed) => {
                if changed {
                    outcome.regenerated += 1;
                } else {
                    outcome.unchanged += 1;
                }
                clear_dirty(conn, kind, id)?;
                // Best effort: the regeneration may have deleted the document
                // (source vanished), in which case the id lookup fails and we
                // simply skip collecting it.
                if let Ok(doc_id) = get_document_id(conn, kind, id) {
                    outcome.document_ids.push(doc_id);
                }
            }
        }
    }

    debug!(
        regenerated = outcome.regenerated,
        unchanged = outcome.unchanged,
        errored = outcome.errored,
        document_ids = outcome.document_ids.len(),
        "Scoped document regeneration complete"
    );
    Ok(outcome)
}
/// Rebuild the document for a single source; returns whether anything changed.
///
/// Extracts the source's current document representation and upserts it. When
/// extraction yields nothing (the source no longer produces a document), the
/// stored document row is deleted and `true` is returned.
fn regenerate_one(
    conn: &Connection,
    source_type: SourceType,
    source_id: i64,
    cache: &mut ParentMetadataCache,
) -> Result<bool> {
    let extracted = match source_type {
        SourceType::Issue => extract_issue_document(conn, source_id)?,
        SourceType::MergeRequest => extract_mr_document(conn, source_id)?,
        SourceType::Discussion => extract_discussion_document(conn, source_id)?,
        SourceType::Note => extract_note_document_cached(conn, source_id, cache)?,
    };
    match extracted {
        Some(doc) => upsert_document(conn, &doc),
        None => {
            delete_document(conn, source_type, source_id)?;
            Ok(true)
        }
    }
}
/// Upsert `doc` inside a SQLite savepoint so the document row and its
/// label/path child rows change atomically; rolls the savepoint back on error.
fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<bool> {
    conn.execute_batch("SAVEPOINT upsert_doc")?;
    let outcome = upsert_document_inner(conn, doc);
    if outcome.is_ok() {
        conn.execute_batch("RELEASE upsert_doc")?;
    } else {
        // Best-effort rollback: the inner error is what gets reported.
        let _ = conn.execute_batch("ROLLBACK TO upsert_doc; RELEASE upsert_doc");
    }
    outcome
}
/// Insert or update the `documents` row for `doc`, plus its `document_labels`
/// and `document_paths` child rows.
///
/// Returns `Ok(false)` when the stored content/labels/paths hashes all match
/// the incoming document (nothing written); `Ok(true)` otherwise. Child rows
/// are rewritten only when their corresponding hash changed. Must run inside
/// the savepoint opened by `upsert_document` so the row and child tables stay
/// consistent.
fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<bool> {
    // Look up the existing row (if any) together with its three change hashes.
    let existing: Option<(i64, String, String, String)> = conn
        .query_row(
            "SELECT id, content_hash, labels_hash, paths_hash FROM documents
             WHERE source_type = ?1 AND source_id = ?2",
            rusqlite::params![doc.source_type.as_str(), doc.source_id],
            |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
        )
        .optional()?;

    // Fast path: if all three hashes match, nothing changed at all.
    if let Some((_, ref old_content_hash, ref old_labels_hash, ref old_paths_hash)) = existing
        && old_content_hash == &doc.content_hash
        && old_labels_hash == &doc.labels_hash
        && old_paths_hash == &doc.paths_hash
    {
        return Ok(false);
    }

    // Past this point at least one hash differs, so the document will be updated.
    let labels_json = serde_json::to_string(&doc.labels).unwrap_or_else(|_| "[]".to_string());
    conn.execute(
        "INSERT INTO documents
         (source_type, source_id, project_id, author_username, label_names,
          labels_hash, paths_hash,
          created_at, updated_at, url, title, content_text, content_hash,
          is_truncated, truncated_reason)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15)
         ON CONFLICT(source_type, source_id) DO UPDATE SET
             project_id = excluded.project_id,
             author_username = excluded.author_username,
             label_names = excluded.label_names,
             labels_hash = excluded.labels_hash,
             paths_hash = excluded.paths_hash,
             updated_at = excluded.updated_at,
             url = excluded.url,
             title = excluded.title,
             content_text = excluded.content_text,
             content_hash = excluded.content_hash,
             is_truncated = excluded.is_truncated,
             truncated_reason = excluded.truncated_reason",
        rusqlite::params![
            doc.source_type.as_str(),
            doc.source_id,
            doc.project_id,
            doc.author_username,
            labels_json,
            doc.labels_hash,
            doc.paths_hash,
            doc.created_at,
            doc.updated_at,
            doc.url,
            doc.title,
            doc.content_text,
            doc.content_hash,
            doc.is_truncated as i32,
            doc.truncated_reason,
        ],
    )?;

    // The conflict clause updates in place, so an existing id remains valid;
    // only a fresh insert needs a lookup.
    let doc_id = match existing {
        Some((id, _, _, _)) => id,
        None => get_document_id(conn, doc.source_type, doc.source_id)?,
    };

    // Rewrite child rows only when the relevant hash changed (or the row is new).
    let labels_changed = existing
        .as_ref()
        .map_or(true, |(_, _, old_hash, _)| old_hash != &doc.labels_hash);
    if labels_changed {
        replace_child_rows(conn, doc_id, "document_labels", "label_name", &doc.labels)?;
    }

    let paths_changed = existing
        .as_ref()
        .map_or(true, |(_, _, _, old_hash)| old_hash != &doc.paths_hash);
    if paths_changed {
        replace_child_rows(conn, doc_id, "document_paths", "path", &doc.paths)?;
    }

    // We passed the triple-hash fast path, so at least one hash differs.
    Ok(true)
}

/// Replace all rows for `doc_id` in a `(document_id, <value_column>)` child
/// table with `values`.
///
/// Uses one prepared statement executed per value instead of a single
/// multi-row `VALUES` list: the parameter count stays constant at two, so
/// arbitrarily large label/path sets cannot exceed SQLite's bound-parameter
/// limit (`SQLITE_MAX_VARIABLE_NUMBER`, 999 by default in older builds).
/// Atomicity is provided by the caller's savepoint. `table` and
/// `value_column` are compile-time constants, never user input, so the
/// `format!`-built SQL is not an injection risk.
fn replace_child_rows(
    conn: &Connection,
    doc_id: i64,
    table: &str,
    value_column: &str,
    values: &[String],
) -> Result<()> {
    conn.execute(
        &format!("DELETE FROM {table} WHERE document_id = ?1"),
        [doc_id],
    )?;
    if values.is_empty() {
        return Ok(());
    }
    let mut stmt = conn.prepare(&format!(
        "INSERT INTO {table} (document_id, {value_column}) VALUES (?1, ?2)"
    ))?;
    for value in values {
        stmt.execute(rusqlite::params![doc_id, value.as_str()])?;
    }
    Ok(())
}
/// Delete the stored document for a source key, if one exists.
///
/// A missing row is not an error — the DELETE simply affects zero rows.
/// NOTE(review): child rows in document_labels/document_paths are not removed
/// here; presumably cleaned up elsewhere (e.g. FK cascade) — not visible in
/// this file.
fn delete_document(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
    conn.execute(
        "DELETE FROM documents WHERE source_type = ?1 AND source_id = ?2",
        rusqlite::params![source_type.as_str(), source_id],
    )
    .map(|_affected| ())
    .map_err(Into::into)
}
/// Look up the `documents.id` for a source key.
///
/// Errors (propagating rusqlite's no-rows error) when no document exists for
/// the key.
fn get_document_id(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<i64> {
    let sql = "SELECT id FROM documents WHERE source_type = ?1 AND source_id = ?2";
    let id = conn.query_row(
        sql,
        rusqlite::params![source_type.as_str(), source_id],
        |row| row.get::<_, i64>(0),
    )?;
    Ok(id)
}
#[cfg(test)]
#[path = "regenerator_tests.rs"]
mod tests;