feat(surgical-sync): add per-IID surgical sync pipeline with preflight validation
Add the ability to sync specific issues or merge requests by IID without
running a full incremental sync. This enables fast, targeted data refresh
for individual entities — useful for agent workflows, debugging, and
real-time investigation of specific issues or MRs.
Architecture:
- New CLI flags: --issue <IID> and --mr <IID> (repeatable, up to 100 total)
scoped to a single project via -p/--project
- Preflight phase validates all IIDs exist on GitLab before any DB writes,
with TOCTOU-aware soft verification at ingest time
- 6-stage pipeline: preflight -> fetch -> ingest -> dependents -> docs -> embed
- Each stage is cancellation-aware via ShutdownSignal
- Dedicated SyncRunRecorder extensions track surgical-specific counters
(issues_fetched, mrs_ingested, docs_regenerated, etc.)
New modules:
- src/ingestion/surgical.rs: Core surgical fetch/ingest/dependent logic
with preflight_fetch(), ingest_issue_by_iid(), ingest_mr_by_iid(),
and fetch_dependents_for_{issue,mr}()
- src/cli/commands/sync_surgical.rs: Full CLI orchestrator with progress
spinners, human/robot output, and cancellation handling
- src/embedding/pipeline.rs: embed_documents_by_ids() for scoped embedding
- src/documents/regenerator.rs: regenerate_dirty_documents_for_sources()
for scoped document regeneration
Database changes:
- Migration 027: Extends sync_runs with mode, phase, surgical_iids_json,
per-entity counters, and cancelled_at column
- New indexes: idx_sync_runs_mode_started, idx_sync_runs_status_phase_started
GitLab client:
- get_issue_by_iid() and get_mr_by_iid() single-entity fetch methods
Error handling:
- New SurgicalPreflightFailed error variant with entity_type, iid, project,
and reason fields. Shares exit code 6 with GitLabNotFound.
Includes comprehensive test coverage:
- 645 lines of surgical ingestion tests (wiremock-based)
- 184 lines of scoped embedding tests
- 85 lines of scoped regeneration tests
- 113 lines of GitLab client single-entity tests
- 236 lines of sync_run surgical column/counter tests
- Unit tests for SyncOptions, error codes, and CLI validation
This commit is contained in:
@@ -578,3 +578,207 @@ fn sha256_hash(input: &str) -> String {
|
||||
hasher.update(input.as_bytes());
|
||||
format!("{:x}", hasher.finalize())
|
||||
}
|
||||
|
||||
/// Aggregate counters returned by `embed_documents_by_ids`.
#[derive(Debug, Default)]
pub struct EmbedForIdsResult {
    /// Successfully embedded chunks (rows in `embedding_metadata` with no
    /// `last_error`) across all requested documents.
    pub chunks_embedded: usize,
    /// Documents whose full expected chunk set is present after embedding.
    pub docs_embedded: usize,
    /// Failure count propagated from the embed page's stats.
    pub failed: usize,
    /// Requested document IDs that needed no work — already embedded with
    /// matching config, or not matched by the scoped pending query.
    pub skipped: usize,
}
|
||||
|
||||
/// Embed only the documents with the given IDs, skipping any that are
/// already embedded with matching config (model, dims, chunk size, hash).
///
/// The whole batch runs as one page inside a `SAVEPOINT`, mirroring the main
/// pipeline: a cancellation observed after the page completes rolls the page
/// back rather than leaving partial embeddings, and an error from the page
/// rolls back before propagating. Returns zeroed counters (not an error)
/// when `document_ids` is empty or shutdown was already requested.
pub async fn embed_documents_by_ids(
    conn: &Connection,
    client: &OllamaClient,
    model_name: &str,
    concurrency: usize,
    document_ids: &[i64],
    signal: &ShutdownSignal,
) -> Result<EmbedForIdsResult> {
    let mut result = EmbedForIdsResult::default();

    if document_ids.is_empty() {
        return Ok(result);
    }

    // Cheap early-out: avoid touching the DB at all once shutdown is requested.
    if signal.is_cancelled() {
        return Ok(result);
    }

    // Load documents for the specified IDs, filtering out already-embedded
    let pending = find_documents_by_ids(conn, document_ids, model_name)?;

    if pending.is_empty() {
        // Nothing pending: every requested ID is either already embedded with
        // matching config or not matched by the scoped query, so report them
        // all as skipped. NOTE(review): duplicate IDs in `document_ids` would
        // inflate this count — assumed deduplicated by the caller; confirm.
        result.skipped = document_ids.len();
        return Ok(result);
    }

    let skipped_count = document_ids.len() - pending.len();
    result.skipped = skipped_count;

    info!(
        requested = document_ids.len(),
        pending = pending.len(),
        skipped = skipped_count,
        "Scoped embedding: processing documents by ID"
    );

    // Use the same SAVEPOINT + embed_page pattern as the main pipeline
    let mut last_id: i64 = 0;
    let mut processed: usize = 0;
    let total = pending.len();
    let mut page_stats = EmbedResult::default();

    conn.execute_batch("SAVEPOINT embed_by_ids")?;
    let page_result = embed_page(
        conn,
        client,
        model_name,
        concurrency,
        &pending,
        &mut page_stats,
        &mut last_id,
        &mut processed,
        total,
        &None,
        signal,
    )
    .await;

    match page_result {
        // Cancelled mid/after page: discard the page's writes. Rollback errors
        // are deliberately ignored — best-effort cleanup during shutdown.
        Ok(()) if signal.is_cancelled() => {
            let _ = conn.execute_batch("ROLLBACK TO embed_by_ids; RELEASE embed_by_ids");
            info!("Rolled back scoped embed page due to cancellation");
        }
        Ok(()) => {
            conn.execute_batch("RELEASE embed_by_ids")?;

            // Count actual results from DB
            let (chunks, docs) = count_embedded_results(conn, &pending)?;
            result.chunks_embedded = chunks;
            result.docs_embedded = docs;
            result.failed = page_stats.failed;
        }
        Err(e) => {
            // Roll back best-effort, then surface the original page error.
            let _ = conn.execute_batch("ROLLBACK TO embed_by_ids; RELEASE embed_by_ids");
            return Err(e);
        }
    }

    info!(
        chunks_embedded = result.chunks_embedded,
        docs_embedded = result.docs_embedded,
        failed = result.failed,
        skipped = result.skipped,
        "Scoped embedding complete"
    );

    Ok(result)
}
|
||||
|
||||
/// Load documents by specific IDs, filtering out those already embedded
|
||||
/// with matching config (same logic as `find_pending_documents` but scoped).
|
||||
fn find_documents_by_ids(
|
||||
conn: &Connection,
|
||||
document_ids: &[i64],
|
||||
model_name: &str,
|
||||
) -> Result<Vec<crate::embedding::change_detector::PendingDocument>> {
|
||||
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
||||
|
||||
if document_ids.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Build IN clause with placeholders
|
||||
let placeholders: Vec<String> = (0..document_ids.len())
|
||||
.map(|i| format!("?{}", i + 1))
|
||||
.collect();
|
||||
let in_clause = placeholders.join(", ");
|
||||
|
||||
let sql = format!(
|
||||
r#"
|
||||
SELECT d.id, d.content_text, d.content_hash
|
||||
FROM documents d
|
||||
LEFT JOIN embedding_metadata em
|
||||
ON em.document_id = d.id AND em.chunk_index = 0
|
||||
WHERE d.id IN ({in_clause})
|
||||
AND (
|
||||
em.document_id IS NULL
|
||||
OR em.document_hash != d.content_hash
|
||||
OR em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?{chunk_bytes_idx}
|
||||
OR em.model != ?{model_idx}
|
||||
OR em.dims != ?{dims_idx}
|
||||
)
|
||||
ORDER BY d.id
|
||||
"#,
|
||||
in_clause = in_clause,
|
||||
chunk_bytes_idx = document_ids.len() + 1,
|
||||
model_idx = document_ids.len() + 2,
|
||||
dims_idx = document_ids.len() + 3,
|
||||
);
|
||||
|
||||
let mut stmt = conn.prepare(&sql)?;
|
||||
|
||||
// Build params: document_ids... then chunk_max_bytes, model, dims
|
||||
let mut params: Vec<Box<dyn rusqlite::types::ToSql>> = Vec::new();
|
||||
for id in document_ids {
|
||||
params.push(Box::new(*id));
|
||||
}
|
||||
params.push(Box::new(CHUNK_MAX_BYTES as i64));
|
||||
params.push(Box::new(model_name.to_string()));
|
||||
params.push(Box::new(EXPECTED_DIMS as i64));
|
||||
|
||||
let param_refs: Vec<&dyn rusqlite::types::ToSql> = params.iter().map(|p| p.as_ref()).collect();
|
||||
|
||||
let rows = stmt
|
||||
.query_map(param_refs.as_slice(), |row| {
|
||||
Ok(crate::embedding::change_detector::PendingDocument {
|
||||
document_id: row.get(0)?,
|
||||
content_text: row.get(1)?,
|
||||
content_hash: row.get(2)?,
|
||||
})
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
/// Count how many chunks and complete docs were embedded for the given pending docs.
|
||||
fn count_embedded_results(
|
||||
conn: &Connection,
|
||||
pending: &[crate::embedding::change_detector::PendingDocument],
|
||||
) -> Result<(usize, usize)> {
|
||||
let mut total_chunks: usize = 0;
|
||||
let mut total_docs: usize = 0;
|
||||
|
||||
for doc in pending {
|
||||
let chunk_count: i64 = conn.query_row(
|
||||
"SELECT COUNT(*) FROM embedding_metadata WHERE document_id = ?1 AND last_error IS NULL",
|
||||
[doc.document_id],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
if chunk_count > 0 {
|
||||
total_chunks += chunk_count as usize;
|
||||
// Check if all expected chunks are present (chunk_count metadata on chunk_index=0)
|
||||
let expected: Option<i64> = conn.query_row(
|
||||
"SELECT chunk_count FROM embedding_metadata WHERE document_id = ?1 AND chunk_index = 0",
|
||||
[doc.document_id],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
if let Some(expected_count) = expected
|
||||
&& chunk_count >= expected_count
|
||||
{
|
||||
total_docs += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((total_chunks, total_docs))
|
||||
}
|
||||
|
||||
// Unit tests live in a sibling file to keep this module focused; the
// `#[path]` attribute pulls `pipeline_tests.rs` in as a child module for
// test builds only.
#[cfg(test)]
#[path = "pipeline_tests.rs"]
mod tests;
|
||||
|
||||
Reference in New Issue
Block a user