feat(surgical-sync): add per-IID surgical sync pipeline with preflight validation

Add the ability to sync specific issues or merge requests by IID without running a full incremental sync. This enables fast, targeted data refresh for individual entities — useful for agent workflows, debugging, and real-time investigation of specific issues or MRs.

Architecture:
- New CLI flags: --issue <IID> and --mr <IID> (repeatable, up to 100 total) scoped to a single project via -p/--project
- Preflight phase validates all IIDs exist on GitLab before any DB writes, with TOCTOU-aware soft verification at ingest time
- 6-stage pipeline: preflight -> fetch -> ingest -> dependents -> docs -> embed
- Each stage is cancellation-aware via ShutdownSignal
- Dedicated SyncRunRecorder extensions track surgical-specific counters (issues_fetched, mrs_ingested, docs_regenerated, etc.)

New modules:
- src/ingestion/surgical.rs: Core surgical fetch/ingest/dependent logic with preflight_fetch(), ingest_issue_by_iid(), ingest_mr_by_iid(), and fetch_dependents_for_{issue,mr}()
- src/cli/commands/sync_surgical.rs: Full CLI orchestrator with progress spinners, human/robot output, and cancellation handling
- src/embedding/pipeline.rs: embed_documents_by_ids() for scoped embedding
- src/documents/regenerator.rs: regenerate_dirty_documents_for_sources() for scoped document regeneration

Database changes:
- Migration 027: Extends sync_runs with mode, phase, surgical_iids_json, per-entity counters, and cancelled_at column
- New indexes: idx_sync_runs_mode_started, idx_sync_runs_status_phase_started

GitLab client:
- get_issue_by_iid() and get_mr_by_iid() single-entity fetch methods

Error handling:
- New SurgicalPreflightFailed error variant with entity_type, iid, project, and reason fields. Shares exit code 6 with GitLabNotFound.

Includes comprehensive test coverage:
- 645 lines of surgical ingestion tests (wiremock-based)
- 184 lines of scoped embedding tests
- 85 lines of scoped regeneration tests
- 113 lines of GitLab client single-entity tests
- 236 lines of sync_run surgical column/counter tests
- Unit tests for SyncOptions, error codes, and CLI validation
2026-02-18 16:27:59 -05:00
parent ea6e45e43f
commit 9ec1344945
25 changed files with 3354 additions and 37 deletions
--- a/src/embedding/pipeline.rs
+++ b/src/embedding/pipeline.rs
@@ -578,3 +578,207 @@ fn sha256_hash(input: &str) -> String {
    hasher.update(input.as_bytes());
    format!("{:x}", hasher.finalize())
 }
+
+/// Outcome counters reported by [`embed_documents_by_ids`].
+#[derive(Debug, Default)]
+pub struct EmbedForIdsResult {
+    /// Number of `embedding_metadata` rows without a recorded error for the
+    /// documents processed in this run.
+    pub chunks_embedded: usize,
+    /// Number of processed documents whose error-free chunk rows reach the
+    /// expected chunk count stored on their chunk-0 row.
+    pub docs_embedded: usize,
+    /// Failure count reported by the embed page for this run.
+    pub failed: usize,
+    /// Requested IDs that were not selected for embedding (requested count
+    /// minus the number of documents returned by the pending lookup).
+    pub skipped: usize,
+}
+
+/// Embed a specific set of documents, identified by ID.
+///
+/// Documents already embedded with matching config (model, dims, chunk size,
+/// hash) are filtered out by `find_documents_by_ids` and reported as skipped.
+/// The remaining documents go through a single `embed_page` call wrapped in
+/// the `embed_by_ids` savepoint: the savepoint is released on success and
+/// rolled back on error, or when `signal` reports cancellation afterwards.
+///
+/// Returns an all-zero result when `document_ids` is empty or `signal` is
+/// already cancelled on entry.
+pub async fn embed_documents_by_ids(
+    conn: &Connection,
+    client: &OllamaClient,
+    model_name: &str,
+    concurrency: usize,
+    document_ids: &[i64],
+    signal: &ShutdownSignal,
+) -> Result<EmbedForIdsResult> {
+    // Nothing requested, or shutdown already in progress: nothing to do.
+    if document_ids.is_empty() || signal.is_cancelled() {
+        return Ok(EmbedForIdsResult::default());
+    }
+
+    // Only documents that are missing or stale with respect to the current
+    // embedding config come back from the lookup.
+    let pending = find_documents_by_ids(conn, document_ids, model_name)?;
+    let requested = document_ids.len();
+
+    let mut outcome = EmbedForIdsResult {
+        skipped: requested - pending.len(),
+        ..EmbedForIdsResult::default()
+    };
+
+    if pending.is_empty() {
+        return Ok(outcome);
+    }
+
+    info!(
+        requested = requested,
+        pending = pending.len(),
+        skipped = outcome.skipped,
+        "Scoped embedding: processing documents by ID"
+    );
+
+    // Same SAVEPOINT + embed_page pattern as the main pipeline, applied to a
+    // single page holding every pending document.
+    let total = pending.len();
+    let mut cursor_id: i64 = 0;
+    let mut done: usize = 0;
+    let mut stats = EmbedResult::default();
+
+    conn.execute_batch("SAVEPOINT embed_by_ids")?;
+    let page_outcome = embed_page(
+        conn,
+        client,
+        model_name,
+        concurrency,
+        &pending,
+        &mut stats,
+        &mut cursor_id,
+        &mut done,
+        total,
+        &None,
+        signal,
+    )
+    .await;
+
+    if let Err(err) = page_outcome {
+        // Best-effort rollback; the embed error is the one worth surfacing.
+        let _ = conn.execute_batch("ROLLBACK TO embed_by_ids; RELEASE embed_by_ids");
+        return Err(err);
+    }
+
+    if signal.is_cancelled() {
+        // Cancelled mid-page: discard the partial work, keep counters at zero.
+        let _ = conn.execute_batch("ROLLBACK TO embed_by_ids; RELEASE embed_by_ids");
+        info!("Rolled back scoped embed page due to cancellation");
+    } else {
+        conn.execute_batch("RELEASE embed_by_ids")?;
+
+        // Report what actually landed in the database rather than trusting
+        // in-memory counters.
+        let (chunks, docs) = count_embedded_results(conn, &pending)?;
+        outcome.chunks_embedded = chunks;
+        outcome.docs_embedded = docs;
+        outcome.failed = stats.failed;
+    }
+
+    info!(
+        chunks_embedded = outcome.chunks_embedded,
+        docs_embedded = outcome.docs_embedded,
+        failed = outcome.failed,
+        skipped = outcome.skipped,
+        "Scoped embedding complete"
+    );
+
+    Ok(outcome)
+}
+
+/// Fetch the subset of `document_ids` that still needs embedding.
+///
+/// A document is returned when it has no chunk-0 row in `embedding_metadata`,
+/// or when that row's document hash, chunk size, model, or dimensions differ
+/// from the current values (the scoped counterpart of
+/// `find_pending_documents`). Rows come back ordered by document ID.
+fn find_documents_by_ids(
+    conn: &Connection,
+    document_ids: &[i64],
+    model_name: &str,
+) -> Result<Vec<crate::embedding::change_detector::PendingDocument>> {
+    use crate::embedding::change_detector::PendingDocument;
+    use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
+    use rusqlite::types::ToSql;
+
+    let id_count = document_ids.len();
+    if id_count == 0 {
+        return Ok(Vec::new());
+    }
+
+    // Numbered placeholders ?1..?N carry the IDs; the three config values
+    // follow at ?N+1, ?N+2 and ?N+3.
+    let in_clause = (1..=id_count)
+        .map(|n| format!("?{n}"))
+        .collect::<Vec<_>>()
+        .join(", ");
+    let chunk_bytes_idx = id_count + 1;
+    let model_idx = id_count + 2;
+    let dims_idx = id_count + 3;
+
+    let sql = format!(
+        r#"
+        SELECT d.id, d.content_text, d.content_hash
+        FROM documents d
+        LEFT JOIN embedding_metadata em
+          ON em.document_id = d.id AND em.chunk_index = 0
+        WHERE d.id IN ({in_clause})
+          AND (
+            em.document_id IS NULL
+            OR em.document_hash != d.content_hash
+            OR em.chunk_max_bytes IS NULL
+            OR em.chunk_max_bytes != ?{chunk_bytes_idx}
+            OR em.model != ?{model_idx}
+            OR em.dims != ?{dims_idx}
+          )
+        ORDER BY d.id
+        "#
+    );
+
+    // Bind order mirrors the placeholder numbering: IDs first, then
+    // chunk_max_bytes, model, dims.
+    let chunk_max_bytes = CHUNK_MAX_BYTES as i64;
+    let dims = EXPECTED_DIMS as i64;
+    let mut bound: Vec<&dyn ToSql> = Vec::with_capacity(id_count + 3);
+    bound.extend(document_ids.iter().map(|id| id as &dyn ToSql));
+    bound.push(&chunk_max_bytes);
+    bound.push(&model_name);
+    bound.push(&dims);
+
+    let mut stmt = conn.prepare(&sql)?;
+    let mapped = stmt.query_map(bound.as_slice(), |row| {
+        Ok(PendingDocument {
+            document_id: row.get(0)?,
+            content_text: row.get(1)?,
+            content_hash: row.get(2)?,
+        })
+    })?;
+
+    let mut docs = Vec::new();
+    for doc in mapped {
+        docs.push(doc?);
+    }
+
+    Ok(docs)
+}
+
+/// Count how many chunks and complete docs were embedded for the given pending docs.
+///
+/// Returns `(chunks, docs)` where:
+/// - `chunks` is the total number of `embedding_metadata` rows with no
+///   `last_error` across all documents in `pending`;
+/// - `docs` is the number of documents that have at least one such row and
+///   whose error-free row count reaches the `chunk_count` stored on their
+///   `chunk_index = 0` row.
+///
+/// A document with no chunk-0 row, or a NULL `chunk_count` on it, still
+/// contributes its chunks but is not counted as a complete document.
+///
+/// # Errors
+///
+/// Propagates any database error raised while preparing or running the query.
+fn count_embedded_results(
+    conn: &Connection,
+    pending: &[crate::embedding::change_detector::PendingDocument],
+) -> Result<(usize, usize)> {
+    // One aggregate query per document, prepared once up front. An aggregate
+    // without GROUP BY always yields exactly one row, so a missing chunk-0
+    // row shows up as NULL instead of a "query returned no rows" error.
+    let mut stmt = conn.prepare(
+        "SELECT COUNT(CASE WHEN last_error IS NULL THEN 1 END), \
+                MAX(CASE WHEN chunk_index = 0 THEN chunk_count END) \
+         FROM embedding_metadata WHERE document_id = ?1",
+    )?;
+
+    let mut total_chunks: usize = 0;
+    let mut total_docs: usize = 0;
+
+    for doc in pending {
+        let (ok_chunks, expected): (i64, Option<i64>) =
+            stmt.query_row([doc.document_id], |row| Ok((row.get(0)?, row.get(1)?)))?;
+
+        if ok_chunks == 0 {
+            continue;
+        }
+
+        // COUNT(...) is never negative, so this cast cannot wrap.
+        total_chunks += ok_chunks as usize;
+
+        // Complete only when every expected chunk is present and error-free.
+        if expected.is_some_and(|expected_count| ok_chunks >= expected_count) {
+            total_docs += 1;
+        }
+    }
+
+    Ok((total_chunks, total_docs))
+}
+
+#[cfg(test)]
+#[path = "pipeline_tests.rs"]
+mod tests;