feat(sync): Instrument pipeline with tracing spans, run_id correlation, and metrics

Add end-to-end observability to the sync and ingest pipelines:

Sync command:
- Generate UUID-based run_id for each sync invocation, propagated through
  all child spans for log correlation across stages
- Accept MetricsLayer reference to extract hierarchical StageTiming data
  after pipeline completion for robot-mode performance output
- Record sync runs in DB via SyncRunRecorder (start/succeed/fail lifecycle)
- Wrap entire sync execution in a root tracing span with run_id field

Ingest command:
- Wrap run_ingest in an instrumented root span with run_id and resource_type
- Add project path prefix to discussion progress bars for multi-project clarity
- Reset resource_events_synced_for_updated_at on --full re-sync

Sync status:
- Expand from single last_run to configurable recent runs list (default 10)
- Parse and expose StageTiming metrics from stored metrics_json
- Add run_id, total_items_processed, total_errors to SyncRunInfo
- Add mr_count to DataSummary for complete entity coverage

Orchestrator:
- Add #[instrument] with structured fields to issue and MR ingestion functions
- Record items_processed, items_skipped, errors on span close for MetricsLayer
- Emit granular progress events (IssuesFetchStarted, IssuesFetchComplete)
- Pass project_id through to drain_resource_events for scoped job claiming

Document regenerator and embedding pipeline:
- Add #[instrument] spans with items_processed, items_skipped, errors fields
- Record final counts on span close for metrics extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-02-04 10:01:28 -05:00
parent 362503d3bf
commit f6d19a9467
6 changed files with 603 additions and 234 deletions

View File

@@ -7,11 +7,11 @@
use futures::future::join_all;
use rusqlite::Connection;
use tracing::{debug, info, warn};
use tracing::{debug, info, instrument, warn};
use crate::Config;
use crate::core::dependent_queue::{
claim_jobs, complete_job, count_pending_jobs, enqueue_job, fail_job, reclaim_stale_locks,
claim_jobs, complete_job, count_claimable_jobs, enqueue_job, fail_job, reclaim_stale_locks,
};
use crate::core::error::Result;
use crate::gitlab::GitLabClient;
@@ -108,6 +108,10 @@ pub async fn ingest_project_issues(
}
/// Ingest all issues and their discussions for a project with progress reporting.
#[instrument(
skip(conn, client, config, progress),
fields(project_id, gitlab_project_id, items_processed, items_skipped, errors)
)]
pub async fn ingest_project_issues_with_progress(
conn: &Connection,
client: &GitLabClient,
@@ -124,12 +128,17 @@ pub async fn ingest_project_issues_with_progress(
};
// Step 1: Ingest issues
emit(ProgressEvent::IssuesFetchStarted);
let issue_result = ingest_issues(conn, client, config, project_id, gitlab_project_id).await?;
result.issues_fetched = issue_result.fetched;
result.issues_upserted = issue_result.upserted;
result.labels_created = issue_result.labels_created;
emit(ProgressEvent::IssuesFetchComplete {
total: result.issues_fetched,
});
// Step 2: Sync discussions for issues that need it
let issues_needing_sync = issue_result.issues_needing_discussion_sync;
@@ -189,8 +198,15 @@ pub async fn ingest_project_issues_with_progress(
}
// Drain the queue
let drain_result =
drain_resource_events(conn, client, config, gitlab_project_id, &progress).await?;
let drain_result = drain_resource_events(
conn,
client,
config,
project_id,
gitlab_project_id,
&progress,
)
.await?;
result.resource_events_fetched = drain_result.fetched;
result.resource_events_failed = drain_result.failed;
}
@@ -208,6 +224,10 @@ pub async fn ingest_project_issues_with_progress(
"Project ingestion complete"
);
tracing::Span::current().record("items_processed", result.issues_upserted);
tracing::Span::current().record("items_skipped", result.issues_skipped_discussion_sync);
tracing::Span::current().record("errors", result.resource_events_failed);
Ok(result)
}
@@ -280,6 +300,10 @@ pub async fn ingest_project_merge_requests(
}
/// Ingest all merge requests and their discussions for a project with progress reporting.
#[instrument(
skip(conn, client, config, progress),
fields(project_id, gitlab_project_id, items_processed, items_skipped, errors)
)]
pub async fn ingest_project_merge_requests_with_progress(
conn: &Connection,
client: &GitLabClient,
@@ -380,8 +404,15 @@ pub async fn ingest_project_merge_requests_with_progress(
debug!(enqueued, "Enqueued resource events jobs for MRs");
}
let drain_result =
drain_resource_events(conn, client, config, gitlab_project_id, &progress).await?;
let drain_result = drain_resource_events(
conn,
client,
config,
project_id,
gitlab_project_id,
&progress,
)
.await?;
result.resource_events_fetched = drain_result.fetched;
result.resource_events_failed = drain_result.failed;
}
@@ -400,6 +431,10 @@ pub async fn ingest_project_merge_requests_with_progress(
"MR project ingestion complete"
);
tracing::Span::current().record("items_processed", result.mrs_upserted);
tracing::Span::current().record("items_skipped", result.mrs_skipped_discussion_sync);
tracing::Span::current().record("errors", result.resource_events_failed);
Ok(result)
}
@@ -455,6 +490,7 @@ async fn sync_mr_discussions_sequential(
pub struct DrainResult {
    /// Entities whose resource events were successfully fetched and stored
    /// (incremented after `complete_job` + watermark update).
    pub fetched: usize,
    /// Jobs that failed with a retryable error and were handed to `fail_job`
    /// for exponential-backoff retry.
    pub failed: usize,
    /// Jobs completed without events due to a permanent API error (e.g. 404 —
    /// the resource-events endpoint does not exist for the entity); the
    /// watermark is stamped so they are not re-enqueued next run.
    pub skipped_not_found: usize,
}
/// Enqueue resource_events jobs for all entities of a given type in a project.
@@ -466,21 +502,60 @@ fn enqueue_resource_events_for_entity_type(
project_id: i64,
entity_type: &str,
) -> Result<usize> {
// Query all entities for this project and enqueue resource_events jobs.
// The UNIQUE constraint on pending_dependent_fetches makes this idempotent -
// already-queued entities are silently skipped via INSERT OR IGNORE.
// Clean up obsolete jobs: remove resource_events jobs for entities whose
// watermark is already current (updated_at <= resource_events_synced_for_updated_at).
// These are leftover from prior runs that failed after watermark-stamping but
// before job deletion, or from entities that no longer need syncing.
// We intentionally keep jobs for entities that still need syncing (including
// in-progress or failed-with-backoff jobs) to preserve retry state.
match entity_type {
"issue" => {
conn.execute(
"DELETE FROM pending_dependent_fetches \
WHERE project_id = ?1 AND entity_type = 'issue' AND job_type = 'resource_events' \
AND entity_local_id IN ( \
SELECT id FROM issues \
WHERE project_id = ?1 \
AND updated_at <= COALESCE(resource_events_synced_for_updated_at, 0) \
)",
[project_id],
)?;
}
"merge_request" => {
conn.execute(
"DELETE FROM pending_dependent_fetches \
WHERE project_id = ?1 AND entity_type = 'merge_request' AND job_type = 'resource_events' \
AND entity_local_id IN ( \
SELECT id FROM merge_requests \
WHERE project_id = ?1 \
AND updated_at <= COALESCE(resource_events_synced_for_updated_at, 0) \
)",
[project_id],
)?;
}
_ => {}
}
// Enqueue resource_events jobs only for entities whose updated_at exceeds
// their last resource event sync watermark.
//
// Use separate hardcoded queries per entity type to avoid format!-based SQL.
let entities: Vec<(i64, i64)> = match entity_type {
"issue" => {
let mut stmt =
conn.prepare_cached("SELECT id, iid FROM issues WHERE project_id = ?1")?;
let mut stmt = conn.prepare_cached(
"SELECT id, iid FROM issues \
WHERE project_id = ?1 \
AND updated_at > COALESCE(resource_events_synced_for_updated_at, 0)",
)?;
stmt.query_map([project_id], |row| Ok((row.get(0)?, row.get(1)?)))?
.collect::<std::result::Result<Vec<_>, _>>()?
}
"merge_request" => {
let mut stmt =
conn.prepare_cached("SELECT id, iid FROM merge_requests WHERE project_id = ?1")?;
let mut stmt = conn.prepare_cached(
"SELECT id, iid FROM merge_requests \
WHERE project_id = ?1 \
AND updated_at > COALESCE(resource_events_synced_for_updated_at, 0)",
)?;
stmt.query_map([project_id], |row| Ok((row.get(0)?, row.get(1)?)))?
.collect::<std::result::Result<Vec<_>, _>>()?
}
@@ -509,10 +584,15 @@ fn enqueue_resource_events_for_entity_type(
///
/// Processes jobs sequentially since `rusqlite::Connection` is not `Send`.
/// Uses exponential backoff on failure via `fail_job`.
#[instrument(
skip(conn, client, config, progress),
fields(project_id, gitlab_project_id, items_processed, errors)
)]
async fn drain_resource_events(
conn: &Connection,
client: &GitLabClient,
config: &Config,
project_id: i64,
gitlab_project_id: i64,
progress: &Option<ProgressCallback>,
) -> Result<DrainResult> {
@@ -525,9 +605,15 @@ async fn drain_resource_events(
info!(reclaimed, "Reclaimed stale resource event locks");
}
// Count total pending jobs for progress reporting
let pending_counts = count_pending_jobs(conn)?;
let total_pending = pending_counts.get("resource_events").copied().unwrap_or(0);
// Count only claimable jobs (unlocked, past retry backoff) for accurate progress.
// Using count_pending_jobs here would inflate the total with locked/backing-off
// jobs that can't be claimed in this drain run, causing the progress bar to
// never reach 100%.
let claimable_counts = count_claimable_jobs(conn, project_id)?;
let total_pending = claimable_counts
.get("resource_events")
.copied()
.unwrap_or(0);
if total_pending == 0 {
return Ok(result);
@@ -547,15 +633,19 @@ async fn drain_resource_events(
let mut seen_job_ids = std::collections::HashSet::new();
loop {
let jobs = claim_jobs(conn, "resource_events", batch_size)?;
let jobs = claim_jobs(conn, "resource_events", project_id, batch_size)?;
if jobs.is_empty() {
break;
}
// Track whether any job in this batch was actually new. If every
// claimed job was already seen, break to avoid an infinite loop
// (can happen with clock skew or zero-backoff edge cases).
let mut any_new_in_batch = false;
for job in &jobs {
// Guard against re-processing a job that was failed and re-claimed
// within the same drain run (shouldn't happen due to backoff, but
// defensive against clock skew or zero-backoff edge cases).
// within the same drain run.
if !seen_job_ids.insert(job.id) {
warn!(
job_id = job.id,
@@ -563,6 +653,7 @@ async fn drain_resource_events(
);
continue;
}
any_new_in_batch = true;
match client
.fetch_all_resource_events(gitlab_project_id, &job.entity_type, job.entity_iid)
@@ -582,6 +673,11 @@ async fn drain_resource_events(
match store_result {
Ok(()) => {
complete_job(conn, job.id)?;
update_resource_event_watermark(
conn,
&job.entity_type,
job.entity_local_id,
)?;
result.fetched += 1;
}
Err(e) => {
@@ -597,14 +693,34 @@ async fn drain_resource_events(
}
}
Err(e) => {
warn!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
error = %e,
"Failed to fetch resource events from GitLab"
);
fail_job(conn, job.id, &e.to_string())?;
result.failed += 1;
// Only 404 (not found) is truly permanent -- the resource
// events endpoint doesn't exist for this entity. Stamp the
// watermark so we skip it next run. All other errors
// (403, auth, network) get backoff retry.
if e.is_permanent_api_error() {
debug!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
error = %e,
"Permanent API error for resource events, marking complete"
);
complete_job(conn, job.id)?;
update_resource_event_watermark(
conn,
&job.entity_type,
job.entity_local_id,
)?;
result.skipped_not_found += 1;
} else {
warn!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
error = %e,
"Failed to fetch resource events from GitLab"
);
fail_job(conn, job.id, &e.to_string())?;
result.failed += 1;
}
}
}
@@ -614,6 +730,12 @@ async fn drain_resource_events(
total: total_pending,
});
}
// If every job in this batch was already seen, stop to prevent spinning.
if !any_new_in_batch {
warn!("All claimed jobs were already processed, breaking drain loop");
break;
}
}
emit(ProgressEvent::ResourceEventsFetchComplete {
@@ -629,6 +751,9 @@ async fn drain_resource_events(
);
}
tracing::Span::current().record("items_processed", result.fetched);
tracing::Span::current().record("errors", result.failed);
Ok(result)
}
@@ -680,6 +805,33 @@ fn store_resource_events(
Ok(())
}
/// Stamp an entity's resource-event sync watermark after a successful fetch.
///
/// Copies `updated_at` into `resource_events_synced_for_updated_at` so the
/// entity is not re-enqueued until its `updated_at` advances again.
/// Unknown entity types are ignored (no-op), matching the enqueue-side match.
fn update_resource_event_watermark(
    conn: &Connection,
    entity_type: &str,
    entity_local_id: i64,
) -> Result<()> {
    // Pick the table-specific statement up front; bail out early for entity
    // types we don't track a watermark for.
    let sql = match entity_type {
        "issue" => {
            "UPDATE issues SET resource_events_synced_for_updated_at = updated_at WHERE id = ?"
        }
        "merge_request" => {
            "UPDATE merge_requests SET resource_events_synced_for_updated_at = updated_at WHERE id = ?"
        }
        _ => return Ok(()),
    };
    conn.execute(sql, [entity_local_id])?;
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
@@ -722,6 +874,7 @@ mod tests {
let result = DrainResult::default();
assert_eq!(result.fetched, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.skipped_not_found, 0);
}
#[test]