feat(sync): Instrument pipeline with tracing spans, run_id correlation, and metrics

Add end-to-end observability to the sync and ingest pipelines:

Sync command:
- Generate UUID-based run_id for each sync invocation, propagated through
  all child spans for log correlation across stages
- Accept MetricsLayer reference to extract hierarchical StageTiming data
  after pipeline completion for robot-mode performance output
- Record sync runs in DB via SyncRunRecorder (start/succeed/fail lifecycle)
- Wrap entire sync execution in a root tracing span with run_id field

Ingest command:
- Wrap run_ingest in an instrumented root span with run_id and resource_type
- Add project path prefix to discussion progress bars for multi-project clarity
- Reset resource_events_synced_for_updated_at on --full re-sync

Sync status:
- Expand from single last_run to configurable recent runs list (default 10)
- Parse and expose StageTiming metrics from stored metrics_json
- Add run_id, total_items_processed, total_errors to SyncRunInfo
- Add mr_count to DataSummary for complete entity coverage

Orchestrator:
- Add #[instrument] with structured fields to issue and MR ingestion functions
- Record items_processed, items_skipped, errors on span close for MetricsLayer
- Emit granular progress events (IssuesFetchStarted, IssuesFetchComplete)
- Pass project_id through to drain_resource_events for scoped job claiming

Document regenerator and embedding pipeline:
- Add #[instrument] spans with items_processed, items_skipped, errors fields
- Record final counts on span close for metrics extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-02-04 10:01:28 -05:00
parent 362503d3bf
commit f6d19a9467
6 changed files with 603 additions and 234 deletions

View File

@@ -7,11 +7,11 @@
use futures::future::join_all;
use rusqlite::Connection;
use tracing::{debug, info, warn};
use tracing::{debug, info, instrument, warn};
use crate::Config;
use crate::core::dependent_queue::{
claim_jobs, complete_job, count_pending_jobs, enqueue_job, fail_job, reclaim_stale_locks,
claim_jobs, complete_job, count_claimable_jobs, enqueue_job, fail_job, reclaim_stale_locks,
};
use crate::core::error::Result;
use crate::gitlab::GitLabClient;
@@ -108,6 +108,10 @@ pub async fn ingest_project_issues(
}
/// Ingest all issues and their discussions for a project with progress reporting.
#[instrument(
skip(conn, client, config, progress),
fields(project_id, gitlab_project_id, items_processed, items_skipped, errors)
)]
pub async fn ingest_project_issues_with_progress(
conn: &Connection,
client: &GitLabClient,
@@ -124,12 +128,17 @@ pub async fn ingest_project_issues_with_progress(
};
// Step 1: Ingest issues
emit(ProgressEvent::IssuesFetchStarted);
let issue_result = ingest_issues(conn, client, config, project_id, gitlab_project_id).await?;
result.issues_fetched = issue_result.fetched;
result.issues_upserted = issue_result.upserted;
result.labels_created = issue_result.labels_created;
emit(ProgressEvent::IssuesFetchComplete {
total: result.issues_fetched,
});
// Step 2: Sync discussions for issues that need it
let issues_needing_sync = issue_result.issues_needing_discussion_sync;
@@ -189,8 +198,15 @@ pub async fn ingest_project_issues_with_progress(
}
// Drain the queue
let drain_result =
drain_resource_events(conn, client, config, gitlab_project_id, &progress).await?;
let drain_result = drain_resource_events(
conn,
client,
config,
project_id,
gitlab_project_id,
&progress,
)
.await?;
result.resource_events_fetched = drain_result.fetched;
result.resource_events_failed = drain_result.failed;
}
@@ -208,6 +224,10 @@ pub async fn ingest_project_issues_with_progress(
"Project ingestion complete"
);
tracing::Span::current().record("items_processed", result.issues_upserted);
tracing::Span::current().record("items_skipped", result.issues_skipped_discussion_sync);
tracing::Span::current().record("errors", result.resource_events_failed);
Ok(result)
}
@@ -280,6 +300,10 @@ pub async fn ingest_project_merge_requests(
}
/// Ingest all merge requests and their discussions for a project with progress reporting.
#[instrument(
skip(conn, client, config, progress),
fields(project_id, gitlab_project_id, items_processed, items_skipped, errors)
)]
pub async fn ingest_project_merge_requests_with_progress(
conn: &Connection,
client: &GitLabClient,
@@ -380,8 +404,15 @@ pub async fn ingest_project_merge_requests_with_progress(
debug!(enqueued, "Enqueued resource events jobs for MRs");
}
let drain_result =
drain_resource_events(conn, client, config, gitlab_project_id, &progress).await?;
let drain_result = drain_resource_events(
conn,
client,
config,
project_id,
gitlab_project_id,
&progress,
)
.await?;
result.resource_events_fetched = drain_result.fetched;
result.resource_events_failed = drain_result.failed;
}
@@ -400,6 +431,10 @@ pub async fn ingest_project_merge_requests_with_progress(
"MR project ingestion complete"
);
tracing::Span::current().record("items_processed", result.mrs_upserted);
tracing::Span::current().record("items_skipped", result.mrs_skipped_discussion_sync);
tracing::Span::current().record("errors", result.resource_events_failed);
Ok(result)
}
@@ -455,6 +490,7 @@ async fn sync_mr_discussions_sequential(
pub struct DrainResult {
    /// Entities whose resource events were successfully fetched and stored
    /// (incremented after `complete_job` + watermark update).
    pub fetched: usize,
    /// Jobs that failed with a retryable error and were handed to `fail_job`
    /// for exponential-backoff retry.
    pub failed: usize,
    /// Jobs completed without events due to a permanent API error (e.g. 404 —
    /// the resource-events endpoint does not exist for the entity); the
    /// watermark is stamped so they are not re-enqueued next run.
    pub skipped_not_found: usize,
}
/// Enqueue resource_events jobs for all entities of a given type in a project.
@@ -466,21 +502,60 @@ fn enqueue_resource_events_for_entity_type(
project_id: i64,
entity_type: &str,
) -> Result<usize> {
// Query all entities for this project and enqueue resource_events jobs.
// The UNIQUE constraint on pending_dependent_fetches makes this idempotent -
// already-queued entities are silently skipped via INSERT OR IGNORE.
// Clean up obsolete jobs: remove resource_events jobs for entities whose
// watermark is already current (updated_at <= resource_events_synced_for_updated_at).
// These are leftover from prior runs that failed after watermark-stamping but
// before job deletion, or from entities that no longer need syncing.
// We intentionally keep jobs for entities that still need syncing (including
// in-progress or failed-with-backoff jobs) to preserve retry state.
match entity_type {
"issue" => {
conn.execute(
"DELETE FROM pending_dependent_fetches \
WHERE project_id = ?1 AND entity_type = 'issue' AND job_type = 'resource_events' \
AND entity_local_id IN ( \
SELECT id FROM issues \
WHERE project_id = ?1 \
AND updated_at <= COALESCE(resource_events_synced_for_updated_at, 0) \
)",
[project_id],
)?;
}
"merge_request" => {
conn.execute(
"DELETE FROM pending_dependent_fetches \
WHERE project_id = ?1 AND entity_type = 'merge_request' AND job_type = 'resource_events' \
AND entity_local_id IN ( \
SELECT id FROM merge_requests \
WHERE project_id = ?1 \
AND updated_at <= COALESCE(resource_events_synced_for_updated_at, 0) \
)",
[project_id],
)?;
}
_ => {}
}
// Enqueue resource_events jobs only for entities whose updated_at exceeds
// their last resource event sync watermark.
//
// Use separate hardcoded queries per entity type to avoid format!-based SQL.
let entities: Vec<(i64, i64)> = match entity_type {
"issue" => {
let mut stmt =
conn.prepare_cached("SELECT id, iid FROM issues WHERE project_id = ?1")?;
let mut stmt = conn.prepare_cached(
"SELECT id, iid FROM issues \
WHERE project_id = ?1 \
AND updated_at > COALESCE(resource_events_synced_for_updated_at, 0)",
)?;
stmt.query_map([project_id], |row| Ok((row.get(0)?, row.get(1)?)))?
.collect::<std::result::Result<Vec<_>, _>>()?
}
"merge_request" => {
let mut stmt =
conn.prepare_cached("SELECT id, iid FROM merge_requests WHERE project_id = ?1")?;
let mut stmt = conn.prepare_cached(
"SELECT id, iid FROM merge_requests \
WHERE project_id = ?1 \
AND updated_at > COALESCE(resource_events_synced_for_updated_at, 0)",
)?;
stmt.query_map([project_id], |row| Ok((row.get(0)?, row.get(1)?)))?
.collect::<std::result::Result<Vec<_>, _>>()?
}
@@ -509,10 +584,15 @@ fn enqueue_resource_events_for_entity_type(
///
/// Processes jobs sequentially since `rusqlite::Connection` is not `Send`.
/// Uses exponential backoff on failure via `fail_job`.
#[instrument(
skip(conn, client, config, progress),
fields(project_id, gitlab_project_id, items_processed, errors)
)]
async fn drain_resource_events(
conn: &Connection,
client: &GitLabClient,
config: &Config,
project_id: i64,
gitlab_project_id: i64,
progress: &Option<ProgressCallback>,
) -> Result<DrainResult> {
@@ -525,9 +605,15 @@ async fn drain_resource_events(
info!(reclaimed, "Reclaimed stale resource event locks");
}
// Count total pending jobs for progress reporting
let pending_counts = count_pending_jobs(conn)?;
let total_pending = pending_counts.get("resource_events").copied().unwrap_or(0);
// Count only claimable jobs (unlocked, past retry backoff) for accurate progress.
// Using count_pending_jobs here would inflate the total with locked/backing-off
// jobs that can't be claimed in this drain run, causing the progress bar to
// never reach 100%.
let claimable_counts = count_claimable_jobs(conn, project_id)?;
let total_pending = claimable_counts
.get("resource_events")
.copied()
.unwrap_or(0);
if total_pending == 0 {
return Ok(result);
@@ -547,15 +633,19 @@ async fn drain_resource_events(
let mut seen_job_ids = std::collections::HashSet::new();
loop {
let jobs = claim_jobs(conn, "resource_events", batch_size)?;
let jobs = claim_jobs(conn, "resource_events", project_id, batch_size)?;
if jobs.is_empty() {
break;
}
// Track whether any job in this batch was actually new. If every
// claimed job was already seen, break to avoid an infinite loop
// (can happen with clock skew or zero-backoff edge cases).
let mut any_new_in_batch = false;
for job in &jobs {
// Guard against re-processing a job that was failed and re-claimed
// within the same drain run (shouldn't happen due to backoff, but
// defensive against clock skew or zero-backoff edge cases).
// within the same drain run.
if !seen_job_ids.insert(job.id) {
warn!(
job_id = job.id,
@@ -563,6 +653,7 @@ async fn drain_resource_events(
);
continue;
}
any_new_in_batch = true;
match client
.fetch_all_resource_events(gitlab_project_id, &job.entity_type, job.entity_iid)
@@ -582,6 +673,11 @@ async fn drain_resource_events(
match store_result {
Ok(()) => {
complete_job(conn, job.id)?;
update_resource_event_watermark(
conn,
&job.entity_type,
job.entity_local_id,
)?;
result.fetched += 1;
}
Err(e) => {
@@ -597,14 +693,34 @@ async fn drain_resource_events(
}
}
Err(e) => {
warn!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
error = %e,
"Failed to fetch resource events from GitLab"
);
fail_job(conn, job.id, &e.to_string())?;
result.failed += 1;
// Only 404 (not found) is truly permanent -- the resource
// events endpoint doesn't exist for this entity. Stamp the
// watermark so we skip it next run. All other errors
// (403, auth, network) get backoff retry.
if e.is_permanent_api_error() {
debug!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
error = %e,
"Permanent API error for resource events, marking complete"
);
complete_job(conn, job.id)?;
update_resource_event_watermark(
conn,
&job.entity_type,
job.entity_local_id,
)?;
result.skipped_not_found += 1;
} else {
warn!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
error = %e,
"Failed to fetch resource events from GitLab"
);
fail_job(conn, job.id, &e.to_string())?;
result.failed += 1;
}
}
}
@@ -614,6 +730,12 @@ async fn drain_resource_events(
total: total_pending,
});
}
// If every job in this batch was already seen, stop to prevent spinning.
if !any_new_in_batch {
warn!("All claimed jobs were already processed, breaking drain loop");
break;
}
}
emit(ProgressEvent::ResourceEventsFetchComplete {
@@ -629,6 +751,9 @@ async fn drain_resource_events(
);
}
tracing::Span::current().record("items_processed", result.fetched);
tracing::Span::current().record("errors", result.failed);
Ok(result)
}
@@ -680,6 +805,33 @@ fn store_resource_events(
Ok(())
}
/// Stamp an entity's resource-event sync watermark after a successful fetch.
///
/// Copies `updated_at` into `resource_events_synced_for_updated_at` so the
/// entity is not re-enqueued until its `updated_at` advances again.
/// Unknown entity types are ignored (no-op), matching the enqueue-side match.
fn update_resource_event_watermark(
    conn: &Connection,
    entity_type: &str,
    entity_local_id: i64,
) -> Result<()> {
    // Pick the table-specific statement up front; bail out early for entity
    // types we don't track a watermark for.
    let sql = match entity_type {
        "issue" => {
            "UPDATE issues SET resource_events_synced_for_updated_at = updated_at WHERE id = ?"
        }
        "merge_request" => {
            "UPDATE merge_requests SET resource_events_synced_for_updated_at = updated_at WHERE id = ?"
        }
        _ => return Ok(()),
    };
    conn.execute(sql, [entity_local_id])?;
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
@@ -722,6 +874,7 @@ mod tests {
let result = DrainResult::default();
assert_eq!(result.fetched, 0);
assert_eq!(result.failed, 0);
assert_eq!(result.skipped_not_found, 0);
}
#[test]