feat(sync): concurrent drains, atomic watermarks, graceful Ctrl+C shutdown

Three fixes to the sync pipeline:

1. Atomic watermarks: wrap complete_job + update_watermark in a single
   SQLite transaction so crash between them can't leave partial state.

2. Concurrent drain loops: prefetch HTTP requests via join_all (batch
   size = dependent_concurrency), then write serially to DB. Reduces
   ~9K sequential requests from ~19 min to ~2.4 min.

3. Graceful shutdown: install Ctrl+C handler via ShutdownSignal
   (Arc<AtomicBool>), thread through orchestrator/CLI, release locked
   jobs on interrupt, record sync_run as "failed".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-06 11:22:04 -05:00
parent 32783080f1
commit 405e5370dc
9 changed files with 536 additions and 92 deletions

View File

@@ -4,12 +4,13 @@ use tracing::{debug, info, instrument, warn};
use crate::Config;
use crate::core::dependent_queue::{
claim_jobs, complete_job, count_claimable_jobs, enqueue_job, fail_job, reclaim_stale_locks,
claim_jobs, complete_job_tx, count_claimable_jobs, enqueue_job, fail_job, reclaim_stale_locks,
};
use crate::core::error::Result;
use crate::core::references::{
EntityReference, insert_entity_reference, resolve_issue_local_id, resolve_project_path,
};
use crate::core::shutdown::ShutdownSignal;
use crate::gitlab::GitLabClient;
use super::discussions::ingest_issue_discussions;
@@ -84,12 +85,21 @@ pub async fn ingest_project_issues(
project_id: i64,
gitlab_project_id: i64,
) -> Result<IngestProjectResult> {
ingest_project_issues_with_progress(conn, client, config, project_id, gitlab_project_id, None)
.await
let signal = ShutdownSignal::new();
ingest_project_issues_with_progress(
conn,
client,
config,
project_id,
gitlab_project_id,
None,
&signal,
)
.await
}
#[instrument(
skip(conn, client, config, progress),
skip(conn, client, config, progress, signal),
fields(project_id, gitlab_project_id, items_processed, items_skipped, errors)
)]
pub async fn ingest_project_issues_with_progress(
@@ -99,6 +109,7 @@ pub async fn ingest_project_issues_with_progress(
project_id: i64,
gitlab_project_id: i64,
progress: Option<ProgressCallback>,
signal: &ShutdownSignal,
) -> Result<IngestProjectResult> {
let mut result = IngestProjectResult::default();
let emit = |event: ProgressEvent| {
@@ -130,6 +141,11 @@ pub async fn ingest_project_issues_with_progress(
let total_issues = total_issues as usize;
result.issues_skipped_discussion_sync = total_issues.saturating_sub(issues_needing_sync.len());
if signal.is_cancelled() {
info!("Shutdown requested, returning partial issue results");
return Ok(result);
}
if issues_needing_sync.is_empty() {
info!("No issues need discussion sync");
} else {
@@ -150,6 +166,7 @@ pub async fn ingest_project_issues_with_progress(
project_id,
&issues_needing_sync,
&progress,
signal,
)
.await?;
@@ -163,6 +180,11 @@ pub async fn ingest_project_issues_with_progress(
}
}
if signal.is_cancelled() {
info!("Shutdown requested, returning partial issue results");
return Ok(result);
}
if config.sync.fetch_resource_events {
let enqueued = enqueue_resource_events_for_entity_type(conn, project_id, "issue")?;
if enqueued > 0 {
@@ -176,6 +198,7 @@ pub async fn ingest_project_issues_with_progress(
project_id,
gitlab_project_id,
&progress,
signal,
)
.await?;
result.resource_events_fetched = drain_result.fetched;
@@ -211,6 +234,7 @@ pub async fn ingest_project_issues_with_progress(
Ok(result)
}
#[allow(clippy::too_many_arguments)]
async fn sync_discussions_sequential(
conn: &Connection,
client: &GitLabClient,
@@ -219,6 +243,7 @@ async fn sync_discussions_sequential(
local_project_id: i64,
issues: &[IssueForDiscussionSync],
progress: &Option<ProgressCallback>,
signal: &ShutdownSignal,
) -> Result<Vec<super::discussions::IngestDiscussionsResult>> {
let batch_size = config.sync.dependent_concurrency as usize;
let total = issues.len();
@@ -226,6 +251,10 @@ async fn sync_discussions_sequential(
let mut results = Vec::with_capacity(issues.len());
for chunk in issues.chunks(batch_size) {
if signal.is_cancelled() {
info!("Shutdown requested during discussion sync, returning partial results");
break;
}
for issue in chunk {
let disc_result = ingest_issue_discussions(
conn,
@@ -258,6 +287,7 @@ pub async fn ingest_project_merge_requests(
gitlab_project_id: i64,
full_sync: bool,
) -> Result<IngestMrProjectResult> {
let signal = ShutdownSignal::new();
ingest_project_merge_requests_with_progress(
conn,
client,
@@ -266,14 +296,16 @@ pub async fn ingest_project_merge_requests(
gitlab_project_id,
full_sync,
None,
&signal,
)
.await
}
#[instrument(
skip(conn, client, config, progress),
skip(conn, client, config, progress, signal),
fields(project_id, gitlab_project_id, items_processed, items_skipped, errors)
)]
#[allow(clippy::too_many_arguments)]
pub async fn ingest_project_merge_requests_with_progress(
conn: &Connection,
client: &GitLabClient,
@@ -282,6 +314,7 @@ pub async fn ingest_project_merge_requests_with_progress(
gitlab_project_id: i64,
full_sync: bool,
progress: Option<ProgressCallback>,
signal: &ShutdownSignal,
) -> Result<IngestMrProjectResult> {
let mut result = IngestMrProjectResult::default();
let emit = |event: ProgressEvent| {
@@ -323,6 +356,11 @@ pub async fn ingest_project_merge_requests_with_progress(
let total_mrs = total_mrs as usize;
result.mrs_skipped_discussion_sync = total_mrs.saturating_sub(mrs_needing_sync.len());
if signal.is_cancelled() {
info!("Shutdown requested, returning partial MR results");
return Ok(result);
}
if mrs_needing_sync.is_empty() {
info!("No MRs need discussion sync");
} else {
@@ -343,6 +381,7 @@ pub async fn ingest_project_merge_requests_with_progress(
project_id,
&mrs_needing_sync,
&progress,
signal,
)
.await?;
@@ -360,6 +399,11 @@ pub async fn ingest_project_merge_requests_with_progress(
}
}
if signal.is_cancelled() {
info!("Shutdown requested, returning partial MR results");
return Ok(result);
}
if config.sync.fetch_resource_events {
let enqueued = enqueue_resource_events_for_entity_type(conn, project_id, "merge_request")?;
if enqueued > 0 {
@@ -373,6 +417,7 @@ pub async fn ingest_project_merge_requests_with_progress(
project_id,
gitlab_project_id,
&progress,
signal,
)
.await?;
result.resource_events_fetched = drain_result.fetched;
@@ -388,6 +433,11 @@ pub async fn ingest_project_merge_requests_with_progress(
}
}
if signal.is_cancelled() {
info!("Shutdown requested, returning partial MR results");
return Ok(result);
}
let note_refs = crate::core::note_parser::extract_refs_from_system_notes(conn, project_id)?;
if note_refs.inserted > 0 || note_refs.skipped_unresolvable > 0 {
debug!(
@@ -411,6 +461,7 @@ pub async fn ingest_project_merge_requests_with_progress(
project_id,
gitlab_project_id,
&progress,
signal,
)
.await?;
result.closes_issues_fetched = closes_result.fetched;
@@ -440,6 +491,7 @@ pub async fn ingest_project_merge_requests_with_progress(
Ok(result)
}
#[allow(clippy::too_many_arguments)]
async fn sync_mr_discussions_sequential(
conn: &Connection,
client: &GitLabClient,
@@ -448,6 +500,7 @@ async fn sync_mr_discussions_sequential(
local_project_id: i64,
mrs: &[MrForDiscussionSync],
progress: &Option<ProgressCallback>,
signal: &ShutdownSignal,
) -> Result<Vec<super::mr_discussions::IngestMrDiscussionsResult>> {
let batch_size = config.sync.dependent_concurrency as usize;
let total = mrs.len();
@@ -456,6 +509,10 @@ async fn sync_mr_discussions_sequential(
let mut processed = 0;
for chunk in mrs.chunks(batch_size) {
if signal.is_cancelled() {
info!("Shutdown requested during MR discussion sync, returning partial results");
break;
}
let prefetch_futures = chunk.iter().map(|mr| {
prefetch_mr_discussions(client, gitlab_project_id, local_project_id, mr.clone())
});
@@ -559,8 +616,48 @@ fn enqueue_resource_events_for_entity_type(
Ok(enqueued)
}
/// Result of a concurrent HTTP prefetch for resource events.
///
/// Bundles the identifying fields of a dependent-queue job together with the
/// outcome of `fetch_all_resource_events`, so the serial DB-write phase can
/// store events and complete/fail the job without re-reading it.
#[allow(clippy::type_complexity)]
struct PrefetchedResourceEvents {
    // Dependent-queue job row id; used to complete or fail the job afterwards.
    job_id: i64,
    // Local project id the events belong to (passed through to store_resource_events).
    project_id: i64,
    // Either "issue" or "merge_request" — selects the watermark table.
    entity_type: String,
    // Entity iid on GitLab; kept for logging on store/fetch failures.
    entity_iid: i64,
    // Local row id of the entity; target of the watermark update.
    entity_local_id: i64,
    // HTTP outcome: (state, label, milestone) event lists, or the API error.
    result: std::result::Result<
        (
            Vec<crate::gitlab::types::GitLabStateEvent>,
            Vec<crate::gitlab::types::GitLabLabelEvent>,
            Vec<crate::gitlab::types::GitLabMilestoneEvent>,
        ),
        crate::core::error::LoreError,
    >,
}
/// Fetch all resource events (state/label/milestone) for one entity from the
/// GitLab API, carrying the job's identifying fields alongside the HTTP result.
///
/// Performs no DB work: the drain loop awaits many of these concurrently via
/// `join_all`, then applies the results serially in its write phase.
async fn prefetch_resource_events(
    client: &GitLabClient,
    gitlab_project_id: i64,
    job_id: i64,
    project_id: i64,
    entity_type: String,
    entity_iid: i64,
    entity_local_id: i64,
) -> PrefetchedResourceEvents {
    // The error is captured in `result` rather than propagated, so one failed
    // fetch cannot abort the rest of the concurrent batch.
    let result = client
        .fetch_all_resource_events(gitlab_project_id, &entity_type, entity_iid)
        .await;
    PrefetchedResourceEvents {
        job_id,
        project_id,
        entity_type,
        entity_iid,
        entity_local_id,
        result,
    }
}
#[instrument(
skip(conn, client, config, progress),
skip(conn, client, config, progress, signal),
fields(project_id, gitlab_project_id, items_processed, errors)
)]
async fn drain_resource_events(
@@ -570,6 +667,7 @@ async fn drain_resource_events(
project_id: i64,
gitlab_project_id: i64,
progress: &Option<ProgressCallback>,
signal: &ShutdownSignal,
) -> Result<DrainResult> {
let mut result = DrainResult::default();
let batch_size = config.sync.dependent_concurrency as usize;
@@ -603,33 +701,49 @@ async fn drain_resource_events(
let mut seen_job_ids = std::collections::HashSet::new();
loop {
if signal.is_cancelled() {
info!("Shutdown requested during resource events drain, returning partial results");
break;
}
let jobs = claim_jobs(conn, "resource_events", project_id, batch_size)?;
if jobs.is_empty() {
break;
}
let mut any_new_in_batch = false;
// Phase 1: Concurrent HTTP fetches
let futures: Vec<_> = jobs
.iter()
.filter(|j| seen_job_ids.insert(j.id))
.map(|j| {
prefetch_resource_events(
client,
gitlab_project_id,
j.id,
j.project_id,
j.entity_type.clone(),
j.entity_iid,
j.entity_local_id,
)
})
.collect();
for job in &jobs {
if !seen_job_ids.insert(job.id) {
warn!(
job_id = job.id,
"Skipping already-processed job in same drain run"
);
continue;
}
any_new_in_batch = true;
if futures.is_empty() {
warn!("All claimed jobs were already processed, breaking drain loop");
break;
}
match client
.fetch_all_resource_events(gitlab_project_id, &job.entity_type, job.entity_iid)
.await
{
let prefetched = join_all(futures).await;
// Phase 2: Serial DB writes
for p in prefetched {
match p.result {
Ok((state_events, label_events, milestone_events)) => {
let store_result = store_resource_events(
conn,
job.project_id,
&job.entity_type,
job.entity_local_id,
p.project_id,
&p.entity_type,
p.entity_local_id,
&state_events,
&label_events,
&milestone_events,
@@ -637,22 +751,24 @@ async fn drain_resource_events(
match store_result {
Ok(()) => {
complete_job(conn, job.id)?;
update_resource_event_watermark(
conn,
&job.entity_type,
job.entity_local_id,
let tx = conn.unchecked_transaction()?;
complete_job_tx(&tx, p.job_id)?;
update_resource_event_watermark_tx(
&tx,
&p.entity_type,
p.entity_local_id,
)?;
tx.commit()?;
result.fetched += 1;
}
Err(e) => {
warn!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
entity_type = %p.entity_type,
entity_iid = p.entity_iid,
error = %e,
"Failed to store resource events"
);
fail_job(conn, job.id, &e.to_string())?;
fail_job(conn, p.job_id, &e.to_string())?;
result.failed += 1;
}
}
@@ -660,26 +776,24 @@ async fn drain_resource_events(
Err(e) => {
if e.is_permanent_api_error() {
debug!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
entity_type = %p.entity_type,
entity_iid = p.entity_iid,
error = %e,
"Permanent API error for resource events, marking complete"
);
complete_job(conn, job.id)?;
update_resource_event_watermark(
conn,
&job.entity_type,
job.entity_local_id,
)?;
let tx = conn.unchecked_transaction()?;
complete_job_tx(&tx, p.job_id)?;
update_resource_event_watermark_tx(&tx, &p.entity_type, p.entity_local_id)?;
tx.commit()?;
result.skipped_not_found += 1;
} else {
warn!(
entity_type = %job.entity_type,
entity_iid = job.entity_iid,
entity_type = %p.entity_type,
entity_iid = p.entity_iid,
error = %e,
"Failed to fetch resource events from GitLab"
);
fail_job(conn, job.id, &e.to_string())?;
fail_job(conn, p.job_id, &e.to_string())?;
result.failed += 1;
}
}
@@ -691,11 +805,6 @@ async fn drain_resource_events(
total: total_pending,
});
}
if !any_new_in_batch {
warn!("All claimed jobs were already processed, breaking drain loop");
break;
}
}
emit(ProgressEvent::ResourceEventsFetchComplete {
@@ -762,20 +871,31 @@ fn store_resource_events(
Ok(())
}
fn update_resource_event_watermark(
conn: &Connection,
/// Mark an MR's closes-issues data as synced by copying `updated_at` into the
/// watermark column, using the caller's transaction so the watermark commits
/// atomically with `complete_job_tx` (no partial state on crash).
fn update_closes_issues_watermark_tx(
    tx: &rusqlite::Transaction<'_>,
    mr_local_id: i64,
) -> Result<()> {
    tx.execute(
        "UPDATE merge_requests SET closes_issues_synced_for_updated_at = updated_at WHERE id = ?",
        [mr_local_id],
    )?;
    Ok(())
}
fn update_resource_event_watermark_tx(
tx: &rusqlite::Transaction<'_>,
entity_type: &str,
entity_local_id: i64,
) -> Result<()> {
match entity_type {
"issue" => {
conn.execute(
tx.execute(
"UPDATE issues SET resource_events_synced_for_updated_at = updated_at WHERE id = ?",
[entity_local_id],
)?;
}
"merge_request" => {
conn.execute(
tx.execute(
"UPDATE merge_requests SET resource_events_synced_for_updated_at = updated_at WHERE id = ?",
[entity_local_id],
)?;
@@ -785,14 +905,6 @@ fn update_resource_event_watermark(
Ok(())
}
fn update_closes_issues_watermark(conn: &Connection, mr_local_id: i64) -> Result<()> {
conn.execute(
"UPDATE merge_requests SET closes_issues_synced_for_updated_at = updated_at WHERE id = ?",
[mr_local_id],
)?;
Ok(())
}
fn enqueue_mr_closes_issues_jobs(conn: &Connection, project_id: i64) -> Result<usize> {
// Remove stale jobs for MRs that haven't changed since their last closes_issues sync
conn.execute(
@@ -833,8 +945,37 @@ fn enqueue_mr_closes_issues_jobs(conn: &Connection, project_id: i64) -> Result<u
Ok(enqueued)
}
/// Result of a concurrent HTTP prefetch for closes-issues references.
///
/// Pairs a dependent-queue job's identifiers with the outcome of
/// `fetch_mr_closes_issues`, so the serial write phase can store the
/// references and complete/fail the job.
struct PrefetchedClosesIssues {
    // Dependent-queue job row id; used to complete or fail the job afterwards.
    job_id: i64,
    // MR iid on GitLab; kept for logging on store/fetch failures.
    entity_iid: i64,
    // Local merge_requests row id; target of the closes-issues watermark.
    entity_local_id: i64,
    // HTTP outcome: the issues this MR closes, or the API error.
    result: std::result::Result<
        Vec<crate::gitlab::types::GitLabIssueRef>,
        crate::core::error::LoreError,
    >,
}
/// Fetch the issues closed by one merge request from the GitLab API, carrying
/// the job's identifying fields alongside the HTTP result.
///
/// Performs no DB work: the drain loop awaits many of these concurrently via
/// `join_all`, then applies the results serially in its write phase.
async fn prefetch_closes_issues(
    client: &GitLabClient,
    gitlab_project_id: i64,
    job_id: i64,
    entity_iid: i64,
    entity_local_id: i64,
) -> PrefetchedClosesIssues {
    // The error is captured in `result` rather than propagated, so one failed
    // fetch cannot abort the rest of the concurrent batch.
    let result = client
        .fetch_mr_closes_issues(gitlab_project_id, entity_iid)
        .await;
    PrefetchedClosesIssues {
        job_id,
        entity_iid,
        entity_local_id,
        result,
    }
}
#[instrument(
skip(conn, client, config, progress),
skip(conn, client, config, progress, signal),
fields(project_id, gitlab_project_id, items_processed, errors)
)]
async fn drain_mr_closes_issues(
@@ -844,6 +985,7 @@ async fn drain_mr_closes_issues(
project_id: i64,
gitlab_project_id: i64,
progress: &Option<ProgressCallback>,
signal: &ShutdownSignal,
) -> Result<DrainResult> {
let mut result = DrainResult::default();
let batch_size = config.sync.dependent_concurrency as usize;
@@ -877,48 +1019,64 @@ async fn drain_mr_closes_issues(
let mut seen_job_ids = std::collections::HashSet::new();
loop {
if signal.is_cancelled() {
info!("Shutdown requested during closes_issues drain, returning partial results");
break;
}
let jobs = claim_jobs(conn, "mr_closes_issues", project_id, batch_size)?;
if jobs.is_empty() {
break;
}
let mut any_new_in_batch = false;
// Phase 1: Concurrent HTTP fetches
let futures: Vec<_> = jobs
.iter()
.filter(|j| seen_job_ids.insert(j.id))
.map(|j| {
prefetch_closes_issues(
client,
gitlab_project_id,
j.id,
j.entity_iid,
j.entity_local_id,
)
})
.collect();
for job in &jobs {
if !seen_job_ids.insert(job.id) {
warn!(
job_id = job.id,
"Skipping already-processed mr_closes_issues job"
);
continue;
}
any_new_in_batch = true;
if futures.is_empty() {
warn!("All claimed mr_closes_issues jobs were already processed, breaking drain loop");
break;
}
match client
.fetch_mr_closes_issues(gitlab_project_id, job.entity_iid)
.await
{
let prefetched = join_all(futures).await;
// Phase 2: Serial DB writes
for p in prefetched {
match p.result {
Ok(closes_issues) => {
let store_result = store_closes_issues_refs(
conn,
project_id,
job.entity_local_id,
p.entity_local_id,
&closes_issues,
);
match store_result {
Ok(()) => {
complete_job(conn, job.id)?;
update_closes_issues_watermark(conn, job.entity_local_id)?;
let tx = conn.unchecked_transaction()?;
complete_job_tx(&tx, p.job_id)?;
update_closes_issues_watermark_tx(&tx, p.entity_local_id)?;
tx.commit()?;
result.fetched += 1;
}
Err(e) => {
warn!(
entity_iid = job.entity_iid,
entity_iid = p.entity_iid,
error = %e,
"Failed to store closes_issues references"
);
fail_job(conn, job.id, &e.to_string())?;
fail_job(conn, p.job_id, &e.to_string())?;
result.failed += 1;
}
}
@@ -926,20 +1084,22 @@ async fn drain_mr_closes_issues(
Err(e) => {
if e.is_permanent_api_error() {
debug!(
entity_iid = job.entity_iid,
entity_iid = p.entity_iid,
error = %e,
"Permanent API error for closes_issues, marking complete"
);
complete_job(conn, job.id)?;
update_closes_issues_watermark(conn, job.entity_local_id)?;
let tx = conn.unchecked_transaction()?;
complete_job_tx(&tx, p.job_id)?;
update_closes_issues_watermark_tx(&tx, p.entity_local_id)?;
tx.commit()?;
result.skipped_not_found += 1;
} else {
warn!(
entity_iid = job.entity_iid,
entity_iid = p.entity_iid,
error = %e,
"Failed to fetch closes_issues from GitLab"
);
fail_job(conn, job.id, &e.to_string())?;
fail_job(conn, p.job_id, &e.to_string())?;
result.failed += 1;
}
}
@@ -951,11 +1111,6 @@ async fn drain_mr_closes_issues(
total: total_pending,
});
}
if !any_new_in_batch {
warn!("All claimed mr_closes_issues jobs were already processed, breaking drain loop");
break;
}
}
emit(ProgressEvent::ClosesIssuesFetchComplete {