feat(sync): concurrent drains, atomic watermarks, graceful Ctrl+C shutdown

Three fixes to the sync pipeline:

1. Atomic watermarks: wrap complete_job + update_watermark in a single
   SQLite transaction so a crash between them can't leave partial state.

2. Concurrent drain loops: prefetch HTTP requests via join_all (batch
   size = dependent_concurrency), then write serially to DB. Cuts the
   time for ~9K previously sequential requests from ~19 min to ~2.4 min.

3. Graceful shutdown: install Ctrl+C handler via ShutdownSignal
   (Arc<AtomicBool>), thread through orchestrator/CLI, release locked
   jobs on interrupt, record sync_run as "failed".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-06 11:22:04 -05:00
parent 32783080f1
commit 405e5370dc
9 changed files with 536 additions and 92 deletions

View File

@@ -30,10 +30,12 @@ use lore::cli::{
use lore::core::db::{
LATEST_SCHEMA_VERSION, create_connection, get_schema_version, run_migrations,
};
use lore::core::dependent_queue::release_all_locked_jobs;
use lore::core::error::{LoreError, RobotErrorOutput};
use lore::core::logging;
use lore::core::metrics::MetricsLayer;
use lore::core::paths::{get_config_path, get_db_path, get_log_dir};
use lore::core::shutdown::ShutdownSignal;
use lore::core::sync_run::SyncRunRecorder;
#[tokio::main]
@@ -658,6 +660,13 @@ async fn handle_ingest(
let run_id_short = &run_id[..8];
let recorder = SyncRunRecorder::start(&recorder_conn, &command, run_id_short)?;
let signal = ShutdownSignal::new();
let signal_for_handler = signal.clone();
tokio::spawn(async move {
let _ = tokio::signal::ctrl_c().await;
signal_for_handler.cancel();
});
let ingest_result: std::result::Result<(), Box<dyn std::error::Error>> = async {
match args.entity.as_deref() {
Some(resource_type) => {
@@ -670,6 +679,7 @@ async fn handle_ingest(
false,
display,
None,
&signal,
)
.await?;
@@ -697,6 +707,7 @@ async fn handle_ingest(
false,
display,
None,
&signal,
)
.await?;
@@ -709,6 +720,7 @@ async fn handle_ingest(
false,
display,
None,
&signal,
)
.await?;
@@ -725,6 +737,22 @@ async fn handle_ingest(
.await;
match ingest_result {
Ok(()) if signal.is_cancelled() => {
let stages = metrics.extract_timings();
let _ = release_all_locked_jobs(&recorder_conn);
let _ = recorder.fail(
&recorder_conn,
"Interrupted by user (Ctrl+C)",
Some(&stages),
);
if !robot_mode {
eprintln!(
"{}",
style("Interrupted by Ctrl+C. Partial data has been saved.").yellow()
);
}
Ok(())
}
Ok(()) => {
let stages = metrics.extract_timings();
let total_items: usize = stages.iter().map(|s| s.items_processed).sum();
@@ -734,6 +762,7 @@ async fn handle_ingest(
}
Err(e) => {
let stages = metrics.extract_timings();
let _ = release_all_locked_jobs(&recorder_conn);
let _ = recorder.fail(&recorder_conn, &e.to_string(), Some(&stages));
Err(e)
}
@@ -1521,7 +1550,8 @@ async fn handle_sync_cmd(
// For dry_run, skip recording and just show the preview
if dry_run {
run_sync(&config, options, None).await?;
let signal = ShutdownSignal::new();
run_sync(&config, options, None, &signal).await?;
return Ok(());
}
@@ -1531,8 +1561,43 @@ async fn handle_sync_cmd(
let run_id_short = &run_id[..8];
let recorder = SyncRunRecorder::start(&recorder_conn, "sync", run_id_short)?;
let signal = ShutdownSignal::new();
let signal_for_handler = signal.clone();
tokio::spawn(async move {
let _ = tokio::signal::ctrl_c().await;
signal_for_handler.cancel();
});
let start = std::time::Instant::now();
match run_sync(&config, options, Some(run_id_short)).await {
match run_sync(&config, options, Some(run_id_short), &signal).await {
Ok(result) if signal.is_cancelled() => {
let elapsed = start.elapsed();
let stages = metrics.extract_timings();
let released = release_all_locked_jobs(&recorder_conn).unwrap_or(0);
let _ = recorder.fail(
&recorder_conn,
"Interrupted by user (Ctrl+C)",
Some(&stages),
);
if robot_mode {
print_sync_json(&result, elapsed.as_millis() as u64, Some(metrics));
} else {
eprintln!();
eprintln!(
"{}",
console::style("Interrupted by Ctrl+C. Partial results:").yellow()
);
print_sync(&result, elapsed, Some(metrics));
if released > 0 {
eprintln!(
"{}",
console::style(format!("Released {released} locked jobs")).dim()
);
}
}
Ok(())
}
Ok(result) => {
let elapsed = start.elapsed();
let stages = metrics.extract_timings();
@@ -1552,6 +1617,7 @@ async fn handle_sync_cmd(
}
Err(e) => {
let stages = metrics.extract_timings();
let _ = release_all_locked_jobs(&recorder_conn);
let _ = recorder.fail(&recorder_conn, &e.to_string(), Some(&stages));
Err(e.into())
}