feat(sync): concurrent drains, atomic watermarks, graceful Ctrl+C shutdown

Three fixes to the sync pipeline:

1. Atomic watermarks: wrap complete_job + update_watermark in a single
   SQLite transaction so a crash between them can't leave partial state.

2. Concurrent drain loops: prefetch HTTP requests via join_all (batch
   size = dependent_concurrency), then write serially to DB. Reduces
   ~9K sequential requests from ~19 min to ~2.4 min.

3. Graceful shutdown: install Ctrl+C handler via ShutdownSignal
   (Arc<AtomicBool>), thread through orchestrator/CLI, release locked
   jobs on interrupt, record sync_run as "failed".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-06 11:22:04 -05:00
parent 32783080f1
commit 405e5370dc
9 changed files with 536 additions and 92 deletions

View File

@@ -103,6 +103,28 @@ pub fn complete_job(conn: &Connection, job_id: i64) -> Result<()> {
Ok(())
}
/// Transaction-scoped variant of `complete_job`: issues the identical
/// DELETE, but against a caller-supplied transaction so that removing the
/// job can be committed atomically with other writes (e.g. a watermark
/// update).
pub fn complete_job_tx(tx: &rusqlite::Transaction<'_>, job_id: i64) -> Result<()> {
    const SQL: &str = "DELETE FROM pending_dependent_fetches WHERE id = ?1";
    tx.execute(SQL, [job_id])?;
    Ok(())
}
/// Clear `locked_at` on every currently locked job in the dependent-fetch
/// queue. Invoked during graceful shutdown so the next sync can reclaim
/// jobs immediately instead of waiting on stale locks.
///
/// Returns the number of rows that were unlocked.
pub fn release_all_locked_jobs(conn: &Connection) -> Result<usize> {
    const SQL: &str =
        "UPDATE pending_dependent_fetches SET locked_at = NULL WHERE locked_at IS NOT NULL";
    Ok(conn.execute(SQL, [])?)
}
pub fn fail_job(conn: &Connection, job_id: i64, error: &str) -> Result<()> {
let now = now_ms();
@@ -200,3 +222,109 @@ pub fn count_claimable_jobs(conn: &Connection, project_id: i64) -> Result<HashMa
Ok(counts)
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::*;
    use crate::core::db::{create_connection, run_migrations};

    /// Build an in-memory database seeded with one project and one queued
    /// `resource_events` job. Returns the open connection together with
    /// the row id of that job.
    fn setup_db_with_job() -> (Connection, i64) {
        let db = create_connection(Path::new(":memory:")).unwrap();
        run_migrations(&db).unwrap();

        db.execute(
            "INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url) \
             VALUES (1, 'group/repo', 'https://gitlab.com/group/repo')",
            [],
        )
        .unwrap();
        let project_id: i64 = db
            .query_row("SELECT id FROM projects LIMIT 1", [], |row| row.get(0))
            .unwrap();

        enqueue_job(&db, project_id, "issue", 42, 100, "resource_events", None).unwrap();
        let job_id: i64 = db
            .query_row(
                "SELECT id FROM pending_dependent_fetches LIMIT 1",
                [],
                |row| row.get(0),
            )
            .unwrap();

        (db, job_id)
    }

    /// Number of queue rows with this id (0 after delete, 1 otherwise).
    fn rows_with_id(db: &Connection, job_id: i64) -> i64 {
        db.query_row(
            "SELECT COUNT(*) FROM pending_dependent_fetches WHERE id = ?1",
            [job_id],
            |row| row.get(0),
        )
        .unwrap()
    }

    /// Whether the queue row currently carries a non-NULL `locked_at`.
    fn is_locked(db: &Connection, job_id: i64) -> bool {
        db.query_row(
            "SELECT locked_at IS NOT NULL FROM pending_dependent_fetches WHERE id = ?1",
            [job_id],
            |row| row.get(0),
        )
        .unwrap()
    }

    #[test]
    fn complete_job_tx_commits() {
        let (db, job_id) = setup_db_with_job();

        let tx = db.unchecked_transaction().unwrap();
        complete_job_tx(&tx, job_id).unwrap();
        tx.commit().unwrap();

        assert_eq!(
            rows_with_id(&db, job_id),
            0,
            "job should be deleted after commit"
        );
    }

    #[test]
    fn complete_job_tx_rollback() {
        let (db, job_id) = setup_db_with_job();

        {
            let tx = db.unchecked_transaction().unwrap();
            complete_job_tx(&tx, job_id).unwrap();
            // Dropping the transaction without committing rolls it back.
        }

        assert_eq!(
            rows_with_id(&db, job_id),
            1,
            "job should survive dropped (rolled-back) tx"
        );
    }

    #[test]
    fn release_all_locked_jobs_clears_locks() {
        let (db, _job_id) = setup_db_with_job();
        let project_id: i64 = db
            .query_row("SELECT id FROM projects LIMIT 1", [], |row| row.get(0))
            .unwrap();

        // Claiming stamps `locked_at`; that's the state shutdown must undo.
        let claimed = claim_jobs(&db, "resource_events", project_id, 10).unwrap();
        assert_eq!(claimed.len(), 1);
        assert!(
            is_locked(&db, claimed[0].id),
            "job should be locked after claim"
        );

        let released = release_all_locked_jobs(&db).unwrap();
        assert_eq!(released, 1);
        assert!(
            !is_locked(&db, claimed[0].id),
            "job should be unlocked after release_all"
        );
    }
}

View File

@@ -12,6 +12,7 @@ pub mod paths;
pub mod payloads;
pub mod project;
pub mod references;
pub mod shutdown;
pub mod sync_run;
pub mod time;
pub mod timeline;

63
src/core/shutdown.rs Normal file
View File

@@ -0,0 +1,63 @@
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
/// Cooperative cancellation token used for graceful shutdown.
///
/// Cheap to clone and cheap to poll from any thread or async task: every
/// clone shares the same `Arc<AtomicBool>`. Once `cancel()` fires
/// (typically from a Ctrl+C signal handler), `is_cancelled()` reports
/// `true` on all clones.
#[derive(Clone, Default)]
pub struct ShutdownSignal {
    // Shared flag: `false` until `cancel()` is called, then `true` forever.
    cancelled: Arc<AtomicBool>,
}

impl ShutdownSignal {
    /// Create a fresh, un-cancelled signal.
    pub fn new() -> Self {
        Self::default()
    }

    /// Mark the signal as cancelled; every clone will observe it.
    pub fn cancel(&self) {
        self.cancelled.store(true, Ordering::Relaxed);
    }

    /// Whether `cancel()` has been called on any clone of this signal.
    pub fn is_cancelled(&self) -> bool {
        self.cancelled.load(Ordering::Relaxed)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn signal_starts_uncancelled() {
        // A freshly built signal must not report cancellation.
        assert!(!ShutdownSignal::new().is_cancelled());
    }

    #[test]
    fn cancel_sets_flag() {
        let sig = ShutdownSignal::new();
        sig.cancel();
        assert!(sig.is_cancelled());
    }

    #[test]
    fn clone_propagates_cancellation() {
        let original = ShutdownSignal::new();
        let observer = original.clone();
        original.cancel();
        assert!(
            observer.is_cancelled(),
            "clone should see cancellation from original"
        );
    }
}