feat(sync): concurrent drains, atomic watermarks, graceful Ctrl+C shutdown

Three fixes to the sync pipeline:

1. Atomic watermarks: wrap complete_job + update_watermark in a single
   SQLite transaction so a crash between them can't leave partial state.

2. Concurrent drain loops: prefetch HTTP requests via join_all (batch
   size = dependent_concurrency), then write serially to DB. Reduces
   ~9K sequential requests from ~19 min to ~2.4 min.

3. Graceful shutdown: install Ctrl+C handler via ShutdownSignal
   (Arc<AtomicBool>), thread through orchestrator/CLI, release locked
   jobs on interrupt, record sync_run as "failed".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-06 11:22:04 -05:00
parent 32783080f1
commit 405e5370dc
9 changed files with 536 additions and 92 deletions

View File

@@ -103,6 +103,28 @@ pub fn complete_job(conn: &Connection, job_id: i64) -> Result<()> {
Ok(())
}
/// Transaction-scoped variant of `complete_job`: issues the identical
/// DELETE, but against a caller-supplied transaction so that removing the
/// job can be committed atomically with other writes (e.g. a watermark
/// update).
pub fn complete_job_tx(tx: &rusqlite::Transaction<'_>, job_id: i64) -> Result<()> {
    const SQL: &str = "DELETE FROM pending_dependent_fetches WHERE id = ?1";
    tx.execute(SQL, [job_id])?;
    Ok(())
}
/// Clear `locked_at` on every currently locked job in the dependent-fetch
/// queue. Invoked during graceful shutdown so the next sync can reclaim
/// jobs immediately instead of waiting on stale locks.
///
/// Returns the number of rows that were unlocked.
pub fn release_all_locked_jobs(conn: &Connection) -> Result<usize> {
    const SQL: &str =
        "UPDATE pending_dependent_fetches SET locked_at = NULL WHERE locked_at IS NOT NULL";
    Ok(conn.execute(SQL, [])?)
}
pub fn fail_job(conn: &Connection, job_id: i64, error: &str) -> Result<()> {
let now = now_ms();
@@ -200,3 +222,109 @@ pub fn count_claimable_jobs(conn: &Connection, project_id: i64) -> Result<HashMa
Ok(counts)
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::*;
    use crate::core::db::{create_connection, run_migrations};

    /// Build an in-memory database seeded with one project and one queued
    /// `resource_events` job. Returns the open connection together with
    /// the row id of that job.
    fn setup_db_with_job() -> (Connection, i64) {
        let db = create_connection(Path::new(":memory:")).unwrap();
        run_migrations(&db).unwrap();

        db.execute(
            "INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url) \
             VALUES (1, 'group/repo', 'https://gitlab.com/group/repo')",
            [],
        )
        .unwrap();
        let project_id: i64 = db
            .query_row("SELECT id FROM projects LIMIT 1", [], |row| row.get(0))
            .unwrap();

        enqueue_job(&db, project_id, "issue", 42, 100, "resource_events", None).unwrap();
        let job_id: i64 = db
            .query_row(
                "SELECT id FROM pending_dependent_fetches LIMIT 1",
                [],
                |row| row.get(0),
            )
            .unwrap();

        (db, job_id)
    }

    /// Number of queue rows with this id (0 after delete, 1 otherwise).
    fn rows_with_id(db: &Connection, job_id: i64) -> i64 {
        db.query_row(
            "SELECT COUNT(*) FROM pending_dependent_fetches WHERE id = ?1",
            [job_id],
            |row| row.get(0),
        )
        .unwrap()
    }

    /// Whether the queue row currently carries a non-NULL `locked_at`.
    fn is_locked(db: &Connection, job_id: i64) -> bool {
        db.query_row(
            "SELECT locked_at IS NOT NULL FROM pending_dependent_fetches WHERE id = ?1",
            [job_id],
            |row| row.get(0),
        )
        .unwrap()
    }

    #[test]
    fn complete_job_tx_commits() {
        let (db, job_id) = setup_db_with_job();

        let tx = db.unchecked_transaction().unwrap();
        complete_job_tx(&tx, job_id).unwrap();
        tx.commit().unwrap();

        assert_eq!(
            rows_with_id(&db, job_id),
            0,
            "job should be deleted after commit"
        );
    }

    #[test]
    fn complete_job_tx_rollback() {
        let (db, job_id) = setup_db_with_job();

        {
            let tx = db.unchecked_transaction().unwrap();
            complete_job_tx(&tx, job_id).unwrap();
            // Dropping the transaction without committing rolls it back.
        }

        assert_eq!(
            rows_with_id(&db, job_id),
            1,
            "job should survive dropped (rolled-back) tx"
        );
    }

    #[test]
    fn release_all_locked_jobs_clears_locks() {
        let (db, _job_id) = setup_db_with_job();
        let project_id: i64 = db
            .query_row("SELECT id FROM projects LIMIT 1", [], |row| row.get(0))
            .unwrap();

        // Claiming stamps `locked_at`; that's the state shutdown must undo.
        let claimed = claim_jobs(&db, "resource_events", project_id, 10).unwrap();
        assert_eq!(claimed.len(), 1);
        assert!(
            is_locked(&db, claimed[0].id),
            "job should be locked after claim"
        );

        let released = release_all_locked_jobs(&db).unwrap();
        assert_eq!(released, 1);
        assert!(
            !is_locked(&db, claimed[0].id),
            "job should be unlocked after release_all"
        );
    }
}

View File

@@ -12,6 +12,7 @@ pub mod paths;
pub mod payloads;
pub mod project;
pub mod references;
pub mod shutdown;
pub mod sync_run;
pub mod time;
pub mod timeline;

63
src/core/shutdown.rs Normal file
View File

@@ -0,0 +1,63 @@
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
/// Cooperative cancellation token used for graceful shutdown.
///
/// Cheap to clone and cheap to poll from any thread or async task: every
/// clone shares the same `Arc<AtomicBool>`. Once `cancel()` fires
/// (typically from a Ctrl+C signal handler), `is_cancelled()` reports
/// `true` on all clones.
#[derive(Clone, Default)]
pub struct ShutdownSignal {
    // Shared flag: `false` until `cancel()` is called, then `true` forever.
    cancelled: Arc<AtomicBool>,
}

impl ShutdownSignal {
    /// Create a fresh, un-cancelled signal.
    pub fn new() -> Self {
        Self::default()
    }

    /// Mark the signal as cancelled; every clone will observe it.
    pub fn cancel(&self) {
        self.cancelled.store(true, Ordering::Relaxed);
    }

    /// Whether `cancel()` has been called on any clone of this signal.
    pub fn is_cancelled(&self) -> bool {
        self.cancelled.load(Ordering::Relaxed)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn signal_starts_uncancelled() {
        // A freshly built signal must not report cancellation.
        assert!(!ShutdownSignal::new().is_cancelled());
    }

    #[test]
    fn cancel_sets_flag() {
        let sig = ShutdownSignal::new();
        sig.cancel();
        assert!(sig.is_cancelled());
    }

    #[test]
    fn clone_propagates_cancellation() {
        let original = ShutdownSignal::new();
        let observer = original.clone();
        original.cancel();
        assert!(
            observer.is_cancelled(),
            "clone should see cancellation from original"
        );
    }
}