feat(surgical-sync): add per-IID surgical sync pipeline with preflight validation

Add the ability to sync specific issues or merge requests by IID without
running a full incremental sync. This enables fast, targeted data refresh
for individual entities — useful for agent workflows, debugging, and
real-time investigation of specific issues or MRs.

Architecture:
- New CLI flags: --issue <IID> and --mr <IID> (repeatable, up to 100 total)
  scoped to a single project via -p/--project
- Preflight phase validates all IIDs exist on GitLab before any DB writes,
  with TOCTOU-aware soft verification at ingest time
- 6-stage pipeline: preflight -> fetch -> ingest -> dependents -> docs -> embed
- Each stage is cancellation-aware via ShutdownSignal
- Dedicated SyncRunRecorder extensions track surgical-specific counters
  (issues_fetched, mrs_ingested, docs_regenerated, etc.)

New modules:
- src/ingestion/surgical.rs: Core surgical fetch/ingest/dependent logic
  with preflight_fetch(), ingest_issue_by_iid(), ingest_mr_by_iid(),
  and fetch_dependents_for_{issue,mr}()
- src/cli/commands/sync_surgical.rs: Full CLI orchestrator with progress
  spinners, human/robot output, and cancellation handling
- src/embedding/pipeline.rs: embed_documents_by_ids() for scoped embedding
- src/documents/regenerator.rs: regenerate_dirty_documents_for_sources()
  for scoped document regeneration

Database changes:
- Migration 027: Extends sync_runs with mode, phase, surgical_iids_json,
  per-entity counters, and cancelled_at column
- New indexes: idx_sync_runs_mode_started, idx_sync_runs_status_phase_started

GitLab client:
- get_issue_by_iid() and get_mr_by_iid() single-entity fetch methods

Error handling:
- New SurgicalPreflightFailed error variant with entity_type, iid, project,
  and reason fields. Shares exit code 6 with GitLabNotFound.

Includes comprehensive test coverage:
- 645 lines of surgical ingestion tests (wiremock-based)
- 184 lines of scoped embedding tests
- 85 lines of scoped regeneration tests
- 113 lines of GitLab client single-entity tests
- 236 lines of sync_run surgical column/counter tests
- Unit tests for SyncOptions, error codes, and CLI validation
This commit is contained in:
teernisse
2026-02-18 16:27:59 -05:00
parent ea6e45e43f
commit 9ec1344945
25 changed files with 3354 additions and 37 deletions

View File

@@ -89,6 +89,10 @@ const MIGRATIONS: &[(&str, &str)] = &[
"026",
include_str!("../../migrations/026_scoring_indexes.sql"),
),
(
"027",
include_str!("../../migrations/027_surgical_sync_runs.sql"),
),
];
pub fn create_connection(db_path: &Path) -> Result<Connection> {

View File

@@ -21,6 +21,7 @@ pub enum ErrorCode {
EmbeddingFailed,
NotFound,
Ambiguous,
SurgicalPreflightFailed,
}
impl std::fmt::Display for ErrorCode {
@@ -44,6 +45,7 @@ impl std::fmt::Display for ErrorCode {
Self::EmbeddingFailed => "EMBEDDING_FAILED",
Self::NotFound => "NOT_FOUND",
Self::Ambiguous => "AMBIGUOUS",
Self::SurgicalPreflightFailed => "SURGICAL_PREFLIGHT_FAILED",
};
write!(f, "{code}")
}
@@ -70,6 +72,9 @@ impl ErrorCode {
Self::EmbeddingFailed => 16,
Self::NotFound => 17,
Self::Ambiguous => 18,
// Shares exit code 6 with GitLabNotFound — same semantic category (resource not found).
// Robot consumers distinguish via ErrorCode string, not exit code.
Self::SurgicalPreflightFailed => 6,
}
}
}
@@ -111,7 +116,7 @@ pub enum LoreError {
source: Option<rusqlite::Error>,
},
#[error("GitLab token not set. Export {env_var} environment variable.")]
#[error("GitLab token not set. Run 'lore token set' or export {env_var}.")]
TokenNotSet { env_var: String },
#[error("Database error: {0}")]
@@ -153,6 +158,14 @@ pub enum LoreError {
#[error("No embeddings found. Run: lore embed")]
EmbeddingsNotBuilt,
#[error("Surgical preflight failed for {entity_type} !{iid} in {project}: {reason}")]
SurgicalPreflightFailed {
entity_type: String,
iid: u64,
project: String,
reason: String,
},
}
impl LoreError {
@@ -179,6 +192,7 @@ impl LoreError {
Self::OllamaModelNotFound { .. } => ErrorCode::OllamaModelNotFound,
Self::EmbeddingFailed { .. } => ErrorCode::EmbeddingFailed,
Self::EmbeddingsNotBuilt => ErrorCode::EmbeddingFailed,
Self::SurgicalPreflightFailed { .. } => ErrorCode::SurgicalPreflightFailed,
}
}
@@ -207,7 +221,7 @@ impl LoreError {
"Check database file permissions or reset with 'lore reset'.\n\n Example:\n lore migrate\n lore reset --yes",
),
Self::TokenNotSet { .. } => Some(
"Export the token to your shell:\n\n export GITLAB_TOKEN=glpat-xxxxxxxxxxxx\n\n Your token needs the read_api scope.",
"Set your token:\n\n lore token set\n\n Or export to your shell:\n\n export GITLAB_TOKEN=glpat-xxxxxxxxxxxx\n\n Your token needs the read_api scope.",
),
Self::Database(_) => Some(
"Check database file permissions or reset with 'lore reset'.\n\n Example:\n lore doctor\n lore reset --yes",
@@ -227,6 +241,9 @@ impl LoreError {
Some("Check Ollama logs or retry with 'lore embed --retry-failed'")
}
Self::EmbeddingsNotBuilt => Some("Generate embeddings first: lore embed"),
Self::SurgicalPreflightFailed { .. } => Some(
"Verify the IID exists in the project and you have access.\n\n Example:\n lore issues -p <project>\n lore mrs -p <project>",
),
Self::Json(_) | Self::Io(_) | Self::Transform(_) | Self::Other(_) => None,
}
}
@@ -246,7 +263,7 @@ impl LoreError {
Self::GitLabAuthFailed => {
vec!["export GITLAB_TOKEN=glpat-xxx", "lore auth"]
}
Self::TokenNotSet { .. } => vec!["export GITLAB_TOKEN=glpat-xxx"],
Self::TokenNotSet { .. } => vec!["lore token set", "export GITLAB_TOKEN=glpat-xxx"],
Self::OllamaUnavailable { .. } => vec!["ollama serve"],
Self::OllamaModelNotFound { .. } => vec!["ollama pull nomic-embed-text"],
Self::DatabaseLocked { .. } => vec!["lore ingest --force"],
@@ -254,6 +271,9 @@ impl LoreError {
Self::EmbeddingFailed { .. } => vec!["lore embed --retry-failed"],
Self::MigrationFailed { .. } => vec!["lore migrate"],
Self::GitLabNetworkError { .. } => vec!["lore doctor"],
Self::SurgicalPreflightFailed { .. } => {
vec!["lore issues -p <project>", "lore mrs -p <project>"]
}
_ => vec![],
}
}
@@ -293,3 +313,40 @@ impl From<&LoreError> for RobotErrorOutput {
}
pub type Result<T> = std::result::Result<T, LoreError>;
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn surgical_preflight_failed_display() {
let err = LoreError::SurgicalPreflightFailed {
entity_type: "issue".to_string(),
iid: 42,
project: "group/repo".to_string(),
reason: "not found on GitLab".to_string(),
};
let msg = err.to_string();
assert!(msg.contains("issue"), "missing entity_type: {msg}");
assert!(msg.contains("42"), "missing iid: {msg}");
assert!(msg.contains("group/repo"), "missing project: {msg}");
assert!(msg.contains("not found on GitLab"), "missing reason: {msg}");
}
#[test]
fn surgical_preflight_failed_error_code() {
let code = ErrorCode::SurgicalPreflightFailed;
assert_eq!(code.exit_code(), 6);
}
#[test]
fn surgical_preflight_failed_code_mapping() {
let err = LoreError::SurgicalPreflightFailed {
entity_type: "merge_request".to_string(),
iid: 99,
project: "ns/proj".to_string(),
reason: "404".to_string(),
};
assert_eq!(err.code(), ErrorCode::SurgicalPreflightFailed);
}
}

View File

@@ -20,6 +20,75 @@ impl SyncRunRecorder {
Ok(Self { row_id })
}
/// Returns the database row ID of this sync run.
pub fn row_id(&self) -> i64 {
self.row_id
}
/// Sets surgical-mode metadata on the run (mode, phase, IID manifest).
pub fn set_surgical_metadata(
&self,
conn: &Connection,
mode: &str,
phase: &str,
surgical_iids_json: &str,
) -> Result<()> {
conn.execute(
"UPDATE sync_runs
SET mode = ?1, phase = ?2, surgical_iids_json = ?3
WHERE id = ?4",
rusqlite::params![mode, phase, surgical_iids_json, self.row_id],
)?;
Ok(())
}
/// Updates the current phase and refreshes the heartbeat timestamp.
pub fn update_phase(&self, conn: &Connection, phase: &str) -> Result<()> {
let now = now_ms();
conn.execute(
"UPDATE sync_runs SET phase = ?1, heartbeat_at = ?2 WHERE id = ?3",
rusqlite::params![phase, now, self.row_id],
)?;
Ok(())
}
/// Increments a counter column by 1 based on entity type and stage.
/// Unknown (entity_type, stage) combinations are silently ignored.
pub fn record_entity_result(
&self,
conn: &Connection,
entity_type: &str,
stage: &str,
) -> Result<()> {
let column = match (entity_type, stage) {
("issue", "fetched") => "issues_fetched",
("issue", "ingested") => "issues_ingested",
("mr", "fetched") => "mrs_fetched",
("mr", "ingested") => "mrs_ingested",
("issue" | "mr", "skipped_stale") => "skipped_stale",
("doc", "regenerated") => "docs_regenerated",
("doc", "embedded") => "docs_embedded",
(_, "warning") => "warnings_count",
_ => return Ok(()),
};
// Column name is from a hardcoded match, not user input — safe to interpolate.
let sql = format!("UPDATE sync_runs SET {column} = {column} + 1 WHERE id = ?1");
conn.execute(&sql, rusqlite::params![self.row_id])?;
Ok(())
}
/// Marks the run as cancelled with a reason. Consumes self (terminal state).
pub fn cancel(self, conn: &Connection, reason: &str) -> Result<()> {
let now = now_ms();
conn.execute(
"UPDATE sync_runs
SET status = 'cancelled', error = ?1, cancelled_at = ?2, finished_at = ?3
WHERE id = ?4",
rusqlite::params![reason, now, now, self.row_id],
)?;
Ok(())
}
pub fn succeed(
self,
conn: &Connection,

View File

@@ -146,3 +146,239 @@ fn test_sync_run_recorder_fail_with_partial_metrics() {
assert_eq!(parsed.len(), 1);
assert_eq!(parsed[0].name, "ingest_issues");
}
#[test]
fn sync_run_surgical_columns_exist() {
let conn = setup_test_db();
conn.execute(
"INSERT INTO sync_runs (started_at, heartbeat_at, status, command, mode, phase, surgical_iids_json)
VALUES (1000, 1000, 'running', 'sync', 'surgical', 'preflight', '{\"issues\":[7],\"mrs\":[]}')",
[],
)
.unwrap();
let (mode, phase, iids_json): (String, String, String) = conn
.query_row(
"SELECT mode, phase, surgical_iids_json FROM sync_runs WHERE mode = 'surgical'",
[],
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
)
.unwrap();
assert_eq!(mode, "surgical");
assert_eq!(phase, "preflight");
assert!(iids_json.contains("7"));
}
#[test]
fn sync_run_counter_defaults_are_zero() {
let conn = setup_test_db();
conn.execute(
"INSERT INTO sync_runs (started_at, heartbeat_at, status, command)
VALUES (2000, 2000, 'running', 'sync')",
[],
)
.unwrap();
let row_id = conn.last_insert_rowid();
let (issues_fetched, mrs_fetched, docs_regenerated, warnings_count): (i64, i64, i64, i64) =
conn.query_row(
"SELECT issues_fetched, mrs_fetched, docs_regenerated, warnings_count FROM sync_runs WHERE id = ?1",
[row_id],
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
)
.unwrap();
assert_eq!(issues_fetched, 0);
assert_eq!(mrs_fetched, 0);
assert_eq!(docs_regenerated, 0);
assert_eq!(warnings_count, 0);
}
#[test]
fn sync_run_nullable_columns_default_to_null() {
let conn = setup_test_db();
conn.execute(
"INSERT INTO sync_runs (started_at, heartbeat_at, status, command)
VALUES (3000, 3000, 'running', 'sync')",
[],
)
.unwrap();
let row_id = conn.last_insert_rowid();
let (mode, phase, cancelled_at): (Option<String>, Option<String>, Option<i64>) = conn
.query_row(
"SELECT mode, phase, cancelled_at FROM sync_runs WHERE id = ?1",
[row_id],
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
)
.unwrap();
assert!(mode.is_none());
assert!(phase.is_none());
assert!(cancelled_at.is_none());
}
#[test]
fn sync_run_counter_round_trip() {
let conn = setup_test_db();
conn.execute(
"INSERT INTO sync_runs (started_at, heartbeat_at, status, command, mode, issues_fetched, mrs_ingested, docs_embedded)
VALUES (4000, 4000, 'succeeded', 'sync', 'surgical', 3, 2, 5)",
[],
)
.unwrap();
let row_id = conn.last_insert_rowid();
let (issues_fetched, mrs_ingested, docs_embedded): (i64, i64, i64) = conn
.query_row(
"SELECT issues_fetched, mrs_ingested, docs_embedded FROM sync_runs WHERE id = ?1",
[row_id],
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
)
.unwrap();
assert_eq!(issues_fetched, 3);
assert_eq!(mrs_ingested, 2);
assert_eq!(docs_embedded, 5);
}
#[test]
fn surgical_lifecycle_start_metadata_succeed() {
let conn = setup_test_db();
let recorder = SyncRunRecorder::start(&conn, "sync", "surg001").unwrap();
let row_id = recorder.row_id();
recorder
.set_surgical_metadata(
&conn,
"surgical",
"preflight",
r#"{"issues":[7,8],"mrs":[101]}"#,
)
.unwrap();
recorder.update_phase(&conn, "ingest").unwrap();
recorder
.record_entity_result(&conn, "issue", "fetched")
.unwrap();
recorder
.record_entity_result(&conn, "issue", "fetched")
.unwrap();
recorder
.record_entity_result(&conn, "issue", "ingested")
.unwrap();
recorder
.record_entity_result(&conn, "mr", "fetched")
.unwrap();
recorder
.record_entity_result(&conn, "mr", "ingested")
.unwrap();
recorder.succeed(&conn, &[], 3, 0).unwrap();
#[allow(clippy::type_complexity)]
let (mode, phase, iids, issues_fetched, mrs_fetched, issues_ingested, mrs_ingested, status): (
String,
String,
String,
i64,
i64,
i64,
i64,
String,
) = conn
.query_row(
"SELECT mode, phase, surgical_iids_json, issues_fetched, mrs_fetched, \
issues_ingested, mrs_ingested, status \
FROM sync_runs WHERE id = ?1",
[row_id],
|r| {
Ok((
r.get(0)?,
r.get(1)?,
r.get(2)?,
r.get(3)?,
r.get(4)?,
r.get(5)?,
r.get(6)?,
r.get(7)?,
))
},
)
.unwrap();
assert_eq!(mode, "surgical");
assert_eq!(phase, "ingest");
assert!(iids.contains("101"));
assert_eq!(issues_fetched, 2);
assert_eq!(mrs_fetched, 1);
assert_eq!(issues_ingested, 1);
assert_eq!(mrs_ingested, 1);
assert_eq!(status, "succeeded");
}
#[test]
fn surgical_lifecycle_cancel() {
let conn = setup_test_db();
let recorder = SyncRunRecorder::start(&conn, "sync", "cancel01").unwrap();
let row_id = recorder.row_id();
recorder
.set_surgical_metadata(&conn, "surgical", "preflight", "{}")
.unwrap();
recorder
.cancel(&conn, "User requested cancellation")
.unwrap();
let (status, error, cancelled_at, finished_at): (
String,
Option<String>,
Option<i64>,
Option<i64>,
) = conn
.query_row(
"SELECT status, error, cancelled_at, finished_at FROM sync_runs WHERE id = ?1",
[row_id],
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
)
.unwrap();
assert_eq!(status, "cancelled");
assert_eq!(error.as_deref(), Some("User requested cancellation"));
assert!(cancelled_at.is_some());
assert!(finished_at.is_some());
}
#[test]
fn record_entity_result_ignores_unknown() {
let conn = setup_test_db();
let recorder = SyncRunRecorder::start(&conn, "sync", "unk001").unwrap();
recorder
.record_entity_result(&conn, "widget", "exploded")
.unwrap();
}
#[test]
fn record_entity_result_doc_counters() {
let conn = setup_test_db();
let recorder = SyncRunRecorder::start(&conn, "sync", "cnt001").unwrap();
let row_id = recorder.row_id();
recorder
.record_entity_result(&conn, "doc", "regenerated")
.unwrap();
recorder
.record_entity_result(&conn, "doc", "regenerated")
.unwrap();
recorder
.record_entity_result(&conn, "doc", "embedded")
.unwrap();
recorder
.record_entity_result(&conn, "issue", "skipped_stale")
.unwrap();
let (docs_regen, docs_embed, skipped): (i64, i64, i64) = conn
.query_row(
"SELECT docs_regenerated, docs_embedded, skipped_stale FROM sync_runs WHERE id = ?1",
[row_id],
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
)
.unwrap();
assert_eq!(docs_regen, 2);
assert_eq!(docs_embed, 1);
assert_eq!(skipped, 1);
}