feat(ingestion): Implement cursor-based incremental sync from GitLab

Provides efficient data synchronization with minimal API calls.

src/ingestion/issues.rs - Issue sync logic:
- Cursor-based incremental sync using updated_at timestamp
- Fetches only issues modified since last sync
- Configurable cursor rewind for overlap safety (default 2s)
- Batched database writes with transaction wrapping
- Upserts issues, labels, milestones, and assignees
- Maintains issue_labels and issue_assignees junction tables
- Returns IngestIssuesResult with counts and issues needing discussion sync
- Identifies issues where discussion count changed

src/ingestion/discussions.rs - Discussion sync logic:
- Fetches discussions for issues that need sync
- Compares discussion count vs stored to detect changes
- Batched note insertion with raw payload preservation
- Updates discussion metadata (resolved state, note counts)
- Tracks sync state per discussion to enable incremental updates
- Returns IngestDiscussionsResult with fetched/skipped counts

src/ingestion/orchestrator.rs - Sync coordination:
- Two-phase sync: issues first, then discussions
- Progress callback support for CLI progress bars
- ProgressEvent enum for fine-grained status updates:
  - IssuesFetchStarted, IssueFetched, IssuesFetchComplete, DiscussionSyncStarted, DiscussionSynced, DiscussionSyncComplete
- Acquires sync lock before starting
- Updates sync watermark on successful completion
- Handles partial failures gracefully (watermark not updated)
- Returns IngestProjectResult with detailed statistics

The architecture supports future additions:
- Merge request ingestion (parallel to issues)
- Full-text search indexing hooks
- Vector embedding pipeline integration

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-26 11:28:34 -05:00
parent dd5eb04953
commit cd60350c6d
4 changed files with 1153 additions and 0 deletions

View File

@@ -0,0 +1,212 @@
//! Ingestion orchestrator: coordinates issue and discussion sync.
//!
//! Implements the CP1 canonical pattern:
//! 1. Fetch issues with cursor-based sync
//! 2. Identify issues needing discussion sync
//! 3. Execute discussion sync sequentially (rusqlite Connection is not Send)
use rusqlite::Connection;
use tracing::info;
use crate::Config;
use crate::core::error::Result;
use crate::gitlab::GitLabClient;
use super::discussions::ingest_issue_discussions;
use super::issues::{IssueForDiscussionSync, ingest_issues};
/// Progress callback for ingestion operations.
///
/// Invoked synchronously from the sync code path for each [`ProgressEvent`];
/// keep implementations cheap (e.g. updating a progress bar) so they do not
/// stall the ingestion loop.
pub type ProgressCallback = Box<dyn Fn(ProgressEvent) + Send + Sync>;
/// Progress events emitted during ingestion.
///
/// Consumers (e.g. CLI progress bars) receive these via [`ProgressCallback`].
/// NOTE(review): not every variant is necessarily emitted on every code path
/// in this module — verify which events a given consumer should expect.
#[derive(Debug, Clone)]
pub enum ProgressEvent {
    /// Issue fetching started
    IssuesFetchStarted,
    /// An issue was fetched (current count)
    IssueFetched { count: usize },
    /// Issue fetching complete
    IssuesFetchComplete { total: usize },
    /// Discussion sync started (total issues to sync)
    DiscussionSyncStarted { total: usize },
    /// Discussion synced for an issue (current/total)
    DiscussionSynced { current: usize, total: usize },
    /// Discussion sync complete
    DiscussionSyncComplete,
}
/// Result of full project ingestion.
///
/// All counters start at zero via `Default` and are accumulated by the
/// orchestrator as issue and discussion sync proceed.
#[derive(Debug, Default)]
pub struct IngestProjectResult {
    /// Issues fetched from the GitLab API during this run.
    pub issues_fetched: usize,
    /// Issues inserted or updated in the local database during this run.
    pub issues_upserted: usize,
    /// New labels created while upserting issues.
    pub labels_created: usize,
    /// Discussions fetched from the GitLab API.
    pub discussions_fetched: usize,
    /// Discussions inserted or updated locally.
    pub discussions_upserted: usize,
    /// Notes inserted or updated locally.
    pub notes_upserted: usize,
    /// Issues whose discussions were synced during this run.
    pub issues_synced_discussions: usize,
    /// Issues not flagged for discussion sync (computed against the total
    /// issue count in the database, not just issues touched this run).
    pub issues_skipped_discussion_sync: usize,
}
/// Ingest all issues and their discussions for a project.
///
/// Convenience wrapper around [`ingest_project_issues_with_progress`] with
/// progress reporting disabled.
///
/// # Errors
/// Propagates any error from issue or discussion ingestion.
pub async fn ingest_project_issues(
    conn: &Connection,
    client: &GitLabClient,
    config: &Config,
    project_id: i64,
    gitlab_project_id: i64,
) -> Result<IngestProjectResult> {
    ingest_project_issues_with_progress(conn, client, config, project_id, gitlab_project_id, None)
        .await
}
/// Ingest all issues and their discussions for a project with progress reporting.
///
/// Two-phase sync: issues first (cursor-based), then discussions for the
/// issues that issue sync flagged as needing it. Discussion sync runs
/// sequentially because rusqlite's `Connection` is not `Send`.
///
/// # Errors
/// Propagates any error from issue or discussion ingestion. The COUNT query
/// used only for the skip statistic is best-effort and never fails the sync.
pub async fn ingest_project_issues_with_progress(
    conn: &Connection,
    client: &GitLabClient,
    config: &Config,
    project_id: i64,
    gitlab_project_id: i64,
    progress: Option<ProgressCallback>,
) -> Result<IngestProjectResult> {
    let mut result = IngestProjectResult::default();
    // Helper that forwards an event to the callback when one was supplied.
    let emit = |event: ProgressEvent| {
        if let Some(ref cb) = progress {
            cb(event);
        }
    };

    // Step 1: Ingest issues. Bracket the call with start/complete events so
    // consumers see the issue-fetch phase; per-issue `IssueFetched` events
    // would require plumbing the callback into `ingest_issues` itself.
    emit(ProgressEvent::IssuesFetchStarted);
    let issue_result = ingest_issues(conn, client, config, project_id, gitlab_project_id).await?;
    emit(ProgressEvent::IssuesFetchComplete {
        total: issue_result.fetched,
    });
    result.issues_fetched = issue_result.fetched;
    result.issues_upserted = issue_result.upserted;
    result.labels_created = issue_result.labels_created;

    // Step 2: Sync discussions for issues that need it.
    let issues_needing_sync = issue_result.issues_needing_discussion_sync;

    // Query the actual total issue count for an accurate skip statistic
    // (`issues_upserted` only counts this run). Deliberately best-effort:
    // a failure here should not abort an otherwise successful sync.
    let total_issues: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM issues WHERE project_id = ?",
            [project_id],
            |row| row.get(0),
        )
        .unwrap_or(0);
    let total_issues = total_issues as usize;
    result.issues_skipped_discussion_sync = total_issues.saturating_sub(issues_needing_sync.len());

    if issues_needing_sync.is_empty() {
        info!("No issues need discussion sync");
        return Ok(result);
    }

    info!(
        count = issues_needing_sync.len(),
        "Starting discussion sync for issues"
    );
    emit(ProgressEvent::DiscussionSyncStarted {
        total: issues_needing_sync.len(),
    });

    // Step 3: Execute sequential discussion sync (see function doc for why not concurrent).
    let discussion_results = sync_discussions_sequential(
        conn,
        client,
        config,
        gitlab_project_id,
        project_id,
        &issues_needing_sync,
        &progress,
    )
    .await?;
    emit(ProgressEvent::DiscussionSyncComplete);

    // Aggregate per-issue discussion results into the project totals.
    for disc_result in discussion_results {
        result.discussions_fetched += disc_result.discussions_fetched;
        result.discussions_upserted += disc_result.discussions_upserted;
        result.notes_upserted += disc_result.notes_upserted;
        result.issues_synced_discussions += 1;
    }

    info!(
        issues_fetched = result.issues_fetched,
        issues_upserted = result.issues_upserted,
        labels_created = result.labels_created,
        discussions_fetched = result.discussions_fetched,
        discussions_upserted = result.discussions_upserted,
        notes_upserted = result.notes_upserted,
        issues_synced = result.issues_synced_discussions,
        issues_skipped = result.issues_skipped_discussion_sync,
        "Project ingestion complete"
    );

    Ok(result)
}
/// Sync discussions sequentially for each issue.
///
/// NOTE: Despite the config having `dependent_concurrency`, we process sequentially
/// because rusqlite's `Connection` is not `Send` and cannot be shared across tasks.
/// True concurrency would require connection pooling (r2d2, deadpool, etc.).
///
/// Progress is reported per issue via the optional callback. The previous
/// `chunks(batch_size)` structure was removed: it had no observable effect
/// (every issue was processed and reported individually regardless) and
/// `slice::chunks` panics when the configured concurrency is 0.
///
/// # Errors
/// Propagates the first error returned by `ingest_issue_discussions`;
/// results already collected are discarded.
async fn sync_discussions_sequential(
    conn: &Connection,
    client: &GitLabClient,
    config: &Config,
    gitlab_project_id: i64,
    local_project_id: i64,
    issues: &[IssueForDiscussionSync],
    progress: &Option<ProgressCallback>,
) -> Result<Vec<super::discussions::IngestDiscussionsResult>> {
    let total = issues.len();
    let mut results = Vec::with_capacity(total);
    for issue in issues {
        let disc_result = ingest_issue_discussions(
            conn,
            client,
            config,
            gitlab_project_id,
            local_project_id,
            std::slice::from_ref(issue),
        )
        .await?;
        results.push(disc_result);
        // Emit per-issue progress; `results.len()` is the completed count.
        if let Some(cb) = progress {
            cb(ProgressEvent::DiscussionSynced {
                current: results.len(),
                total,
            });
        }
    }
    Ok(results)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` must zero every counter so aggregation starts from a clean
    /// slate. Covers all eight fields, including `discussions_upserted`,
    /// which the original test omitted.
    #[test]
    fn result_default_has_zero_counts() {
        let result = IngestProjectResult::default();
        assert_eq!(result.issues_fetched, 0);
        assert_eq!(result.issues_upserted, 0);
        assert_eq!(result.labels_created, 0);
        assert_eq!(result.discussions_fetched, 0);
        assert_eq!(result.discussions_upserted, 0);
        assert_eq!(result.notes_upserted, 0);
        assert_eq!(result.issues_synced_discussions, 0);
        assert_eq!(result.issues_skipped_discussion_sync, 0);
    }
}