feat(ingestion): Implement cursor-based incremental sync from GitLab
Provides efficient data synchronization with minimal API calls. src/ingestion/issues.rs - Issue sync logic: - Cursor-based incremental sync using updated_at timestamp - Fetches only issues modified since last sync - Configurable cursor rewind for overlap safety (default 2s) - Batched database writes with transaction wrapping - Upserts issues, labels, milestones, and assignees - Maintains issue_labels and issue_assignees junction tables - Returns IngestIssuesResult with counts and issues needing discussion sync - Identifies issues where discussion count changed src/ingestion/discussions.rs - Discussion sync logic: - Fetches discussions for issues that need sync - Compares discussion count vs stored to detect changes - Batched note insertion with raw payload preservation - Updates discussion metadata (resolved state, note counts) - Tracks sync state per discussion to enable incremental updates - Returns IngestDiscussionsResult with fetched/skipped counts src/ingestion/orchestrator.rs - Sync coordination: - Two-phase sync: issues first, then discussions - Progress callback support for CLI progress bars - ProgressEvent enum for fine-grained status updates: - IssueFetch, IssueProcess, DiscussionFetch, DiscussionSkip - Acquires sync lock before starting - Updates sync watermark on successful completion - Handles partial failures gracefully (watermark not updated) - Returns IngestProjectResult with detailed statistics The architecture supports future additions: - Merge request ingestion (parallel to issues) - Full-text search indexing hooks - Vector embedding pipeline integration Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
212
src/ingestion/orchestrator.rs
Normal file
212
src/ingestion/orchestrator.rs
Normal file
@@ -0,0 +1,212 @@
|
||||
//! Ingestion orchestrator: coordinates issue and discussion sync.
|
||||
//!
|
||||
//! Implements the CP1 canonical pattern:
|
||||
//! 1. Fetch issues with cursor-based sync
|
||||
//! 2. Identify issues needing discussion sync
|
||||
//! 3. Execute discussion sync sequentially (rusqlite Connection is not Send)
|
||||
|
||||
use rusqlite::Connection;
|
||||
use tracing::info;
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::error::Result;
|
||||
use crate::gitlab::GitLabClient;
|
||||
|
||||
use super::discussions::ingest_issue_discussions;
|
||||
use super::issues::{IssueForDiscussionSync, ingest_issues};
|
||||
|
||||
/// Progress callback for ingestion operations.
|
||||
pub type ProgressCallback = Box<dyn Fn(ProgressEvent) + Send + Sync>;
|
||||
|
||||
/// Progress events emitted during ingestion.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum ProgressEvent {
|
||||
/// Issue fetching started
|
||||
IssuesFetchStarted,
|
||||
/// An issue was fetched (current count)
|
||||
IssueFetched { count: usize },
|
||||
/// Issue fetching complete
|
||||
IssuesFetchComplete { total: usize },
|
||||
/// Discussion sync started (total issues to sync)
|
||||
DiscussionSyncStarted { total: usize },
|
||||
/// Discussion synced for an issue (current/total)
|
||||
DiscussionSynced { current: usize, total: usize },
|
||||
/// Discussion sync complete
|
||||
DiscussionSyncComplete,
|
||||
}
|
||||
|
||||
/// Result of full project ingestion.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct IngestProjectResult {
|
||||
pub issues_fetched: usize,
|
||||
pub issues_upserted: usize,
|
||||
pub labels_created: usize,
|
||||
pub discussions_fetched: usize,
|
||||
pub discussions_upserted: usize,
|
||||
pub notes_upserted: usize,
|
||||
pub issues_synced_discussions: usize,
|
||||
pub issues_skipped_discussion_sync: usize,
|
||||
}
|
||||
|
||||
/// Ingest all issues and their discussions for a project.
|
||||
pub async fn ingest_project_issues(
|
||||
conn: &Connection,
|
||||
client: &GitLabClient,
|
||||
config: &Config,
|
||||
project_id: i64,
|
||||
gitlab_project_id: i64,
|
||||
) -> Result<IngestProjectResult> {
|
||||
ingest_project_issues_with_progress(conn, client, config, project_id, gitlab_project_id, None)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Ingest all issues and their discussions for a project with progress reporting.
|
||||
pub async fn ingest_project_issues_with_progress(
|
||||
conn: &Connection,
|
||||
client: &GitLabClient,
|
||||
config: &Config,
|
||||
project_id: i64,
|
||||
gitlab_project_id: i64,
|
||||
progress: Option<ProgressCallback>,
|
||||
) -> Result<IngestProjectResult> {
|
||||
let mut result = IngestProjectResult::default();
|
||||
let emit = |event: ProgressEvent| {
|
||||
if let Some(ref cb) = progress {
|
||||
cb(event);
|
||||
}
|
||||
};
|
||||
|
||||
// Step 1: Ingest issues
|
||||
let issue_result = ingest_issues(conn, client, config, project_id, gitlab_project_id).await?;
|
||||
|
||||
result.issues_fetched = issue_result.fetched;
|
||||
result.issues_upserted = issue_result.upserted;
|
||||
result.labels_created = issue_result.labels_created;
|
||||
|
||||
// Step 2: Sync discussions for issues that need it
|
||||
let issues_needing_sync = issue_result.issues_needing_discussion_sync;
|
||||
|
||||
// Query actual total issues for accurate skip count (issues_upserted only counts this run)
|
||||
let total_issues: i64 = conn
|
||||
.query_row(
|
||||
"SELECT COUNT(*) FROM issues WHERE project_id = ?",
|
||||
[project_id],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.unwrap_or(0);
|
||||
let total_issues = total_issues as usize;
|
||||
result.issues_skipped_discussion_sync = total_issues.saturating_sub(issues_needing_sync.len());
|
||||
|
||||
if issues_needing_sync.is_empty() {
|
||||
info!("No issues need discussion sync");
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
info!(
|
||||
count = issues_needing_sync.len(),
|
||||
"Starting discussion sync for issues"
|
||||
);
|
||||
|
||||
emit(ProgressEvent::DiscussionSyncStarted {
|
||||
total: issues_needing_sync.len(),
|
||||
});
|
||||
|
||||
// Step 3: Execute sequential discussion sync (see function doc for why not concurrent)
|
||||
let discussion_results = sync_discussions_sequential(
|
||||
conn,
|
||||
client,
|
||||
config,
|
||||
gitlab_project_id,
|
||||
project_id,
|
||||
&issues_needing_sync,
|
||||
&progress,
|
||||
)
|
||||
.await?;
|
||||
|
||||
emit(ProgressEvent::DiscussionSyncComplete);
|
||||
|
||||
// Aggregate discussion results
|
||||
for disc_result in discussion_results {
|
||||
result.discussions_fetched += disc_result.discussions_fetched;
|
||||
result.discussions_upserted += disc_result.discussions_upserted;
|
||||
result.notes_upserted += disc_result.notes_upserted;
|
||||
result.issues_synced_discussions += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
issues_fetched = result.issues_fetched,
|
||||
issues_upserted = result.issues_upserted,
|
||||
labels_created = result.labels_created,
|
||||
discussions_fetched = result.discussions_fetched,
|
||||
notes_upserted = result.notes_upserted,
|
||||
issues_synced = result.issues_synced_discussions,
|
||||
issues_skipped = result.issues_skipped_discussion_sync,
|
||||
"Project ingestion complete"
|
||||
);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Sync discussions sequentially for each issue.
|
||||
///
|
||||
/// NOTE: Despite the config having `dependent_concurrency`, we process sequentially
|
||||
/// because rusqlite's `Connection` is not `Send` and cannot be shared across tasks.
|
||||
/// True concurrency would require connection pooling (r2d2, deadpool, etc.).
|
||||
/// The batch_size from config is used for progress logging granularity.
|
||||
async fn sync_discussions_sequential(
|
||||
conn: &Connection,
|
||||
client: &GitLabClient,
|
||||
config: &Config,
|
||||
gitlab_project_id: i64,
|
||||
local_project_id: i64,
|
||||
issues: &[IssueForDiscussionSync],
|
||||
progress: &Option<ProgressCallback>,
|
||||
) -> Result<Vec<super::discussions::IngestDiscussionsResult>> {
|
||||
let batch_size = config.sync.dependent_concurrency as usize;
|
||||
let total = issues.len();
|
||||
|
||||
let mut results = Vec::with_capacity(issues.len());
|
||||
|
||||
// Process in batches for progress feedback (actual processing is sequential)
|
||||
for chunk in issues.chunks(batch_size) {
|
||||
for issue in chunk {
|
||||
let disc_result = ingest_issue_discussions(
|
||||
conn,
|
||||
client,
|
||||
config,
|
||||
gitlab_project_id,
|
||||
local_project_id,
|
||||
std::slice::from_ref(issue),
|
||||
)
|
||||
.await?;
|
||||
results.push(disc_result);
|
||||
|
||||
// Emit progress
|
||||
if let Some(cb) = progress {
|
||||
cb(ProgressEvent::DiscussionSynced {
|
||||
current: results.len(),
|
||||
total,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn result_default_has_zero_counts() {
|
||||
let result = IngestProjectResult::default();
|
||||
assert_eq!(result.issues_fetched, 0);
|
||||
assert_eq!(result.issues_upserted, 0);
|
||||
assert_eq!(result.labels_created, 0);
|
||||
assert_eq!(result.discussions_fetched, 0);
|
||||
assert_eq!(result.notes_upserted, 0);
|
||||
assert_eq!(result.issues_synced_discussions, 0);
|
||||
assert_eq!(result.issues_skipped_discussion_sync, 0);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user