feat(ingestion): Implement MR sync with parallel discussion prefetch

Adds complete merge request ingestion pipeline with a novel two-phase
discussion sync strategy optimized for throughput.

New modules:
- merge_requests.rs: MR upsert with labels/assignees/reviewers handling,
  stale MR cleanup, and watermark-based incremental sync
- mr_discussions.rs: Parallel prefetch strategy for MR discussions

Two-phase MR discussion sync:
1. PREFETCH PHASE: Spawn concurrent tasks to fetch discussions for
   multiple MRs simultaneously (configurable concurrency, default 8).
   Transform and validate in parallel, storing results in memory.
2. WRITE PHASE: Serial database writes to avoid lock contention.
   Each MR's discussions written in a single transaction, with
   proper stale discussion cleanup.

This approach achieves ~4-8x throughput vs serial fetching while
maintaining database consistency. Transform errors are tracked per-MR
to prevent partial writes from corrupting watermarks.

Orchestrator updates:
- ingest_merge_requests(): Coordinates MR fetch -> discussion sync flow
- Progress callbacks emit MR-specific events for UI feedback
- Respects --full flag to reset discussion watermarks for full resync

The prefetch strategy is critical for MRs, which typically have more
discussions than issues and whose sync time is dominated by API latency.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-26 22:45:48 -05:00
parent d33f24c91b
commit cd44e516e3
6 changed files with 1458 additions and 26 deletions

View File

@@ -1,10 +1,11 @@
//! Ingestion orchestrator: coordinates issue and discussion sync.
//! Ingestion orchestrator: coordinates issue/MR and discussion sync.
//!
//! Implements the CP1 canonical pattern:
//! 1. Fetch issues with cursor-based sync
//! 2. Identify issues needing discussion sync
//! 3. Execute discussion sync sequentially (rusqlite Connection is not Send)
//! Implements the canonical pattern:
//! 1. Fetch resources (issues or MRs) with cursor-based sync
//! 2. Identify resources needing discussion sync
//! 3. Execute discussion sync with parallel prefetch (fetch in parallel, write serially)
use futures::future::join_all;
use rusqlite::Connection;
use tracing::info;
@@ -14,6 +15,10 @@ use crate::gitlab::GitLabClient;
use super::discussions::ingest_issue_discussions;
use super::issues::{IssueForDiscussionSync, ingest_issues};
use super::merge_requests::{
MrForDiscussionSync, get_mrs_needing_discussion_sync, ingest_merge_requests,
};
use super::mr_discussions::{prefetch_mr_discussions, write_prefetched_mr_discussions};
/// Progress callback for ingestion operations.
///
/// Boxed trait object so callers can pass any closure; `Send + Sync` bounds
/// allow the callback to be shared with the async sync functions below.
pub type ProgressCallback = Box<dyn Fn(ProgressEvent) + Send + Sync>;
@@ -33,9 +38,21 @@ pub enum ProgressEvent {
DiscussionSynced { current: usize, total: usize },
/// Discussion sync complete
DiscussionSyncComplete,
/// MR fetching started
MrsFetchStarted,
/// An MR was fetched (current count)
MrFetched { count: usize },
/// MR fetching complete
MrsFetchComplete { total: usize },
/// MR discussion sync started (total MRs to sync)
MrDiscussionSyncStarted { total: usize },
/// MR discussion synced (current/total)
MrDiscussionSynced { current: usize, total: usize },
/// MR discussion sync complete
MrDiscussionSyncComplete,
}
/// Result of full project ingestion.
/// Result of full project ingestion (issues).
#[derive(Debug, Default)]
pub struct IngestProjectResult {
pub issues_fetched: usize,
@@ -48,6 +65,23 @@ pub struct IngestProjectResult {
pub issues_skipped_discussion_sync: usize,
}
/// Result of MR ingestion for a project.
///
/// Aggregated counters from the MR fetch phase and the per-MR discussion
/// sync phase. All counts start at zero via `Default` and are summed up by
/// `ingest_project_merge_requests_with_progress`.
#[derive(Debug, Default)]
pub struct IngestMrProjectResult {
    /// MRs returned by the fetch phase.
    pub mrs_fetched: usize,
    /// MRs inserted or updated in the local database.
    pub mrs_upserted: usize,
    /// Labels newly created while upserting MRs.
    pub labels_created: usize,
    /// Assignee links written for MRs.
    pub assignees_linked: usize,
    /// Reviewer links written for MRs.
    pub reviewers_linked: usize,
    /// Discussions fetched across all synced MRs.
    pub discussions_fetched: usize,
    /// Discussions inserted or updated.
    pub discussions_upserted: usize,
    /// Notes inserted or updated.
    pub notes_upserted: usize,
    /// Notes dropped because their timestamp failed validation.
    pub notes_skipped_bad_timestamp: usize,
    /// Diff notes encountered (subset of notes, per the discussion sync).
    pub diffnotes_count: usize,
    /// MRs whose discussion pagination completed successfully.
    pub mrs_synced_discussions: usize,
    /// MRs that did not need a discussion sync this run.
    pub mrs_skipped_discussion_sync: usize,
}
/// Ingest all issues and their discussions for a project.
pub async fn ingest_project_issues(
conn: &Connection,
@@ -194,6 +228,183 @@ async fn sync_discussions_sequential(
Ok(results)
}
/// Ingest all merge requests and their discussions for a project.
///
/// Convenience wrapper around [`ingest_project_merge_requests_with_progress`]
/// that passes no progress callback; see that function for the full flow.
///
/// * `project_id` - local database project id.
/// * `gitlab_project_id` - project id on the GitLab side.
/// * `full_sync` - when true, forces a full resync instead of incremental.
pub async fn ingest_project_merge_requests(
    conn: &Connection,
    client: &GitLabClient,
    config: &Config,
    project_id: i64,
    gitlab_project_id: i64,
    full_sync: bool,
) -> Result<IngestMrProjectResult> {
    // Delegate with `progress = None`; all logic lives in the _with_progress variant.
    ingest_project_merge_requests_with_progress(
        conn,
        client,
        config,
        project_id,
        gitlab_project_id,
        full_sync,
        None,
    )
    .await
}
/// Ingest all merge requests and their discussions for a project with progress reporting.
///
/// Flow: (1) fetch/upsert MRs, (2) query the DB for MRs whose discussions
/// still need syncing, (3) run the discussion sync (parallel prefetch,
/// serial writes) and aggregate the per-MR results into one summary.
///
/// * `progress` - optional callback receiving [`ProgressEvent`]s for UI feedback.
pub async fn ingest_project_merge_requests_with_progress(
    conn: &Connection,
    client: &GitLabClient,
    config: &Config,
    project_id: i64,
    gitlab_project_id: i64,
    full_sync: bool,
    progress: Option<ProgressCallback>,
) -> Result<IngestMrProjectResult> {
    // Forward an event to the caller's callback, if one was supplied.
    let report = |event: ProgressEvent| {
        if let Some(cb) = progress.as_ref() {
            cb(event);
        }
    };

    let mut result = IngestMrProjectResult::default();

    // Phase 1: fetch and upsert the MRs themselves.
    report(ProgressEvent::MrsFetchStarted);
    let mr_stats = ingest_merge_requests(
        conn,
        client,
        config,
        project_id,
        gitlab_project_id,
        full_sync,
    )
    .await?;
    result.mrs_fetched = mr_stats.fetched;
    result.mrs_upserted = mr_stats.upserted;
    result.labels_created = mr_stats.labels_created;
    result.assignees_linked = mr_stats.assignees_linked;
    result.reviewers_linked = mr_stats.reviewers_linked;
    report(ProgressEvent::MrsFetchComplete {
        total: result.mrs_fetched,
    });

    // Phase 2: find MRs whose discussions still need syncing.
    // CRITICAL: Query AFTER ingestion to avoid memory growth during large ingests
    let pending = get_mrs_needing_discussion_sync(conn, project_id)?;

    // Total MR count feeds the skip counter; best effort — a failed COUNT
    // simply yields a skip count of zero rather than aborting the sync.
    let total_mrs = conn
        .query_row(
            "SELECT COUNT(*) FROM merge_requests WHERE project_id = ?",
            [project_id],
            |row| row.get::<_, i64>(0),
        )
        .unwrap_or(0) as usize;
    result.mrs_skipped_discussion_sync = total_mrs.saturating_sub(pending.len());

    if pending.is_empty() {
        info!("No MRs need discussion sync");
        return Ok(result);
    }

    info!(count = pending.len(), "Starting discussion sync for MRs");
    report(ProgressEvent::MrDiscussionSyncStarted {
        total: pending.len(),
    });

    // Phase 3: discussion sync — parallel API prefetch, serial DB writes.
    let per_mr_results = sync_mr_discussions_sequential(
        conn,
        client,
        config,
        gitlab_project_id,
        project_id,
        &pending,
        &progress,
    )
    .await?;
    report(ProgressEvent::MrDiscussionSyncComplete);

    // Roll each per-MR outcome into the aggregate counters.
    for d in per_mr_results {
        result.discussions_fetched += d.discussions_fetched;
        result.discussions_upserted += d.discussions_upserted;
        result.notes_upserted += d.notes_upserted;
        result.notes_skipped_bad_timestamp += d.notes_skipped_bad_timestamp;
        result.diffnotes_count += d.diffnotes_count;
        if d.pagination_succeeded {
            result.mrs_synced_discussions += 1;
        }
    }

    info!(
        mrs_fetched = result.mrs_fetched,
        mrs_upserted = result.mrs_upserted,
        labels_created = result.labels_created,
        discussions_fetched = result.discussions_fetched,
        notes_upserted = result.notes_upserted,
        diffnotes = result.diffnotes_count,
        mrs_synced = result.mrs_synced_discussions,
        mrs_skipped = result.mrs_skipped_discussion_sync,
        "MR project ingestion complete"
    );

    Ok(result)
}
/// Sync discussions for MRs with parallel API prefetching.
///
/// Pattern: fetch discussions for a batch of MRs in parallel, then write
/// each batch's results serially. This overlaps network I/O while
/// respecting rusqlite's single-connection constraint. (The `_sequential`
/// suffix refers to the serial write phase; fetches are concurrent.)
///
/// Batch size comes from `config.sync.dependent_concurrency`, clamped to a
/// minimum of 1 — `slice::chunks` panics on a chunk size of zero, so a
/// misconfigured concurrency of 0 would otherwise crash the sync.
///
/// Emits `ProgressEvent::MrDiscussionSynced` once per MR written.
///
/// # Errors
///
/// Propagates the first write error; any remaining prefetched-but-unwritten
/// results in the current batch are discarded.
async fn sync_mr_discussions_sequential(
    conn: &Connection,
    client: &GitLabClient,
    config: &Config,
    gitlab_project_id: i64,
    local_project_id: i64,
    mrs: &[MrForDiscussionSync],
    progress: &Option<ProgressCallback>,
) -> Result<Vec<super::mr_discussions::IngestMrDiscussionsResult>> {
    // Clamp to >= 1: chunks(0) panics, and 0 concurrency is never useful.
    let batch_size = (config.sync.dependent_concurrency as usize).max(1);
    let total = mrs.len();
    let mut results = Vec::with_capacity(total);
    let mut processed = 0;

    // Process in batches: parallel API fetch, serial DB write
    for chunk in mrs.chunks(batch_size) {
        // Step 1: Prefetch discussions for all MRs in this batch in parallel.
        // `join_all` preserves input order, so writes happen in MR order.
        let prefetch_futures = chunk.iter().map(|mr| {
            prefetch_mr_discussions(client, gitlab_project_id, local_project_id, mr.clone())
        });
        let prefetched_batch = join_all(prefetch_futures).await;

        // Step 2: Write each prefetched result serially
        for prefetched in prefetched_batch {
            let disc_result =
                write_prefetched_mr_discussions(conn, config, local_project_id, prefetched)?;
            results.push(disc_result);
            processed += 1;

            // Emit progress
            if let Some(cb) = progress {
                cb(ProgressEvent::MrDiscussionSynced {
                    current: processed,
                    total,
                });
            }
        }
    }

    Ok(results)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -209,4 +420,21 @@ mod tests {
assert_eq!(result.issues_synced_discussions, 0);
assert_eq!(result.issues_skipped_discussion_sync, 0);
}
/// `Default` must zero every counter so aggregation can start from it.
#[test]
fn mr_result_default_has_zero_counts() {
    // Exhaustive destructuring: adding a field to the struct breaks this
    // test at compile time, forcing it to be covered here.
    let IngestMrProjectResult {
        mrs_fetched,
        mrs_upserted,
        labels_created,
        assignees_linked,
        reviewers_linked,
        discussions_fetched,
        discussions_upserted,
        notes_upserted,
        notes_skipped_bad_timestamp,
        diffnotes_count,
        mrs_synced_discussions,
        mrs_skipped_discussion_sync,
    } = IngestMrProjectResult::default();

    assert_eq!(
        (
            mrs_fetched,
            mrs_upserted,
            labels_created,
            assignees_linked,
            reviewers_linked,
            discussions_fetched,
            discussions_upserted,
            notes_upserted,
            notes_skipped_bad_timestamp,
            diffnotes_count,
            mrs_synced_discussions,
            mrs_skipped_discussion_sync,
        ),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    );
}
}