Files
gitlore/src/cli/commands/ingest.rs
Taylor Eernisse 266ed78e73 feat(sync): Wire progress callbacks through sync pipeline stages
The sync command's stage spinners now show real-time aggregate progress
for each pipeline phase instead of static "syncing..." messages.

- Add `progress_callback` parameter to `run_embed` and
  `run_generate_docs` so callers can receive `(processed, total)` updates
- Add `stage_bar` parameter to `run_ingest` for aggregate progress
  across concurrently-ingested projects using shared AtomicUsize counters
- Update `stage_spinner` to use `{prefix}` for the `[N/M]` label,
  allowing `{msg}` to be updated independently with progress details
- Thread `ProgressBar` clones into each concurrent project task so
  per-entity progress (fetch, discussions, events) is reflected on the
  aggregate spinner
- Pass `None` for progress callbacks at standalone CLI entry points
  (handle_ingest, handle_generate_docs, handle_embed) to preserve
  existing behavior when commands are run outside of sync

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 14:16:21 -05:00

803 lines
29 KiB
Rust

//! Ingest command - fetch data from GitLab.
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use console::style;
use indicatif::{ProgressBar, ProgressStyle};
use rusqlite::Connection;
use serde::Serialize;
use tracing::Instrument;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::{LoreError, Result};
use crate::core::lock::{AppLock, LockOptions};
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::gitlab::GitLabClient;
use crate::ingestion::{
IngestMrProjectResult, IngestProjectResult, ProgressEvent, ingest_project_issues_with_progress,
ingest_project_merge_requests_with_progress,
};
/// Result of ingest command for display.
///
/// Counts are aggregated across every project processed in one run. Only the
/// field group matching `resource_type` ("issues" or "mrs") is populated;
/// the other group stays at its zero default.
#[derive(Default)]
pub struct IngestResult {
    /// Which resource type this run ingested: "issues" or "mrs".
    pub resource_type: String,
    /// Number of projects that completed ingestion successfully.
    pub projects_synced: usize,
    // Issue-specific fields
    pub issues_fetched: usize,
    pub issues_upserted: usize,
    pub issues_synced_discussions: usize,
    pub issues_skipped_discussion_sync: usize,
    // MR-specific fields
    pub mrs_fetched: usize,
    pub mrs_upserted: usize,
    pub mrs_synced_discussions: usize,
    pub mrs_skipped_discussion_sync: usize,
    pub assignees_linked: usize,
    pub reviewers_linked: usize,
    pub diffnotes_count: usize,
    // Shared fields
    pub labels_created: usize,
    pub discussions_fetched: usize,
    pub notes_upserted: usize,
    // Resource events
    pub resource_events_fetched: usize,
    pub resource_events_failed: usize,
}
/// Outcome of ingesting a single project, used to aggregate results
/// from concurrent project processing.
enum ProjectIngestOutcome {
    /// Issue ingestion finished for the project at `path`.
    Issues {
        path: String,
        result: IngestProjectResult,
    },
    /// Merge-request ingestion finished for the project at `path`.
    Mrs {
        path: String,
        result: IngestMrProjectResult,
    },
}
/// Controls what interactive UI elements `run_ingest` displays.
///
/// Separates progress indicators (spinners, bars) from text output (headers,
/// per-project summaries) so callers like `sync` can show progress without
/// duplicating summary text.
#[derive(Debug, Clone, Copy)]
pub struct IngestDisplay {
    /// Show animated spinners and progress bars.
    pub show_progress: bool,
    /// Show the per-project spinner. When called from `sync`, the stage
    /// spinner already covers this, so a second spinner causes flashing.
    pub show_spinner: bool,
    /// Show text headers ("Ingesting...") and per-project summary lines.
    pub show_text: bool,
}
impl IngestDisplay {
    /// Shared constructor: assemble a display config from individual flags.
    const fn with_flags(show_progress: bool, show_spinner: bool, show_text: bool) -> Self {
        Self {
            show_progress,
            show_spinner,
            show_text,
        }
    }
    /// Interactive mode: everything visible.
    pub fn interactive() -> Self {
        Self::with_flags(true, true, true)
    }
    /// Robot/JSON mode: everything hidden.
    pub fn silent() -> Self {
        Self::with_flags(false, false, false)
    }
    /// Progress bars only, no spinner or text (used by sync which provides its
    /// own stage spinner).
    pub fn progress_only() -> Self {
        Self::with_flags(true, false, false)
    }
}
/// Run the ingest command.
///
/// `stage_bar` is an optional `ProgressBar` (typically from sync's stage spinner)
/// that will be updated with aggregate progress across all projects.
pub async fn run_ingest(
    config: &Config,
    resource_type: &str,
    project_filter: Option<&str>,
    force: bool,
    full: bool,
    display: IngestDisplay,
    stage_bar: Option<ProgressBar>,
) -> Result<IngestResult> {
    // Short correlation id (first 8 hex chars of a fresh UUID) attached to
    // every log line emitted under this invocation's span.
    let uuid_string = uuid::Uuid::new_v4().simple().to_string();
    let run_id = &uuid_string[..8];
    let span = tracing::info_span!("ingest", %run_id, %resource_type);
    let inner = run_ingest_inner(
        config,
        resource_type,
        project_filter,
        force,
        full,
        display,
        stage_bar,
    );
    inner.instrument(span).await
}
/// Inner implementation of run_ingest, instrumented with a root span.
///
/// Pipeline: validate the resource type, take the single-flight "sync" lock,
/// build a rate-limited GitLab client, optionally reset cursors/watermarks
/// (`--full`), then ingest every configured project concurrently while
/// mirroring per-entity progress onto both per-project bars and the shared
/// `stage_bar`. Errors from individual projects are collected so successful
/// project summaries still print; the first error (if any) is returned last.
async fn run_ingest_inner(
    config: &Config,
    resource_type: &str,
    project_filter: Option<&str>,
    force: bool,
    full: bool,
    display: IngestDisplay,
    stage_bar: Option<ProgressBar>,
) -> Result<IngestResult> {
    // Validate resource type early
    if resource_type != "issues" && resource_type != "mrs" {
        return Err(LoreError::Other(format!(
            "Invalid resource type '{}'. Valid types: issues, mrs",
            resource_type
        )));
    }
    // Get database path and create connection
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;
    // Acquire single-flight lock on a dedicated connection, kept separate
    // from the work connections opened per project below.
    let lock_conn = create_connection(&db_path)?;
    let mut lock = AppLock::new(
        lock_conn,
        LockOptions {
            name: "sync".to_string(),
            stale_lock_minutes: config.sync.stale_lock_minutes,
            heartbeat_interval_seconds: config.sync.heartbeat_interval_seconds,
        },
    );
    lock.acquire(force)?;
    // Get token from environment
    let token =
        std::env::var(&config.gitlab.token_env_var).map_err(|_| LoreError::TokenNotSet {
            env_var: config.gitlab.token_env_var.clone(),
        })?;
    // Create GitLab client
    let client = GitLabClient::new(
        &config.gitlab.base_url,
        &token,
        Some(config.sync.requests_per_second),
    );
    // Get projects to sync
    let projects = get_projects_to_sync(&conn, &config.projects, project_filter)?;
    // If --full flag is set, reset sync cursors and discussion watermarks for a complete re-fetch
    if full {
        if display.show_text {
            println!(
                "{}",
                style("Full sync: resetting cursors to fetch all data...").yellow()
            );
        }
        for (local_project_id, _, path) in &projects {
            if resource_type == "issues" {
                // Reset issue discussion and resource event watermarks so everything gets re-synced
                conn.execute(
                    "UPDATE issues SET discussions_synced_for_updated_at = NULL, resource_events_synced_for_updated_at = NULL WHERE project_id = ?",
                    [*local_project_id],
                )?;
            } else if resource_type == "mrs" {
                // Reset MR discussion and resource event watermarks
                conn.execute(
                    "UPDATE merge_requests SET discussions_synced_for_updated_at = NULL, resource_events_synced_for_updated_at = NULL WHERE project_id = ?",
                    [*local_project_id],
                )?;
            }
            // Then reset sync cursor
            conn.execute(
                "DELETE FROM sync_cursors WHERE project_id = ? AND resource_type = ?",
                (*local_project_id, resource_type),
            )?;
            tracing::info!(project = %path, resource_type, "Reset sync cursor and discussion watermarks for full re-fetch");
        }
    }
    // Distinguish "filter matched nothing" from "nothing configured at all".
    if projects.is_empty() {
        if let Some(filter) = project_filter {
            return Err(LoreError::Other(format!(
                "Project '{}' not found in configuration",
                filter
            )));
        }
        return Err(LoreError::Other(
            "No projects configured. Run 'lore init' first.".to_string(),
        ));
    }
    let mut total = IngestResult {
        resource_type: resource_type.to_string(),
        ..Default::default()
    };
    // Human-friendly label used in progress/summary text.
    let type_label = if resource_type == "issues" {
        "issues"
    } else {
        "merge requests"
    };
    if display.show_text {
        println!("{}", style(format!("Ingesting {type_label}...")).blue());
        println!();
    }
    // Process projects concurrently. Each project gets its own DB connection
    // while sharing the rate limiter through the cloned GitLabClient.
    let concurrency = config.sync.primary_concurrency as usize;
    let resource_type_owned = resource_type.to_string();
    // Aggregate counters for stage_bar updates (shared across concurrent projects)
    let agg_fetched = Arc::new(AtomicUsize::new(0));
    let agg_discussions = Arc::new(AtomicUsize::new(0));
    let agg_disc_total = Arc::new(AtomicUsize::new(0));
    let agg_events = Arc::new(AtomicUsize::new(0));
    let agg_events_total = Arc::new(AtomicUsize::new(0));
    // Falling back to a hidden bar lets the callbacks below update
    // `stage_bar` unconditionally, whether or not the caller supplied one.
    let stage_bar = stage_bar.unwrap_or_else(ProgressBar::hidden);
    use futures::stream::{self, StreamExt};
    let project_results: Vec<Result<ProjectIngestOutcome>> = stream::iter(projects.iter())
        .map(|(local_project_id, gitlab_project_id, path)| {
            // Clone everything the spawned-per-project future must own.
            let client = client.clone();
            let db_path = db_path.clone();
            let config = config.clone();
            let resource_type = resource_type_owned.clone();
            let path = path.clone();
            let local_project_id = *local_project_id;
            let gitlab_project_id = *gitlab_project_id;
            let stage_bar = stage_bar.clone();
            let agg_fetched = Arc::clone(&agg_fetched);
            let agg_discussions = Arc::clone(&agg_discussions);
            let agg_disc_total = Arc::clone(&agg_disc_total);
            let agg_events = Arc::clone(&agg_events);
            let agg_events_total = Arc::clone(&agg_events_total);
            async move {
                let proj_conn = create_connection(&db_path)?;
                let multi = crate::cli::progress::multi();
                // Per-project fetch spinner (hidden when the caller disables it).
                let spinner = if !display.show_spinner {
                    ProgressBar::hidden()
                } else {
                    let s = multi.add(ProgressBar::new_spinner());
                    s.set_style(
                        ProgressStyle::default_spinner()
                            .template("{spinner:.blue} {msg}")
                            .unwrap(),
                    );
                    s.set_message(format!("Fetching {type_label} from {path}..."));
                    s.enable_steady_tick(std::time::Duration::from_millis(100));
                    s
                };
                // Per-project progress bar; starts styled for discussions and
                // is re-styled later for the resource-events phase.
                let disc_bar = if !display.show_progress {
                    ProgressBar::hidden()
                } else {
                    let b = multi.add(ProgressBar::new(0));
                    b.set_style(
                        ProgressStyle::default_bar()
                            .template(
                                " {spinner:.blue} {prefix:.cyan} Syncing discussions [{bar:30.cyan/dim}] {pos}/{len}",
                            )
                            .unwrap()
                            .progress_chars("=> "),
                    );
                    b.set_prefix(path.clone());
                    b
                };
                // Second set of clones is moved into the progress callback;
                // the originals stay usable after the ingest call returns.
                let spinner_clone = spinner.clone();
                let disc_bar_clone = disc_bar.clone();
                let stage_bar_clone = stage_bar.clone();
                let agg_fetched_clone = Arc::clone(&agg_fetched);
                let agg_discussions_clone = Arc::clone(&agg_discussions);
                let agg_disc_total_clone = Arc::clone(&agg_disc_total);
                let agg_events_clone = Arc::clone(&agg_events);
                let agg_events_total_clone = Arc::clone(&agg_events_total);
                let path_for_cb = path.clone();
                let progress_callback: crate::ingestion::ProgressCallback = if !display.show_progress {
                    // No-op callback when progress UI is disabled.
                    Box::new(|_| {})
                } else {
                    Box::new(move |event: ProgressEvent| match event {
                        ProgressEvent::IssuesFetchStarted | ProgressEvent::MrsFetchStarted => {
                            // Spinner already showing fetch message
                        }
                        ProgressEvent::IssuesFetchComplete { total } | ProgressEvent::MrsFetchComplete { total } => {
                            // fetch_add returns the previous value, so add
                            // `total` back to report the post-update aggregate.
                            let agg = agg_fetched_clone.fetch_add(total, Ordering::Relaxed) + total;
                            spinner_clone.set_message(format!(
                                "{path_for_cb}: {total} {type_label} fetched"
                            ));
                            stage_bar_clone.set_message(format!(
                                "Fetching {type_label}... ({agg} fetched across projects)"
                            ));
                        }
                        ProgressEvent::IssueFetched { count } | ProgressEvent::MrFetched { count } => {
                            spinner_clone.set_message(format!(
                                "{path_for_cb}: {count} fetched so far..."
                            ));
                        }
                        ProgressEvent::DiscussionSyncStarted { total } => {
                            // Fetch phase done for this project: swap the
                            // spinner for the discussion progress bar.
                            spinner_clone.finish_and_clear();
                            let agg_total = agg_disc_total_clone.fetch_add(total, Ordering::Relaxed) + total;
                            disc_bar_clone.set_length(total as u64);
                            disc_bar_clone.enable_steady_tick(std::time::Duration::from_millis(100));
                            stage_bar_clone.set_message(format!(
                                "Syncing discussions... (0/{agg_total})"
                            ));
                        }
                        ProgressEvent::DiscussionSynced { current, total: _ } => {
                            // `current` drives the local bar; the shared atomics
                            // drive the cross-project stage message.
                            disc_bar_clone.set_position(current as u64);
                            let agg = agg_discussions_clone.fetch_add(1, Ordering::Relaxed) + 1;
                            let agg_total = agg_disc_total_clone.load(Ordering::Relaxed);
                            stage_bar_clone.set_message(format!(
                                "Syncing discussions... ({agg}/{agg_total})"
                            ));
                        }
                        ProgressEvent::DiscussionSyncComplete => {
                            disc_bar_clone.finish_and_clear();
                        }
                        // MR variants mirror the issue discussion handling above.
                        ProgressEvent::MrDiscussionSyncStarted { total } => {
                            spinner_clone.finish_and_clear();
                            let agg_total = agg_disc_total_clone.fetch_add(total, Ordering::Relaxed) + total;
                            disc_bar_clone.set_length(total as u64);
                            disc_bar_clone.enable_steady_tick(std::time::Duration::from_millis(100));
                            stage_bar_clone.set_message(format!(
                                "Syncing discussions... (0/{agg_total})"
                            ));
                        }
                        ProgressEvent::MrDiscussionSynced { current, total: _ } => {
                            disc_bar_clone.set_position(current as u64);
                            let agg = agg_discussions_clone.fetch_add(1, Ordering::Relaxed) + 1;
                            let agg_total = agg_disc_total_clone.load(Ordering::Relaxed);
                            stage_bar_clone.set_message(format!(
                                "Syncing discussions... ({agg}/{agg_total})"
                            ));
                        }
                        ProgressEvent::MrDiscussionSyncComplete => {
                            disc_bar_clone.finish_and_clear();
                        }
                        ProgressEvent::ResourceEventsFetchStarted { total } => {
                            // Reuse the discussion bar for the resource-event
                            // phase, with a template naming the new phase.
                            disc_bar_clone.reset();
                            disc_bar_clone.set_length(total as u64);
                            disc_bar_clone.set_style(
                                ProgressStyle::default_bar()
                                    .template(" {spinner:.blue} {prefix:.cyan} Fetching resource events [{bar:30.cyan/dim}] {pos}/{len}")
                                    .unwrap()
                                    .progress_chars("=> "),
                            );
                            disc_bar_clone.enable_steady_tick(std::time::Duration::from_millis(100));
                            agg_events_total_clone.fetch_add(total, Ordering::Relaxed);
                            stage_bar_clone.set_message(
                                "Fetching resource events...".to_string()
                            );
                        }
                        ProgressEvent::ResourceEventFetched { current, total: _ } => {
                            disc_bar_clone.set_position(current as u64);
                            let agg = agg_events_clone.fetch_add(1, Ordering::Relaxed) + 1;
                            let agg_total = agg_events_total_clone.load(Ordering::Relaxed);
                            stage_bar_clone.set_message(format!(
                                "Fetching resource events... ({agg}/{agg_total})"
                            ));
                        }
                        ProgressEvent::ResourceEventsFetchComplete { .. } => {
                            disc_bar_clone.finish_and_clear();
                        }
                    })
                };
                let outcome = if resource_type == "issues" {
                    let result = ingest_project_issues_with_progress(
                        &proj_conn,
                        &client,
                        &config,
                        local_project_id,
                        gitlab_project_id,
                        Some(progress_callback),
                    )
                    .await?;
                    // Ensure both indicators are cleared even if the callback
                    // never saw a terminal event.
                    spinner.finish_and_clear();
                    disc_bar.finish_and_clear();
                    ProjectIngestOutcome::Issues { path, result }
                } else {
                    let result = ingest_project_merge_requests_with_progress(
                        &proj_conn,
                        &client,
                        &config,
                        local_project_id,
                        gitlab_project_id,
                        full,
                        Some(progress_callback),
                    )
                    .await?;
                    spinner.finish_and_clear();
                    disc_bar.finish_and_clear();
                    ProjectIngestOutcome::Mrs { path, result }
                };
                Ok(outcome)
            }
        })
        // At most `primary_concurrency` project futures in flight at once.
        .buffer_unordered(concurrency)
        .collect()
        .await;
    // Aggregate results and print per-project summaries.
    // Process all successes first, then return the first error (if any)
    // so that successful project summaries are always printed.
    let mut first_error: Option<LoreError> = None;
    for project_result in project_results {
        match project_result {
            Err(e) => {
                if first_error.is_none() {
                    first_error = Some(e);
                }
            }
            Ok(ProjectIngestOutcome::Issues {
                ref path,
                ref result,
            }) => {
                if display.show_text {
                    print_issue_project_summary(path, result);
                }
                total.projects_synced += 1;
                total.issues_fetched += result.issues_fetched;
                total.issues_upserted += result.issues_upserted;
                total.labels_created += result.labels_created;
                total.discussions_fetched += result.discussions_fetched;
                total.notes_upserted += result.notes_upserted;
                total.issues_synced_discussions += result.issues_synced_discussions;
                total.issues_skipped_discussion_sync += result.issues_skipped_discussion_sync;
                total.resource_events_fetched += result.resource_events_fetched;
                total.resource_events_failed += result.resource_events_failed;
            }
            Ok(ProjectIngestOutcome::Mrs {
                ref path,
                ref result,
            }) => {
                if display.show_text {
                    print_mr_project_summary(path, result);
                }
                total.projects_synced += 1;
                total.mrs_fetched += result.mrs_fetched;
                total.mrs_upserted += result.mrs_upserted;
                total.labels_created += result.labels_created;
                total.assignees_linked += result.assignees_linked;
                total.reviewers_linked += result.reviewers_linked;
                total.discussions_fetched += result.discussions_fetched;
                total.notes_upserted += result.notes_upserted;
                total.diffnotes_count += result.diffnotes_count;
                total.mrs_synced_discussions += result.mrs_synced_discussions;
                total.mrs_skipped_discussion_sync += result.mrs_skipped_discussion_sync;
                total.resource_events_fetched += result.resource_events_fetched;
                total.resource_events_failed += result.resource_events_failed;
            }
        }
    }
    if let Some(e) = first_error {
        return Err(e);
    }
    // Lock is released on drop
    Ok(total)
}
/// Get projects to sync from database, optionally filtered.
///
/// Returns `(local_id, gitlab_project_id, path_with_namespace)` tuples. With a
/// filter, resolves it to exactly one configured project or errors; without
/// one, returns every configured project that exists in the database.
fn get_projects_to_sync(
    conn: &Connection,
    configured_projects: &[crate::core::config::ProjectConfig],
    filter: Option<&str>,
) -> Result<Vec<(i64, i64, String)>> {
    match filter {
        Some(filter_str) => {
            let project_id = resolve_project(conn, filter_str)?;
            // Look up the resolved project's GitLab id and path.
            let row: Option<(i64, String)> = conn
                .query_row(
                    "SELECT gitlab_project_id, path_with_namespace FROM projects WHERE id = ?1",
                    [project_id],
                    |row| Ok((row.get(0)?, row.get(1)?)),
                )
                .ok();
            match row {
                Some((gitlab_id, path)) => {
                    // A project present in the DB must also be configured.
                    if configured_projects.iter().any(|p| p.path == path) {
                        Ok(vec![(project_id, gitlab_id, path)])
                    } else {
                        Err(LoreError::Other(format!(
                            "Project '{}' exists in database but is not in configuration",
                            path
                        )))
                    }
                }
                None => Err(LoreError::Other(format!(
                    "Project '{}' not found in database",
                    filter_str
                ))),
            }
        }
        None => {
            // No filter: map each configured path to its database row,
            // silently skipping projects not yet registered in the DB.
            let projects = configured_projects
                .iter()
                .filter_map(|project_config| {
                    let row: Option<(i64, i64)> = conn
                        .query_row(
                            "SELECT id, gitlab_project_id FROM projects WHERE path_with_namespace = ?",
                            [&project_config.path],
                            |row| Ok((row.get(0)?, row.get(1)?)),
                        )
                        .ok();
                    row.map(|(local_id, gitlab_id)| {
                        (local_id, gitlab_id, project_config.path.clone())
                    })
                })
                .collect();
            Ok(projects)
        }
    }
}
/// Print summary for a single project (issues).
fn print_issue_project_summary(path: &str, result: &IngestProjectResult) {
    // Mention labels only when some were actually created.
    let labels_suffix = match result.labels_created {
        0 => String::new(),
        n => format!(", {} new labels", n),
    };
    println!(
        " {}: {} issues fetched{}",
        style(path).cyan(),
        result.issues_upserted,
        labels_suffix
    );
    if result.issues_synced_discussions > 0 {
        println!(
            " {} issues -> {} discussions, {} notes",
            result.issues_synced_discussions, result.discussions_fetched, result.notes_upserted
        );
    }
    if result.issues_skipped_discussion_sync > 0 {
        println!(
            " {} unchanged issues (discussion sync skipped)",
            style(result.issues_skipped_discussion_sync).dim()
        );
    }
}
/// Print summary for a single project (merge requests).
fn print_mr_project_summary(path: &str, result: &IngestMrProjectResult) {
    // Optional suffixes are empty strings when the counts are zero.
    let labels_suffix = match result.labels_created {
        0 => String::new(),
        n => format!(", {} new labels", n),
    };
    let assignees_suffix = if result.assignees_linked == 0 && result.reviewers_linked == 0 {
        String::new()
    } else {
        format!(
            ", {} assignees, {} reviewers",
            result.assignees_linked, result.reviewers_linked
        )
    };
    println!(
        " {}: {} MRs fetched{}{}",
        style(path).cyan(),
        result.mrs_upserted,
        labels_suffix,
        assignees_suffix
    );
    if result.mrs_synced_discussions > 0 {
        let diffnotes_suffix = match result.diffnotes_count {
            0 => String::new(),
            n => format!(" ({} diff notes)", n),
        };
        println!(
            " {} MRs -> {} discussions, {} notes{}",
            result.mrs_synced_discussions,
            result.discussions_fetched,
            result.notes_upserted,
            diffnotes_suffix
        );
    }
    if result.mrs_skipped_discussion_sync > 0 {
        println!(
            " {} unchanged MRs (discussion sync skipped)",
            style(result.mrs_skipped_discussion_sync).dim()
        );
    }
}
/// JSON output structures for robot mode.
///
/// Top-level envelope serialized as `{"ok": ..., "data": {...}}`.
#[derive(Serialize)]
struct IngestJsonOutput {
    // Always `true` as built by `print_ingest_summary_json` (only called on success).
    ok: bool,
    data: IngestJsonData,
}
/// Aggregated counts for the JSON summary. Exactly one of `issues` /
/// `merge_requests` is populated, matching `resource_type`.
#[derive(Serialize)]
struct IngestJsonData {
    resource_type: String,
    projects_synced: usize,
    // Present only for issue runs; omitted from the JSON otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    issues: Option<IngestIssueStats>,
    // Present only for MR runs; omitted from the JSON otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    merge_requests: Option<IngestMrStats>,
    labels_created: usize,
    discussions_fetched: usize,
    notes_upserted: usize,
    resource_events_fetched: usize,
    resource_events_failed: usize,
}
/// Issue-specific counts for the JSON summary.
#[derive(Serialize)]
struct IngestIssueStats {
    fetched: usize,
    upserted: usize,
    synced_discussions: usize,
    skipped_discussion_sync: usize,
}
/// Merge-request-specific counts for the JSON summary.
#[derive(Serialize)]
struct IngestMrStats {
    fetched: usize,
    upserted: usize,
    synced_discussions: usize,
    skipped_discussion_sync: usize,
    assignees_linked: usize,
    reviewers_linked: usize,
    diffnotes_count: usize,
}
/// Print final summary as JSON (robot mode).
///
/// Emits a single line of the form `{"ok": true, "data": {...}}` on stdout,
/// with either `issues` or `merge_requests` populated depending on
/// `result.resource_type`.
pub fn print_ingest_summary_json(result: &IngestResult) {
    let (issues, merge_requests) = if result.resource_type == "issues" {
        (
            Some(IngestIssueStats {
                fetched: result.issues_fetched,
                upserted: result.issues_upserted,
                synced_discussions: result.issues_synced_discussions,
                skipped_discussion_sync: result.issues_skipped_discussion_sync,
            }),
            None,
        )
    } else {
        (
            None,
            Some(IngestMrStats {
                fetched: result.mrs_fetched,
                upserted: result.mrs_upserted,
                synced_discussions: result.mrs_synced_discussions,
                skipped_discussion_sync: result.mrs_skipped_discussion_sync,
                assignees_linked: result.assignees_linked,
                reviewers_linked: result.reviewers_linked,
                diffnotes_count: result.diffnotes_count,
            }),
        )
    };
    let output = IngestJsonOutput {
        ok: true,
        data: IngestJsonData {
            resource_type: result.resource_type.clone(),
            projects_synced: result.projects_synced,
            issues,
            merge_requests,
            labels_created: result.labels_created,
            discussions_fetched: result.discussions_fetched,
            notes_upserted: result.notes_upserted,
            resource_events_fetched: result.resource_events_fetched,
            resource_events_failed: result.resource_events_failed,
        },
    };
    // Serializing these plain structs of integers/strings cannot fail; a
    // failure here would indicate a bug in the Serialize derives, so state
    // the invariant instead of a bare unwrap.
    println!(
        "{}",
        serde_json::to_string(&output).expect("ingest summary JSON serialization cannot fail")
    );
}
/// Print final summary.
pub fn print_ingest_summary(result: &IngestResult) {
    println!();
    match result.resource_type.as_str() {
        "issues" => {
            println!(
                "{}",
                style(format!(
                    "Total: {} issues, {} discussions, {} notes",
                    result.issues_upserted, result.discussions_fetched, result.notes_upserted
                ))
                .green()
            );
            if result.issues_skipped_discussion_sync > 0 {
                println!(
                    "{}",
                    style(format!(
                        "Skipped discussion sync for {} unchanged issues.",
                        result.issues_skipped_discussion_sync
                    ))
                    .dim()
                );
            }
        }
        _ => {
            // Anything other than "issues" is the MR summary.
            let diffnotes_suffix = match result.diffnotes_count {
                0 => String::new(),
                n => format!(" ({} diff notes)", n),
            };
            println!(
                "{}",
                style(format!(
                    "Total: {} MRs, {} discussions, {} notes{}",
                    result.mrs_upserted,
                    result.discussions_fetched,
                    result.notes_upserted,
                    diffnotes_suffix
                ))
                .green()
            );
            if result.mrs_skipped_discussion_sync > 0 {
                println!(
                    "{}",
                    style(format!(
                        "Skipped discussion sync for {} unchanged MRs.",
                        result.mrs_skipped_discussion_sync
                    ))
                    .dim()
                );
            }
        }
    }
    if result.resource_events_fetched > 0 || result.resource_events_failed > 0 {
        let failed_suffix = if result.resource_events_failed > 0 {
            format!(", {} failed", result.resource_events_failed)
        } else {
            String::new()
        };
        println!(
            " Resource events: {} fetched{}",
            result.resource_events_fetched, failed_suffix
        );
    }
}