Embedding pipeline improvements building on the concurrent batching foundation: - Track docs_embedded vs chunks_embedded separately. A document counts as embedded only when ALL its chunks succeed, giving accurate progress reporting. The sync command reads docs_embedded for its document count. - Reuse a single Vec<u8> buffer (embed_buf) across all store_embedding calls instead of allocating per chunk. Eliminates ~3KB allocation per 768-dim embedding. - Detect and record errors when Ollama silently returns fewer embeddings than inputs (batch mismatch). Previously these dropped chunks were invisible. - Improve retry error messages: distinguish "retry returned unexpected result" (wrong dims/count) from "retry request failed" (network error) instead of generic "chunk too large" message. - Convert all hot-path SQL from conn.execute() to prepare_cached() for statement cache reuse (clear_document_embeddings, store_embedding, record_embedding_error). - Record embedding_metadata errors for empty documents so they don't appear as perpetually pending on subsequent runs. - Accept concurrency parameter (configurable via config.embedding.concurrency) instead of hardcoded EMBED_CONCURRENCY=2. - Add schema version pre-flight check in embed command to fail fast with actionable error instead of cryptic SQL errors. - Fix --retry-failed to use DELETE instead of UPDATE. UPDATE clears last_error but the row still matches config params in the LEFT JOIN, making the doc permanently invisible to find_pending_documents. DELETE removes the row entirely so the LEFT JOIN returns NULL. Regression test added (old_update_approach_leaves_doc_invisible). - Add chunking forward-progress guard: after floor_char_boundary() rounds backward, ensure start advances by at least one full character to prevent infinite loops on multi-byte sequences (box-drawing chars, smart quotes). Test cases cover the exact patterns that caused production hangs on document 18526. 
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
563 lines
18 KiB
Rust
563 lines
18 KiB
Rust
use console::style;
|
|
use indicatif::{ProgressBar, ProgressStyle};
|
|
use serde::Serialize;
|
|
use std::sync::Arc;
|
|
use std::sync::atomic::{AtomicBool, Ordering};
|
|
use tracing::Instrument;
|
|
use tracing::{info, warn};
|
|
|
|
use crate::Config;
|
|
use crate::core::error::Result;
|
|
use crate::core::metrics::{MetricsLayer, StageTiming};
|
|
use crate::core::shutdown::ShutdownSignal;
|
|
|
|
use super::embed::run_embed;
|
|
use super::generate_docs::run_generate_docs;
|
|
use super::ingest::{DryRunPreview, IngestDisplay, run_ingest, run_ingest_dry_run};
|
|
|
|
#[derive(Debug, Default)]
|
|
pub struct SyncOptions {
|
|
pub full: bool,
|
|
pub force: bool,
|
|
pub no_embed: bool,
|
|
pub no_docs: bool,
|
|
pub no_events: bool,
|
|
pub robot_mode: bool,
|
|
pub dry_run: bool,
|
|
}
|
|
|
|
#[derive(Debug, Default, Serialize)]
|
|
pub struct SyncResult {
|
|
#[serde(skip)]
|
|
pub run_id: String,
|
|
pub issues_updated: usize,
|
|
pub mrs_updated: usize,
|
|
pub discussions_fetched: usize,
|
|
pub resource_events_fetched: usize,
|
|
pub resource_events_failed: usize,
|
|
pub documents_regenerated: usize,
|
|
pub documents_embedded: usize,
|
|
}
|
|
|
|
fn stage_spinner(stage: u8, total: u8, msg: &str, robot_mode: bool) -> ProgressBar {
|
|
if robot_mode {
|
|
return ProgressBar::hidden();
|
|
}
|
|
let pb = crate::cli::progress::multi().add(ProgressBar::new_spinner());
|
|
pb.set_style(
|
|
ProgressStyle::default_spinner()
|
|
.template("{spinner:.blue} {prefix} {msg}")
|
|
.expect("valid template"),
|
|
);
|
|
pb.enable_steady_tick(std::time::Duration::from_millis(80));
|
|
pb.set_prefix(format!("[{stage}/{total}]"));
|
|
pb.set_message(msg.to_string());
|
|
pb
|
|
}
|
|
|
|
/// Run the full sync pipeline under a tracing span: ingest issues, ingest
/// merge requests, then optionally generate documents and embeddings.
///
/// Stage count (2–4) is derived from `options.no_docs` / `options.no_embed`
/// so progress prefixes stay accurate. Cancellation via `signal` after
/// either ingest stage returns the partial `SyncResult` as `Ok`, not an
/// error. An embedding failure is downgraded to a warning so sync still
/// succeeds when Ollama is unavailable.
///
/// `run_id` labels the tracing span; when `None`, the first 8 hex chars of
/// a fresh UUID are used. `options.dry_run` short-circuits into
/// `run_sync_dry_run`, which prints a preview and returns an empty result.
pub async fn run_sync(
    config: &Config,
    options: SyncOptions,
    run_id: Option<&str>,
    signal: &ShutdownSignal,
) -> Result<SyncResult> {
    // `generated_id` lives in the outer scope so `run_id` can borrow from it.
    let generated_id;
    let run_id = match run_id {
        Some(id) => id,
        None => {
            generated_id = uuid::Uuid::new_v4().simple().to_string();
            // Simple (dashless) UUID is 32 hex chars; take a short prefix.
            &generated_id[..8]
        }
    };
    let span = tracing::info_span!("sync", %run_id);

    async move {
        let mut result = SyncResult {
            run_id: run_id.to_string(),
            ..SyncResult::default()
        };

        // Handle dry_run mode - show preview without making any changes
        if options.dry_run {
            return run_sync_dry_run(config, &options).await;
        }

        // Ingest display is reused for both ingest stages below.
        let ingest_display = if options.robot_mode {
            IngestDisplay::silent()
        } else {
            IngestDisplay::progress_only()
        };

        // 2 mandatory ingest stages plus one each for docs and embedding.
        let total_stages: u8 = if options.no_docs && options.no_embed {
            2
        } else if options.no_docs || options.no_embed {
            3
        } else {
            4
        };
        let mut current_stage: u8 = 0;

        // --- Stage: issues ingest -------------------------------------
        current_stage += 1;
        let spinner = stage_spinner(
            current_stage,
            total_stages,
            "Fetching issues from GitLab...",
            options.robot_mode,
        );
        info!("Sync stage {current_stage}/{total_stages}: ingesting issues");
        let issues_result = run_ingest(
            config,
            "issues",
            None,
            options.force,
            options.full,
            false, // dry_run - sync has its own dry_run handling
            ingest_display,
            Some(spinner.clone()),
            signal,
        )
        .await?;
        result.issues_updated = issues_result.issues_upserted;
        result.discussions_fetched += issues_result.discussions_fetched;
        result.resource_events_fetched += issues_result.resource_events_fetched;
        result.resource_events_failed += issues_result.resource_events_failed;
        spinner.finish_and_clear();

        // Graceful shutdown: partial results are still useful to the caller.
        if signal.is_cancelled() {
            info!("Shutdown requested after issues stage, returning partial sync results");
            return Ok(result);
        }

        // --- Stage: merge requests ingest -----------------------------
        current_stage += 1;
        let spinner = stage_spinner(
            current_stage,
            total_stages,
            "Fetching merge requests from GitLab...",
            options.robot_mode,
        );
        info!("Sync stage {current_stage}/{total_stages}: ingesting merge requests");
        let mrs_result = run_ingest(
            config,
            "mrs",
            None,
            options.force,
            options.full,
            false, // dry_run - sync has its own dry_run handling
            ingest_display,
            Some(spinner.clone()),
            signal,
        )
        .await?;
        result.mrs_updated = mrs_result.mrs_upserted;
        result.discussions_fetched += mrs_result.discussions_fetched;
        result.resource_events_fetched += mrs_result.resource_events_fetched;
        result.resource_events_failed += mrs_result.resource_events_failed;
        spinner.finish_and_clear();

        if signal.is_cancelled() {
            info!("Shutdown requested after MRs stage, returning partial sync results");
            return Ok(result);
        }

        // --- Stage: document generation (optional) --------------------
        if !options.no_docs {
            current_stage += 1;
            let spinner = stage_spinner(
                current_stage,
                total_stages,
                "Processing documents...",
                options.robot_mode,
            );
            info!("Sync stage {current_stage}/{total_stages}: generating documents");

            let docs_bar = if options.robot_mode {
                ProgressBar::hidden()
            } else {
                let b = crate::cli::progress::multi().add(ProgressBar::new(0));
                b.set_style(
                    ProgressStyle::default_bar()
                        .template(
                            " {spinner:.blue} Processing documents [{bar:30.cyan/dim}] {pos}/{len}",
                        )
                        .unwrap()
                        .progress_chars("=> "),
                );
                b
            };
            let docs_bar_clone = docs_bar.clone();
            // Ticking starts lazily on the first real progress callback so an
            // empty run never animates the bar; the AtomicBool makes the
            // "enable once" decision race-free if the callback is reentered.
            let tick_started = Arc::new(AtomicBool::new(false));
            let tick_started_clone = Arc::clone(&tick_started);
            let docs_cb: Box<dyn Fn(usize, usize)> = Box::new(move |processed, total| {
                if total > 0 {
                    if !tick_started_clone.swap(true, Ordering::Relaxed) {
                        docs_bar_clone.enable_steady_tick(std::time::Duration::from_millis(100));
                    }
                    docs_bar_clone.set_length(total as u64);
                    docs_bar_clone.set_position(processed as u64);
                }
            });
            let docs_result = run_generate_docs(config, options.full, None, Some(docs_cb))?;
            result.documents_regenerated = docs_result.regenerated;
            docs_bar.finish_and_clear();
            spinner.finish_and_clear();
        } else {
            info!("Sync: skipping document generation (--no-docs)");
        }

        // --- Stage: embedding (optional, failure-tolerant) ------------
        if !options.no_embed {
            current_stage += 1;
            let spinner = stage_spinner(
                current_stage,
                total_stages,
                "Generating embeddings...",
                options.robot_mode,
            );
            info!("Sync stage {current_stage}/{total_stages}: embedding documents");

            let embed_bar = if options.robot_mode {
                ProgressBar::hidden()
            } else {
                let b = crate::cli::progress::multi().add(ProgressBar::new(0));
                b.set_style(
                    ProgressStyle::default_bar()
                        .template(
                            " {spinner:.blue} Generating embeddings [{bar:30.cyan/dim}] {pos}/{len}",
                        )
                        .unwrap()
                        .progress_chars("=> "),
                );
                b
            };
            let embed_bar_clone = embed_bar.clone();
            // Same lazy-tick pattern as the docs bar above.
            let tick_started = Arc::new(AtomicBool::new(false));
            let tick_started_clone = Arc::clone(&tick_started);
            let embed_cb: Box<dyn Fn(usize, usize)> = Box::new(move |processed, total| {
                if total > 0 {
                    if !tick_started_clone.swap(true, Ordering::Relaxed) {
                        embed_bar_clone.enable_steady_tick(std::time::Duration::from_millis(100));
                    }
                    embed_bar_clone.set_length(total as u64);
                    embed_bar_clone.set_position(processed as u64);
                }
            });
            // Embedding errors do not abort the sync: the pipeline result is
            // still valid without embeddings (Ollama may simply be down).
            match run_embed(config, options.full, false, Some(embed_cb), signal).await {
                Ok(embed_result) => {
                    // docs_embedded counts documents whose chunks ALL embedded.
                    result.documents_embedded = embed_result.docs_embedded;
                    embed_bar.finish_and_clear();
                    spinner.finish_and_clear();
                }
                Err(e) => {
                    embed_bar.finish_and_clear();
                    spinner.finish_and_clear();
                    if !options.robot_mode {
                        eprintln!(" {} Embedding skipped ({})", style("warn").yellow(), e);
                    }
                    warn!(error = %e, "Embedding stage failed (Ollama may be unavailable), continuing");
                }
            }
        } else {
            info!("Sync: skipping embedding (--no-embed)");
        }

        info!(
            issues = result.issues_updated,
            mrs = result.mrs_updated,
            discussions = result.discussions_fetched,
            resource_events = result.resource_events_fetched,
            resource_events_failed = result.resource_events_failed,
            docs = result.documents_regenerated,
            embedded = result.documents_embedded,
            "Sync pipeline complete"
        );

        Ok(result)
    }
    .instrument(span)
    .await
}
|
|
|
|
pub fn print_sync(
|
|
result: &SyncResult,
|
|
elapsed: std::time::Duration,
|
|
metrics: Option<&MetricsLayer>,
|
|
) {
|
|
println!("{} Sync complete:", style("done").green().bold(),);
|
|
println!(" Issues updated: {}", result.issues_updated);
|
|
println!(" MRs updated: {}", result.mrs_updated);
|
|
println!(
|
|
" Discussions fetched: {}",
|
|
result.discussions_fetched
|
|
);
|
|
if result.resource_events_fetched > 0 || result.resource_events_failed > 0 {
|
|
println!(
|
|
" Resource events fetched: {}",
|
|
result.resource_events_fetched
|
|
);
|
|
if result.resource_events_failed > 0 {
|
|
println!(
|
|
" Resource events failed: {}",
|
|
result.resource_events_failed
|
|
);
|
|
}
|
|
}
|
|
println!(
|
|
" Documents regenerated: {}",
|
|
result.documents_regenerated
|
|
);
|
|
println!(" Documents embedded: {}", result.documents_embedded);
|
|
println!(" Elapsed: {:.1}s", elapsed.as_secs_f64());
|
|
|
|
if let Some(metrics) = metrics {
|
|
let stages = metrics.extract_timings();
|
|
if !stages.is_empty() {
|
|
print_timing_summary(&stages);
|
|
}
|
|
}
|
|
}
|
|
|
|
fn print_timing_summary(stages: &[StageTiming]) {
|
|
println!();
|
|
println!("{}", style("Stage timing:").dim());
|
|
for stage in stages {
|
|
for sub in &stage.sub_stages {
|
|
print_stage_line(sub, 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
fn print_stage_line(stage: &StageTiming, depth: usize) {
|
|
let indent = " ".repeat(depth);
|
|
let name = if let Some(ref project) = stage.project {
|
|
format!("{} ({})", stage.name, project)
|
|
} else {
|
|
stage.name.clone()
|
|
};
|
|
let pad_width = 30_usize.saturating_sub(indent.len() + name.len());
|
|
let dots = ".".repeat(pad_width.max(2));
|
|
|
|
let mut suffix = String::new();
|
|
if stage.items_processed > 0 {
|
|
suffix.push_str(&format!("{} items", stage.items_processed));
|
|
}
|
|
if stage.errors > 0 {
|
|
if !suffix.is_empty() {
|
|
suffix.push_str(", ");
|
|
}
|
|
suffix.push_str(&format!("{} errors", stage.errors));
|
|
}
|
|
if stage.rate_limit_hits > 0 {
|
|
if !suffix.is_empty() {
|
|
suffix.push_str(", ");
|
|
}
|
|
suffix.push_str(&format!("{} rate limits", stage.rate_limit_hits));
|
|
}
|
|
|
|
let time_str = format!("{:.1}s", stage.elapsed_ms as f64 / 1000.0);
|
|
if suffix.is_empty() {
|
|
println!("{indent}{name} {dots} {time_str}");
|
|
} else {
|
|
println!("{indent}{name} {dots} {time_str} ({suffix})");
|
|
}
|
|
|
|
for sub in &stage.sub_stages {
|
|
print_stage_line(sub, depth + 1);
|
|
}
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct SyncJsonOutput<'a> {
|
|
ok: bool,
|
|
data: &'a SyncResult,
|
|
meta: SyncMeta,
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct SyncMeta {
|
|
run_id: String,
|
|
elapsed_ms: u64,
|
|
#[serde(skip_serializing_if = "Vec::is_empty")]
|
|
stages: Vec<StageTiming>,
|
|
}
|
|
|
|
pub fn print_sync_json(result: &SyncResult, elapsed_ms: u64, metrics: Option<&MetricsLayer>) {
|
|
let stages = metrics.map_or_else(Vec::new, MetricsLayer::extract_timings);
|
|
let output = SyncJsonOutput {
|
|
ok: true,
|
|
data: result,
|
|
meta: SyncMeta {
|
|
run_id: result.run_id.clone(),
|
|
elapsed_ms,
|
|
stages,
|
|
},
|
|
};
|
|
println!("{}", serde_json::to_string(&output).unwrap());
|
|
}
|
|
|
|
#[derive(Debug, Default, Serialize)]
|
|
pub struct SyncDryRunResult {
|
|
pub issues_preview: DryRunPreview,
|
|
pub mrs_preview: DryRunPreview,
|
|
pub would_generate_docs: bool,
|
|
pub would_embed: bool,
|
|
}
|
|
|
|
async fn run_sync_dry_run(config: &Config, options: &SyncOptions) -> Result<SyncResult> {
|
|
// Get dry run previews for both issues and MRs
|
|
let issues_preview = run_ingest_dry_run(config, "issues", None, options.full)?;
|
|
let mrs_preview = run_ingest_dry_run(config, "mrs", None, options.full)?;
|
|
|
|
let dry_result = SyncDryRunResult {
|
|
issues_preview,
|
|
mrs_preview,
|
|
would_generate_docs: !options.no_docs,
|
|
would_embed: !options.no_embed,
|
|
};
|
|
|
|
if options.robot_mode {
|
|
print_sync_dry_run_json(&dry_result);
|
|
} else {
|
|
print_sync_dry_run(&dry_result);
|
|
}
|
|
|
|
// Return an empty SyncResult since this is just a preview
|
|
Ok(SyncResult::default())
|
|
}
|
|
|
|
pub fn print_sync_dry_run(result: &SyncDryRunResult) {
|
|
println!(
|
|
"{} {}",
|
|
style("Sync Dry Run Preview").cyan().bold(),
|
|
style("(no changes will be made)").yellow()
|
|
);
|
|
println!();
|
|
|
|
println!("{}", style("Stage 1: Issues Ingestion").white().bold());
|
|
println!(
|
|
" Sync mode: {}",
|
|
if result.issues_preview.sync_mode == "full" {
|
|
style("full").yellow()
|
|
} else {
|
|
style("incremental").green()
|
|
}
|
|
);
|
|
println!(" Projects: {}", result.issues_preview.projects.len());
|
|
for project in &result.issues_preview.projects {
|
|
let sync_status = if !project.has_cursor {
|
|
style("initial sync").yellow()
|
|
} else {
|
|
style("incremental").green()
|
|
};
|
|
println!(
|
|
" {} ({}) - {} existing",
|
|
&project.path, sync_status, project.existing_count
|
|
);
|
|
}
|
|
println!();
|
|
|
|
println!(
|
|
"{}",
|
|
style("Stage 2: Merge Requests Ingestion").white().bold()
|
|
);
|
|
println!(
|
|
" Sync mode: {}",
|
|
if result.mrs_preview.sync_mode == "full" {
|
|
style("full").yellow()
|
|
} else {
|
|
style("incremental").green()
|
|
}
|
|
);
|
|
println!(" Projects: {}", result.mrs_preview.projects.len());
|
|
for project in &result.mrs_preview.projects {
|
|
let sync_status = if !project.has_cursor {
|
|
style("initial sync").yellow()
|
|
} else {
|
|
style("incremental").green()
|
|
};
|
|
println!(
|
|
" {} ({}) - {} existing",
|
|
&project.path, sync_status, project.existing_count
|
|
);
|
|
}
|
|
println!();
|
|
|
|
if result.would_generate_docs {
|
|
println!(
|
|
"{} {}",
|
|
style("Stage 3: Document Generation").white().bold(),
|
|
style("(would run)").green()
|
|
);
|
|
} else {
|
|
println!(
|
|
"{} {}",
|
|
style("Stage 3: Document Generation").white().bold(),
|
|
style("(skipped)").dim()
|
|
);
|
|
}
|
|
|
|
if result.would_embed {
|
|
println!(
|
|
"{} {}",
|
|
style("Stage 4: Embedding").white().bold(),
|
|
style("(would run)").green()
|
|
);
|
|
} else {
|
|
println!(
|
|
"{} {}",
|
|
style("Stage 4: Embedding").white().bold(),
|
|
style("(skipped)").dim()
|
|
);
|
|
}
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct SyncDryRunJsonOutput {
|
|
ok: bool,
|
|
dry_run: bool,
|
|
data: SyncDryRunJsonData,
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct SyncDryRunJsonData {
|
|
stages: Vec<SyncDryRunStage>,
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct SyncDryRunStage {
|
|
name: String,
|
|
would_run: bool,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
preview: Option<DryRunPreview>,
|
|
}
|
|
|
|
pub fn print_sync_dry_run_json(result: &SyncDryRunResult) {
|
|
let output = SyncDryRunJsonOutput {
|
|
ok: true,
|
|
dry_run: true,
|
|
data: SyncDryRunJsonData {
|
|
stages: vec![
|
|
SyncDryRunStage {
|
|
name: "ingest_issues".to_string(),
|
|
would_run: true,
|
|
preview: Some(result.issues_preview.clone()),
|
|
},
|
|
SyncDryRunStage {
|
|
name: "ingest_mrs".to_string(),
|
|
would_run: true,
|
|
preview: Some(result.mrs_preview.clone()),
|
|
},
|
|
SyncDryRunStage {
|
|
name: "generate_docs".to_string(),
|
|
would_run: result.would_generate_docs,
|
|
preview: None,
|
|
},
|
|
SyncDryRunStage {
|
|
name: "embed".to_string(),
|
|
would_run: result.would_embed,
|
|
preview: None,
|
|
},
|
|
],
|
|
},
|
|
};
|
|
|
|
println!("{}", serde_json::to_string(&output).unwrap());
|
|
}
|