From daf5a730197815b3a053eb5517fdef7fd3c81d4a Mon Sep 17 00:00:00 2001 From: Taylor Eernisse Date: Fri, 30 Jan 2026 15:47:10 -0500 Subject: [PATCH] feat(cli): Add search, stats, embed, generate-docs, sync, health, and robot-docs commands Extends the CLI with seven new commands that complete the search pipeline: - lore search: Hybrid search with mode selection (lexical, hybrid, semantic), rich filtering (--type, --author, --project, --label, --path, --after, --updated-after), result limits, and optional explain mode showing RRF score breakdowns. Safe FTS mode sanitizes user input; raw mode passes through for power users. - lore stats: Document and index statistics with optional --check for integrity verification and --repair to fix inconsistencies (orphaned documents, missing FTS entries, stale dirty queue items). - lore embed: Generate vector embeddings via Ollama. Supports --retry-failed to re-attempt previously failed embeddings. - lore generate-docs: Drain the dirty queue to regenerate documents. --full seeds all entities for complete rebuild. --project scopes to a single project. - lore sync: Full pipeline orchestration (ingest issues + MRs, generate-docs, embed) with --no-embed and --no-docs flags for partial runs. Reports per-stage results and total elapsed time. - lore health: Quick pre-flight check (config exists, DB exists, schema current). Returns exit code 1 if unhealthy. Designed for agent pre-flight scripts. - lore robot-docs: Machine-readable command manifest for agent self-discovery. Returns all commands, flags, examples, exit codes, and recommended workflows as structured JSON. Also enhances lore init with --gitlab-url, --token-env-var, and --projects flags for fully non-interactive robot-mode initialization. Fixes init's force/non-interactive precedence logic and adds JSON output for robot mode. Updates all command files for the GiError -> LoreError rename. 
Co-Authored-By: Claude Opus 4.5 --- src/cli/commands/auth_test.rs | 6 +- src/cli/commands/doctor.rs | 6 +- src/cli/commands/embed.rs | 88 ++++ src/cli/commands/generate_docs.rs | 205 ++++++++++ src/cli/commands/ingest.rs | 10 +- src/cli/commands/init.rs | 30 +- src/cli/commands/mod.rs | 12 + src/cli/commands/search.rs | 402 +++++++++++++++++++ src/cli/commands/show.rs | 10 +- src/cli/commands/stats.rs | 348 ++++++++++++++++ src/cli/commands/sync.rs | 124 ++++++ src/cli/mod.rs | 138 +++++++ src/main.rs | 646 +++++++++++++++++++++++++++--- 13 files changed, 1930 insertions(+), 95 deletions(-) create mode 100644 src/cli/commands/embed.rs create mode 100644 src/cli/commands/generate_docs.rs create mode 100644 src/cli/commands/search.rs create mode 100644 src/cli/commands/stats.rs create mode 100644 src/cli/commands/sync.rs diff --git a/src/cli/commands/auth_test.rs b/src/cli/commands/auth_test.rs index de98d4f..5562875 100644 --- a/src/cli/commands/auth_test.rs +++ b/src/cli/commands/auth_test.rs @@ -1,7 +1,7 @@ //! Auth test command - verify GitLab authentication. use crate::core::config::Config; -use crate::core::error::{GiError, Result}; +use crate::core::error::{LoreError, Result}; use crate::gitlab::GitLabClient; /// Result of successful auth test. @@ -19,12 +19,12 @@ pub async fn run_auth_test(config_path: Option<&str>) -> Result // 2. 
Get token from environment let token = std::env::var(&config.gitlab.token_env_var) .map(|t| t.trim().to_string()) - .map_err(|_| GiError::TokenNotSet { + .map_err(|_| LoreError::TokenNotSet { env_var: config.gitlab.token_env_var.clone(), })?; if token.is_empty() { - return Err(GiError::TokenNotSet { + return Err(LoreError::TokenNotSet { env_var: config.gitlab.token_env_var.clone(), }); } diff --git a/src/cli/commands/doctor.rs b/src/cli/commands/doctor.rs index db1c949..1d84667 100644 --- a/src/cli/commands/doctor.rs +++ b/src/cli/commands/doctor.rs @@ -5,7 +5,7 @@ use serde::Serialize; use crate::core::config::Config; use crate::core::db::{create_connection, get_schema_version, verify_pragmas}; -use crate::core::error::GiError; +use crate::core::error::LoreError; use crate::core::paths::{get_config_path, get_db_path}; use crate::gitlab::GitLabClient; @@ -137,7 +137,7 @@ fn check_config(config_path: &str) -> (ConfigCheck, Option) { }, Some(config), ), - Err(GiError::ConfigNotFound { path }) => ( + Err(LoreError::ConfigNotFound { path }) => ( ConfigCheck { result: CheckResult { status: CheckStatus::Error, @@ -264,7 +264,7 @@ async fn check_gitlab(config: Option<&Config>) -> GitLabCheck { url: Some(config.gitlab.base_url.clone()), username: Some(user.username), }, - Err(GiError::GitLabAuthFailed) => GitLabCheck { + Err(LoreError::GitLabAuthFailed) => GitLabCheck { result: CheckResult { status: CheckStatus::Error, message: Some("Authentication failed. Check your token.".to_string()), diff --git a/src/cli/commands/embed.rs b/src/cli/commands/embed.rs new file mode 100644 index 0000000..7a99fe7 --- /dev/null +++ b/src/cli/commands/embed.rs @@ -0,0 +1,88 @@ +//! Embed command: generate vector embeddings for documents via Ollama. 
+ +use console::style; +use serde::Serialize; + +use crate::core::db::create_connection; +use crate::core::error::Result; +use crate::core::paths::get_db_path; +use crate::embedding::ollama::{OllamaClient, OllamaConfig}; +use crate::embedding::pipeline::embed_documents; +use crate::Config; + +/// Result of the embed command. +#[derive(Debug, Default, Serialize)] +pub struct EmbedCommandResult { + pub embedded: usize, + pub failed: usize, + pub skipped: usize, +} + +/// Run the embed command. +pub async fn run_embed( + config: &Config, + retry_failed: bool, +) -> Result { + let db_path = get_db_path(config.storage.db_path.as_deref()); + let conn = create_connection(&db_path)?; + + // Build Ollama config from user settings + let ollama_config = OllamaConfig { + base_url: config.embedding.base_url.clone(), + model: config.embedding.model.clone(), + ..OllamaConfig::default() + }; + let client = OllamaClient::new(ollama_config); + + // Health check — fail fast if Ollama is down or model missing + client.health_check().await?; + + // If retry_failed, clear errors so they become pending again + if retry_failed { + conn.execute( + "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0 + WHERE last_error IS NOT NULL", + [], + )?; + } + + let model_name = &config.embedding.model; + let result = embed_documents(&conn, &client, model_name, None).await?; + + Ok(EmbedCommandResult { + embedded: result.embedded, + failed: result.failed, + skipped: result.skipped, + }) +} + +/// Print human-readable output. +pub fn print_embed(result: &EmbedCommandResult) { + println!( + "{} Embedding complete", + style("done").green().bold(), + ); + println!(" Embedded: {}", result.embedded); + if result.failed > 0 { + println!(" Failed: {}", style(result.failed).red()); + } + if result.skipped > 0 { + println!(" Skipped: {}", result.skipped); + } +} + +/// JSON output. 
+#[derive(Serialize)] +struct EmbedJsonOutput<'a> { + ok: bool, + data: &'a EmbedCommandResult, +} + +/// Print JSON robot-mode output. +pub fn print_embed_json(result: &EmbedCommandResult) { + let output = EmbedJsonOutput { + ok: true, + data: result, + }; + println!("{}", serde_json::to_string(&output).unwrap()); +} diff --git a/src/cli/commands/generate_docs.rs b/src/cli/commands/generate_docs.rs new file mode 100644 index 0000000..e6377c2 --- /dev/null +++ b/src/cli/commands/generate_docs.rs @@ -0,0 +1,205 @@ +//! Generate searchable documents from ingested GitLab data. + +use console::style; +use rusqlite::Connection; +use serde::Serialize; +use tracing::info; + +use crate::core::db::create_connection; +use crate::core::error::Result; +use crate::core::paths::get_db_path; +use crate::documents::{regenerate_dirty_documents, SourceType}; +use crate::Config; + +const FULL_MODE_CHUNK_SIZE: i64 = 2000; + +/// Result of a generate-docs run. +#[derive(Debug, Default)] +pub struct GenerateDocsResult { + pub regenerated: usize, + pub unchanged: usize, + pub errored: usize, + pub seeded: usize, + pub full_mode: bool, +} + +/// Run the generate-docs pipeline. +/// +/// Default mode: process only existing dirty_sources entries. +/// Full mode: seed dirty_sources with ALL entities, then drain. 
+pub fn run_generate_docs( + config: &Config, + full: bool, + project_filter: Option<&str>, +) -> Result { + let db_path = get_db_path(config.storage.db_path.as_deref()); + let conn = create_connection(&db_path)?; + let mut result = GenerateDocsResult { + full_mode: full, + ..Default::default() + }; + + if full { + result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?; + result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?; + result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?; + } + + let regen = regenerate_dirty_documents(&conn)?; + result.regenerated = regen.regenerated; + result.unchanged = regen.unchanged; + result.errored = regen.errored; + + if full { + // Optimize FTS index after bulk rebuild + let _ = conn.execute( + "INSERT INTO documents_fts(documents_fts) VALUES('optimize')", + [], + ); + info!("FTS index optimized after full rebuild"); + } + + Ok(result) +} + +/// Seed dirty_sources with all entities of the given type using keyset pagination. 
+fn seed_dirty( + conn: &Connection, + source_type: SourceType, + project_filter: Option<&str>, +) -> Result { + let table = match source_type { + SourceType::Issue => "issues", + SourceType::MergeRequest => "merge_requests", + SourceType::Discussion => "discussions", + }; + let type_str = source_type.as_str(); + let now = chrono::Utc::now().timestamp_millis(); + + let mut total_seeded: usize = 0; + let mut last_id: i64 = 0; + + loop { + let inserted = if let Some(project) = project_filter { + // Resolve project to ID for filtering + let project_id: Option = conn + .query_row( + "SELECT id FROM projects WHERE path_with_namespace = ?1 COLLATE NOCASE", + [project], + |row| row.get(0), + ) + .ok(); + + let Some(pid) = project_id else { + break; + }; + + conn.execute( + &format!( + "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at) + SELECT ?1, id, ?2, 0, NULL, NULL, NULL + FROM {table} WHERE id > ?3 AND project_id = ?4 ORDER BY id LIMIT ?5 + ON CONFLICT(source_type, source_id) DO NOTHING" + ), + rusqlite::params![type_str, now, last_id, pid, FULL_MODE_CHUNK_SIZE], + )? + } else { + conn.execute( + &format!( + "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at) + SELECT ?1, id, ?2, 0, NULL, NULL, NULL + FROM {table} WHERE id > ?3 ORDER BY id LIMIT ?4 + ON CONFLICT(source_type, source_id) DO NOTHING" + ), + rusqlite::params![type_str, now, last_id, FULL_MODE_CHUNK_SIZE], + )? 
+ }; + + if inserted == 0 { + break; + } + + // Advance keyset cursor to the max id within the chunk window + let max_id: i64 = conn.query_row( + &format!( + "SELECT MAX(id) FROM (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)", + table = table + ), + rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE], + |row| row.get(0), + )?; + + total_seeded += inserted; + last_id = max_id; + } + + info!( + source_type = type_str, + seeded = total_seeded, + "Seeded dirty_sources" + ); + + Ok(total_seeded) +} + +/// Print human-readable output. +pub fn print_generate_docs(result: &GenerateDocsResult) { + let mode = if result.full_mode { "full" } else { "incremental" }; + println!( + "{} Document generation complete ({})", + style("done").green().bold(), + mode + ); + + if result.full_mode { + println!(" Seeded: {}", result.seeded); + } + println!(" Regenerated: {}", result.regenerated); + println!(" Unchanged: {}", result.unchanged); + if result.errored > 0 { + println!( + " Errored: {}", + style(result.errored).red() + ); + } +} + +/// JSON output structures. +#[derive(Serialize)] +struct GenerateDocsJsonOutput { + ok: bool, + data: GenerateDocsJsonData, +} + +#[derive(Serialize)] +struct GenerateDocsJsonData { + mode: String, + #[serde(skip_serializing_if = "Option::is_none")] + seeded: Option, + regenerated: usize, + unchanged: usize, + errored: usize, +} + +/// Print JSON robot-mode output. 
+pub fn print_generate_docs_json(result: &GenerateDocsResult) { + let output = GenerateDocsJsonOutput { + ok: true, + data: GenerateDocsJsonData { + mode: if result.full_mode { + "full".to_string() + } else { + "incremental".to_string() + }, + seeded: if result.full_mode { + Some(result.seeded) + } else { + None + }, + regenerated: result.regenerated, + unchanged: result.unchanged, + errored: result.errored, + }, + }; + println!("{}", serde_json::to_string(&output).unwrap()); +} diff --git a/src/cli/commands/ingest.rs b/src/cli/commands/ingest.rs index 849b8d1..d8b71b7 100644 --- a/src/cli/commands/ingest.rs +++ b/src/cli/commands/ingest.rs @@ -7,7 +7,7 @@ use serde::Serialize; use crate::Config; use crate::core::db::create_connection; -use crate::core::error::{GiError, Result}; +use crate::core::error::{LoreError, Result}; use crate::core::lock::{AppLock, LockOptions}; use crate::core::paths::get_db_path; use crate::gitlab::GitLabClient; @@ -51,7 +51,7 @@ pub async fn run_ingest( ) -> Result { // Validate resource type early if resource_type != "issues" && resource_type != "mrs" { - return Err(GiError::Other(format!( + return Err(LoreError::Other(format!( "Invalid resource type '{}'. Valid types: issues, mrs", resource_type ))); @@ -74,7 +74,7 @@ pub async fn run_ingest( lock.acquire(force)?; // Get token from environment - let token = std::env::var(&config.gitlab.token_env_var).map_err(|_| GiError::TokenNotSet { + let token = std::env::var(&config.gitlab.token_env_var).map_err(|_| LoreError::TokenNotSet { env_var: config.gitlab.token_env_var.clone(), })?; @@ -119,12 +119,12 @@ pub async fn run_ingest( if projects.is_empty() { if let Some(filter) = project_filter { - return Err(GiError::Other(format!( + return Err(LoreError::Other(format!( "Project '{}' not found in configuration", filter ))); } - return Err(GiError::Other( + return Err(LoreError::Other( "No projects configured. 
Run 'lore init' first.".to_string(), )); } diff --git a/src/cli/commands/init.rs b/src/cli/commands/init.rs index 92a6a31..3dfaaba 100644 --- a/src/cli/commands/init.rs +++ b/src/cli/commands/init.rs @@ -4,7 +4,7 @@ use std::fs; use crate::core::config::{MinimalConfig, MinimalGitLabConfig, ProjectConfig}; use crate::core::db::{create_connection, run_migrations}; -use crate::core::error::{GiError, Result}; +use crate::core::error::{LoreError, Result}; use crate::core::paths::{get_config_path, get_data_dir}; use crate::gitlab::{GitLabClient, GitLabProject}; @@ -45,32 +45,30 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result Result Result, + pub author: Option, + pub created_at: Option, + pub updated_at: Option, + pub project_path: String, + pub labels: Vec, + pub paths: Vec, + pub snippet: String, + pub score: f64, + #[serde(skip_serializing_if = "Option::is_none")] + pub explain: Option, +} + +/// Ranking explanation for --explain output. +#[derive(Debug, Serialize)] +pub struct ExplainData { + pub vector_rank: Option, + pub fts_rank: Option, + pub rrf_score: f64, +} + +/// Search response wrapper. +#[derive(Debug, Serialize)] +pub struct SearchResponse { + pub query: String, + pub mode: String, + pub total_results: usize, + pub results: Vec, + pub warnings: Vec, +} + +/// Build SearchFilters from CLI args. +pub struct SearchCliFilters { + pub source_type: Option, + pub author: Option, + pub project: Option, + pub labels: Vec, + pub path: Option, + pub after: Option, + pub updated_after: Option, + pub limit: usize, +} + +/// Run a lexical search query. 
+pub fn run_search( + config: &Config, + query: &str, + cli_filters: SearchCliFilters, + fts_mode: FtsQueryMode, + explain: bool, +) -> Result { + let db_path = get_db_path(config.storage.db_path.as_deref()); + let conn = create_connection(&db_path)?; + + // Check if any documents exist + let doc_count: i64 = conn + .query_row("SELECT COUNT(*) FROM documents", [], |row| row.get(0)) + .unwrap_or(0); + + if doc_count == 0 { + return Ok(SearchResponse { + query: query.to_string(), + mode: "lexical".to_string(), + total_results: 0, + results: vec![], + warnings: vec![ + "No documents indexed. Run 'lore generate-docs' first.".to_string() + ], + }); + } + + // Build filters + let source_type = cli_filters + .source_type + .as_deref() + .and_then(SourceType::parse); + + let project_id = cli_filters + .project + .as_deref() + .map(|p| resolve_project(&conn, p)) + .transpose()?; + + let after = cli_filters.after.as_deref().and_then(parse_since); + let updated_after = cli_filters.updated_after.as_deref().and_then(parse_since); + + let path = cli_filters.path.as_deref().map(|p| { + if p.ends_with('/') { + PathFilter::Prefix(p.to_string()) + } else { + PathFilter::Exact(p.to_string()) + } + }); + + let filters = SearchFilters { + source_type, + author: cli_filters.author, + project_id, + after, + updated_after, + labels: cli_filters.labels, + path, + limit: cli_filters.limit, + }; + + // Adaptive recall: wider initial fetch when filters applied + let requested = filters.clamp_limit(); + let top_k = if filters.has_any_filter() { + (requested * 50).max(200).min(1500) + } else { + (requested * 10).max(50).min(1500) + }; + + // FTS search + let fts_results = search_fts(&conn, query, top_k, fts_mode)?; + let fts_tuples: Vec<(i64, f64)> = fts_results + .iter() + .map(|r| (r.document_id, r.bm25_score)) + .collect(); + + // Build snippet map before ranking + let snippet_map: std::collections::HashMap = fts_results + .iter() + .map(|r| (r.document_id, r.snippet.clone())) + .collect(); 
+ + // RRF ranking (single-list for lexical mode) + let ranked = rank_rrf(&[], &fts_tuples); + let ranked_ids: Vec = ranked.iter().map(|r| r.document_id).collect(); + + // Apply post-retrieval filters + let filtered_ids = apply_filters(&conn, &ranked_ids, &filters)?; + + if filtered_ids.is_empty() { + return Ok(SearchResponse { + query: query.to_string(), + mode: "lexical".to_string(), + total_results: 0, + results: vec![], + warnings: vec![], + }); + } + + // Hydrate results in single round-trip + let hydrated = hydrate_results(&conn, &filtered_ids)?; + + // Build display results preserving filter order + let rrf_map: std::collections::HashMap = ranked + .iter() + .map(|r| (r.document_id, r)) + .collect(); + + let mut results: Vec = Vec::with_capacity(hydrated.len()); + for row in &hydrated { + let rrf = rrf_map.get(&row.document_id); + let fts_snippet = snippet_map.get(&row.document_id).map(|s| s.as_str()); + let snippet = get_result_snippet(fts_snippet, &row.content_text); + + let explain_data = if explain { + rrf.map(|r| ExplainData { + vector_rank: r.vector_rank, + fts_rank: r.fts_rank, + rrf_score: r.rrf_score, + }) + } else { + None + }; + + results.push(SearchResultDisplay { + document_id: row.document_id, + source_type: row.source_type.clone(), + title: row.title.clone(), + url: row.url.clone(), + author: row.author.clone(), + created_at: row.created_at.map(ms_to_iso), + updated_at: row.updated_at.map(ms_to_iso), + project_path: row.project_path.clone(), + labels: row.labels.clone(), + paths: row.paths.clone(), + snippet, + score: rrf.map(|r| r.normalized_score).unwrap_or(0.0), + explain: explain_data, + }); + } + + Ok(SearchResponse { + query: query.to_string(), + mode: "lexical".to_string(), + total_results: results.len(), + results, + warnings: vec![], + }) +} + +/// Raw row from hydration query. 
+struct HydratedRow { + document_id: i64, + source_type: String, + title: String, + url: Option, + author: Option, + created_at: Option, + updated_at: Option, + content_text: String, + project_path: String, + labels: Vec, + paths: Vec, +} + +/// Hydrate document IDs into full display rows in a single query. +/// +/// Uses json_each() to pass ranked IDs and preserve ordering via ORDER BY j.key. +/// Labels and paths fetched via correlated json_group_array subqueries. +fn hydrate_results( + conn: &rusqlite::Connection, + document_ids: &[i64], +) -> Result> { + if document_ids.is_empty() { + return Ok(Vec::new()); + } + + let ids_json = serde_json::to_string(document_ids) + .map_err(|e| LoreError::Other(e.to_string()))?; + + let sql = r#" + SELECT d.id, d.source_type, d.title, d.url, d.author_username, + d.created_at, d.updated_at, d.content_text, + p.path_with_namespace AS project_path, + (SELECT json_group_array(dl.label_name) + FROM document_labels dl WHERE dl.document_id = d.id) AS labels_json, + (SELECT json_group_array(dp.path) + FROM document_paths dp WHERE dp.document_id = d.id) AS paths_json + FROM json_each(?1) AS j + JOIN documents d ON d.id = j.value + JOIN projects p ON p.id = d.project_id + ORDER BY j.key + "#; + + let mut stmt = conn.prepare(sql)?; + let rows = stmt + .query_map([ids_json], |row| { + let labels_json: String = row.get(9)?; + let paths_json: String = row.get(10)?; + + Ok(HydratedRow { + document_id: row.get(0)?, + source_type: row.get(1)?, + title: row.get(2)?, + url: row.get(3)?, + author: row.get(4)?, + created_at: row.get(5)?, + updated_at: row.get(6)?, + content_text: row.get(7)?, + project_path: row.get(8)?, + labels: parse_json_array(&labels_json), + paths: parse_json_array(&paths_json), + }) + })? + .collect::, _>>()?; + + Ok(rows) +} + +/// Parse a JSON array string into a Vec, filtering out null/empty. 
+fn parse_json_array(json: &str) -> Vec { + serde_json::from_str::>(json) + .unwrap_or_default() + .into_iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .filter(|s| !s.is_empty()) + .collect() +} + +/// Print human-readable search results. +pub fn print_search_results(response: &SearchResponse) { + if !response.warnings.is_empty() { + for w in &response.warnings { + eprintln!("{} {}", style("Warning:").yellow(), w); + } + } + + if response.results.is_empty() { + println!( + "No results found for '{}'", + style(&response.query).bold() + ); + return; + } + + println!( + "{} results for '{}' ({})", + response.total_results, + style(&response.query).bold(), + response.mode + ); + println!(); + + for (i, result) in response.results.iter().enumerate() { + let type_prefix = match result.source_type.as_str() { + "issue" => "Issue", + "merge_request" => "MR", + "discussion" => "Discussion", + _ => &result.source_type, + }; + + println!( + "[{}] {} - {} (score: {:.2})", + i + 1, + style(type_prefix).cyan(), + result.title, + result.score + ); + + if let Some(ref url) = result.url { + println!(" {}", style(url).dim()); + } + + println!( + " {} | {}", + style(&result.project_path).dim(), + result + .author + .as_deref() + .map(|a| format!("@{}", a)) + .unwrap_or_default() + ); + + if !result.labels.is_empty() { + println!( + " Labels: {}", + result.labels.join(", ") + ); + } + + // Strip HTML tags from snippet for terminal display + let clean_snippet = result + .snippet + .replace("", "") + .replace("", ""); + println!(" {}", style(clean_snippet).dim()); + + if let Some(ref explain) = result.explain { + println!( + " {} fts_rank={} rrf_score={:.6}", + style("[explain]").magenta(), + explain + .fts_rank + .map(|r| r.to_string()) + .unwrap_or_else(|| "-".into()), + explain.rrf_score + ); + } + + println!(); + } +} + +/// JSON output structures. 
+#[derive(Serialize)] +struct SearchJsonOutput<'a> { + ok: bool, + data: &'a SearchResponse, + meta: SearchMeta, +} + +#[derive(Serialize)] +struct SearchMeta { + elapsed_ms: u64, +} + +/// Print JSON robot-mode output. +pub fn print_search_results_json(response: &SearchResponse, elapsed_ms: u64) { + let output = SearchJsonOutput { + ok: true, + data: response, + meta: SearchMeta { elapsed_ms }, + }; + println!("{}", serde_json::to_string(&output).unwrap()); +} diff --git a/src/cli/commands/show.rs b/src/cli/commands/show.rs index 48b03ca..012d1b6 100644 --- a/src/cli/commands/show.rs +++ b/src/cli/commands/show.rs @@ -6,7 +6,7 @@ use serde::Serialize; use crate::Config; use crate::core::db::create_connection; -use crate::core::error::{GiError, Result}; +use crate::core::error::{LoreError, Result}; use crate::core::paths::get_db_path; use crate::core::time::ms_to_iso; @@ -188,11 +188,11 @@ fn find_issue(conn: &Connection, iid: i64, project_filter: Option<&str>) -> Resu .collect::, _>>()?; match issues.len() { - 0 => Err(GiError::NotFound(format!("Issue #{} not found", iid))), + 0 => Err(LoreError::NotFound(format!("Issue #{} not found", iid))), 1 => Ok(issues.into_iter().next().unwrap()), _ => { let projects: Vec = issues.iter().map(|i| i.project_path.clone()).collect(); - Err(GiError::Ambiguous(format!( + Err(LoreError::Ambiguous(format!( "Issue #{} exists in multiple projects: {}. Use --project to specify.", iid, projects.join(", ") @@ -386,11 +386,11 @@ fn find_mr(conn: &Connection, iid: i64, project_filter: Option<&str>) -> Result< .collect::, _>>()?; match mrs.len() { - 0 => Err(GiError::NotFound(format!("MR !{} not found", iid))), + 0 => Err(LoreError::NotFound(format!("MR !{} not found", iid))), 1 => Ok(mrs.into_iter().next().unwrap()), _ => { let projects: Vec = mrs.iter().map(|m| m.project_path.clone()).collect(); - Err(GiError::Ambiguous(format!( + Err(LoreError::Ambiguous(format!( "MR !{} exists in multiple projects: {}. 
Use --project to specify.", iid, projects.join(", ") diff --git a/src/cli/commands/stats.rs b/src/cli/commands/stats.rs new file mode 100644 index 0000000..8ec3e46 --- /dev/null +++ b/src/cli/commands/stats.rs @@ -0,0 +1,348 @@ +//! Stats command: document counts, embedding coverage, queue status, integrity checks. + +use console::style; +use rusqlite::Connection; +use serde::Serialize; + +use crate::core::db::create_connection; +use crate::core::error::Result; +use crate::core::paths::get_db_path; +use crate::Config; + +/// Result of the stats command. +#[derive(Debug, Default, Serialize)] +pub struct StatsResult { + pub documents: DocumentStats, + pub embeddings: EmbeddingStats, + pub fts: FtsStats, + pub queues: QueueStats, + #[serde(skip_serializing_if = "Option::is_none")] + pub integrity: Option, +} + +#[derive(Debug, Default, Serialize)] +pub struct DocumentStats { + pub total: i64, + pub issues: i64, + pub merge_requests: i64, + pub discussions: i64, + pub truncated: i64, +} + +#[derive(Debug, Default, Serialize)] +pub struct EmbeddingStats { + pub embedded_documents: i64, + pub total_chunks: i64, + pub coverage_pct: f64, +} + +#[derive(Debug, Default, Serialize)] +pub struct FtsStats { + pub indexed: i64, +} + +#[derive(Debug, Default, Serialize)] +pub struct QueueStats { + pub dirty_sources: i64, + pub dirty_sources_failed: i64, + pub pending_discussion_fetches: i64, + pub pending_discussion_fetches_failed: i64, +} + +#[derive(Debug, Default, Serialize)] +pub struct IntegrityResult { + pub ok: bool, + pub fts_doc_mismatch: bool, + pub orphan_embeddings: i64, + pub stale_metadata: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub repair: Option, +} + +#[derive(Debug, Default, Serialize)] +pub struct RepairResult { + pub fts_rebuilt: bool, + pub orphans_deleted: i64, + pub stale_cleared: i64, +} + +/// Run the stats command. 
+pub fn run_stats( + config: &Config, + check: bool, + repair: bool, +) -> Result { + let db_path = get_db_path(config.storage.db_path.as_deref()); + let conn = create_connection(&db_path)?; + + let mut result = StatsResult::default(); + + // Document counts + result.documents.total = count_query(&conn, "SELECT COUNT(*) FROM documents")?; + result.documents.issues = + count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'issue'")?; + result.documents.merge_requests = + count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'merge_request'")?; + result.documents.discussions = + count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'discussion'")?; + result.documents.truncated = + count_query(&conn, "SELECT COUNT(*) FROM documents WHERE is_truncated = 1")?; + + // Embedding stats — skip gracefully if table doesn't exist (Gate A only) + if table_exists(&conn, "embedding_metadata") { + let embedded = count_query( + &conn, + "SELECT COUNT(DISTINCT document_id) FROM embedding_metadata WHERE last_error IS NULL", + )?; + let chunks = count_query( + &conn, + "SELECT COUNT(*) FROM embedding_metadata WHERE last_error IS NULL", + )?; + result.embeddings.embedded_documents = embedded; + result.embeddings.total_chunks = chunks; + result.embeddings.coverage_pct = if result.documents.total > 0 { + (embedded as f64 / result.documents.total as f64) * 100.0 + } else { + 0.0 + }; + } + + // FTS stats + result.fts.indexed = count_query(&conn, "SELECT COUNT(*) FROM documents_fts")?; + + // Queue stats + result.queues.dirty_sources = + count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NULL")?; + result.queues.dirty_sources_failed = + count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NOT NULL")?; + + if table_exists(&conn, "pending_discussion_fetches") { + result.queues.pending_discussion_fetches = count_query( + &conn, + "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS 
NULL", + )?; + result.queues.pending_discussion_fetches_failed = count_query( + &conn, + "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NOT NULL", + )?; + } + + // Integrity check + if check { + let mut integrity = IntegrityResult::default(); + + // FTS/doc count mismatch + integrity.fts_doc_mismatch = result.fts.indexed != result.documents.total; + + // Orphan embeddings (rowid/1000 should match a document ID) + if table_exists(&conn, "embeddings") { + integrity.orphan_embeddings = count_query( + &conn, + "SELECT COUNT(*) FROM embedding_metadata em + WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = em.document_id)", + )?; + } + + // Stale metadata (document_hash != current content_hash) + if table_exists(&conn, "embedding_metadata") { + integrity.stale_metadata = count_query( + &conn, + "SELECT COUNT(*) FROM embedding_metadata em + JOIN documents d ON d.id = em.document_id + WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash", + )?; + } + + integrity.ok = !integrity.fts_doc_mismatch + && integrity.orphan_embeddings == 0 + && integrity.stale_metadata == 0; + + // Repair + if repair { + let mut repair_result = RepairResult::default(); + + if integrity.fts_doc_mismatch { + conn.execute( + "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')", + [], + )?; + repair_result.fts_rebuilt = true; + } + + if integrity.orphan_embeddings > 0 && table_exists(&conn, "embedding_metadata") { + let deleted = conn.execute( + "DELETE FROM embedding_metadata + WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = embedding_metadata.document_id)", + [], + )?; + repair_result.orphans_deleted = deleted as i64; + + // Also clean orphaned vectors if vec0 table exists + if table_exists(&conn, "embeddings") { + let _ = conn.execute( + "DELETE FROM embeddings + WHERE rowid / 1000 NOT IN (SELECT id FROM documents)", + [], + ); + } + } + + if integrity.stale_metadata > 0 && table_exists(&conn, "embedding_metadata") { + let cleared = 
conn.execute( + "DELETE FROM embedding_metadata + WHERE document_id IN ( + SELECT em.document_id FROM embedding_metadata em + JOIN documents d ON d.id = em.document_id + WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash + )", + [], + )?; + repair_result.stale_cleared = cleared as i64; + } + + integrity.repair = Some(repair_result); + } + + result.integrity = Some(integrity); + } + + Ok(result) +} + +fn count_query(conn: &Connection, sql: &str) -> Result { + let count: i64 = conn + .query_row(sql, [], |row| row.get(0)) + .unwrap_or(0); + Ok(count) +} + +fn table_exists(conn: &Connection, table: &str) -> bool { + conn.query_row( + "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1", + [table], + |row| row.get::<_, i64>(0), + ) + .unwrap_or(0) + > 0 +} + +/// Print human-readable stats. +pub fn print_stats(result: &StatsResult) { + println!("{}", style("Documents").cyan().bold()); + println!(" Total: {}", result.documents.total); + println!(" Issues: {}", result.documents.issues); + println!(" Merge Requests: {}", result.documents.merge_requests); + println!(" Discussions: {}", result.documents.discussions); + if result.documents.truncated > 0 { + println!(" Truncated: {}", style(result.documents.truncated).yellow()); + } + println!(); + + println!("{}", style("Search Index").cyan().bold()); + println!(" FTS indexed: {}", result.fts.indexed); + println!( + " Embedding coverage: {:.1}% ({}/{})", + result.embeddings.coverage_pct, + result.embeddings.embedded_documents, + result.documents.total + ); + if result.embeddings.total_chunks > 0 { + println!(" Total chunks: {}", result.embeddings.total_chunks); + } + println!(); + + println!("{}", style("Queues").cyan().bold()); + println!(" Dirty sources: {} pending, {} failed", + result.queues.dirty_sources, + result.queues.dirty_sources_failed + ); + println!(" Discussion fetch: {} pending, {} failed", + result.queues.pending_discussion_fetches, + result.queues.pending_discussion_fetches_failed 
+ ); + + if let Some(ref integrity) = result.integrity { + println!(); + let status = if integrity.ok { + style("OK").green().bold() + } else { + style("ISSUES FOUND").red().bold() + }; + println!("{} Integrity: {}", style("Check").cyan().bold(), status); + + if integrity.fts_doc_mismatch { + println!(" {} FTS/document count mismatch", style("!").red()); + } + if integrity.orphan_embeddings > 0 { + println!( + " {} {} orphan embeddings", + style("!").red(), + integrity.orphan_embeddings + ); + } + if integrity.stale_metadata > 0 { + println!( + " {} {} stale embedding metadata", + style("!").red(), + integrity.stale_metadata + ); + } + + if let Some(ref repair) = integrity.repair { + println!(); + println!("{}", style("Repair").cyan().bold()); + if repair.fts_rebuilt { + println!(" {} FTS index rebuilt", style("fixed").green()); + } + if repair.orphans_deleted > 0 { + println!( + " {} {} orphan embeddings deleted", + style("fixed").green(), + repair.orphans_deleted + ); + } + if repair.stale_cleared > 0 { + println!( + " {} {} stale metadata entries cleared", + style("fixed").green(), + repair.stale_cleared + ); + } + if !repair.fts_rebuilt && repair.orphans_deleted == 0 && repair.stale_cleared == 0 { + println!(" No issues to repair."); + } + } + } +} + +/// JSON output structures. +#[derive(Serialize)] +struct StatsJsonOutput { + ok: bool, + data: StatsResult, +} + +/// Print JSON robot-mode output. 
+pub fn print_stats_json(result: &StatsResult) { + let output = StatsJsonOutput { + ok: true, + data: StatsResult { + documents: DocumentStats { ..*&result.documents }, + embeddings: EmbeddingStats { ..*&result.embeddings }, + fts: FtsStats { ..*&result.fts }, + queues: QueueStats { ..*&result.queues }, + integrity: result.integrity.as_ref().map(|i| IntegrityResult { + ok: i.ok, + fts_doc_mismatch: i.fts_doc_mismatch, + orphan_embeddings: i.orphan_embeddings, + stale_metadata: i.stale_metadata, + repair: i.repair.as_ref().map(|r| RepairResult { + fts_rebuilt: r.fts_rebuilt, + orphans_deleted: r.orphans_deleted, + stale_cleared: r.stale_cleared, + }), + }), + }, + }; + println!("{}", serde_json::to_string(&output).unwrap()); +} diff --git a/src/cli/commands/sync.rs b/src/cli/commands/sync.rs new file mode 100644 index 0000000..84ffd0d --- /dev/null +++ b/src/cli/commands/sync.rs @@ -0,0 +1,124 @@ +//! Sync command: unified orchestrator for ingest -> generate-docs -> embed. + +use console::style; +use serde::Serialize; +use tracing::{info, warn}; + +use crate::Config; +use crate::core::error::Result; + +use super::embed::run_embed; +use super::generate_docs::run_generate_docs; +use super::ingest::run_ingest; + +/// Options for the sync command. +#[derive(Debug, Default)] +pub struct SyncOptions { + pub full: bool, + pub force: bool, + pub no_embed: bool, + pub no_docs: bool, +} + +/// Result of the sync command. +#[derive(Debug, Default, Serialize)] +pub struct SyncResult { + pub issues_updated: usize, + pub mrs_updated: usize, + pub discussions_fetched: usize, + pub documents_regenerated: usize, + pub documents_embedded: usize, +} + +/// Run the full sync pipeline: ingest -> generate-docs -> embed. 
+pub async fn run_sync(config: &Config, options: SyncOptions) -> Result { + let mut result = SyncResult::default(); + + // Stage 1: Ingest issues + info!("Sync stage 1/4: ingesting issues"); + let issues_result = run_ingest(config, "issues", None, options.force, options.full, true).await?; + result.issues_updated = issues_result.issues_upserted; + result.discussions_fetched += issues_result.discussions_fetched; + + // Stage 2: Ingest MRs + info!("Sync stage 2/4: ingesting merge requests"); + let mrs_result = run_ingest(config, "mrs", None, options.force, options.full, true).await?; + result.mrs_updated = mrs_result.mrs_upserted; + result.discussions_fetched += mrs_result.discussions_fetched; + + // Stage 3: Generate documents (unless --no-docs) + if options.no_docs { + info!("Sync stage 3/4: skipping document generation (--no-docs)"); + } else { + info!("Sync stage 3/4: generating documents"); + let docs_result = run_generate_docs(config, false, None)?; + result.documents_regenerated = docs_result.regenerated; + } + + // Stage 4: Embed documents (unless --no-embed) + if options.no_embed { + info!("Sync stage 4/4: skipping embedding (--no-embed)"); + } else { + info!("Sync stage 4/4: embedding documents"); + match run_embed(config, false).await { + Ok(embed_result) => { + result.documents_embedded = embed_result.embedded; + } + Err(e) => { + // Graceful degradation: Ollama down is a warning, not an error + warn!(error = %e, "Embedding stage failed (Ollama may be unavailable), continuing"); + } + } + } + + info!( + issues = result.issues_updated, + mrs = result.mrs_updated, + discussions = result.discussions_fetched, + docs = result.documents_regenerated, + embedded = result.documents_embedded, + "Sync pipeline complete" + ); + + Ok(result) +} + +/// Print human-readable sync summary. 
+pub fn print_sync(result: &SyncResult, elapsed: std::time::Duration) { + println!( + "{} Sync complete:", + style("done").green().bold(), + ); + println!(" Issues updated: {}", result.issues_updated); + println!(" MRs updated: {}", result.mrs_updated); + println!(" Discussions fetched: {}", result.discussions_fetched); + println!(" Documents regenerated: {}", result.documents_regenerated); + println!(" Documents embedded: {}", result.documents_embedded); + println!( + " Elapsed: {:.1}s", + elapsed.as_secs_f64() + ); +} + +/// JSON output for sync. +#[derive(Serialize)] +struct SyncJsonOutput<'a> { + ok: bool, + data: &'a SyncResult, + meta: SyncMeta, +} + +#[derive(Serialize)] +struct SyncMeta { + elapsed_ms: u64, +} + +/// Print JSON robot-mode sync output. +pub fn print_sync_json(result: &SyncResult, elapsed_ms: u64) { + let output = SyncJsonOutput { + ok: true, + data: result, + meta: SyncMeta { elapsed_ms }, + }; + println!("{}", serde_json::to_string(&output).unwrap()); +} diff --git a/src/cli/mod.rs b/src/cli/mod.rs index c6c2ce0..c01a894 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -69,6 +69,18 @@ pub enum Commands { /// Fail if prompts would be shown #[arg(long)] non_interactive: bool, + + /// GitLab base URL (required in robot mode) + #[arg(long)] + gitlab_url: Option, + + /// Environment variable name holding GitLab token (required in robot mode) + #[arg(long)] + token_env_var: Option, + + /// Comma-separated project paths (required in robot mode) + #[arg(long)] + projects: Option, }, /// Create timestamped database backup @@ -81,9 +93,32 @@ pub enum Commands { yes: bool, }, + /// Search indexed documents + Search(SearchArgs), + + /// Show document and index statistics + Stats(StatsArgs), + + /// Generate searchable documents from ingested data + #[command(name = "generate-docs")] + GenerateDocs(GenerateDocsArgs), + + /// Generate vector embeddings for documents via Ollama + Embed(EmbedArgs), + + /// Run full sync pipeline: ingest -> generate-docs 
Sync(SyncArgs),

    /// Run pending database migrations
    Migrate,

    /// Quick health check: config, database, schema version
    Health,

    /// Machine-readable command manifest for agent self-discovery
    #[command(name = "robot-docs")]
    RobotDocs,

    // --- Hidden backward-compat aliases ---

    /// List issues or MRs (deprecated: use 'lore issues' or 'lore mrs')
    #[command(hide = true)]

    pub full: bool,
}

/// Arguments for `lore stats`
#[derive(Parser)]
pub struct StatsArgs {
    /// Run integrity checks
    #[arg(long)]
    pub check: bool,

    /// Repair integrity issues (requires --check)
    // clap enforces the pairing at parse time: --repair without --check is rejected.
    #[arg(long, requires = "check")]
    pub repair: bool,
}

/// Arguments for `lore search <query>`
#[derive(Parser)]
pub struct SearchArgs {
    /// Search query string
    pub query: String,

    /// Search mode (lexical, hybrid, semantic)
    // NOTE(review): stringly-typed; unrecognized values are not rejected here —
    // confirm run_search's fallback, or consider clap::ValueEnum.
    #[arg(long, default_value = "hybrid")]
    pub mode: String,

    /// Filter by source type (issue, mr, discussion)
    #[arg(long = "type", value_name = "TYPE")]
    pub source_type: Option<String>,

    /// Filter by author username
    #[arg(long)]
    pub author: Option<String>,

    /// Filter by project path
    #[arg(short = 'p', long)]
    pub project: Option<String>,

    /// Filter by label (repeatable, AND logic)
    #[arg(long, action = clap::ArgAction::Append)]
    pub label: Vec<String>,

    /// Filter by file path (trailing / for prefix match)
    #[arg(long)]
    pub path: Option<String>,

    /// Filter by created after (7d, 2w, or YYYY-MM-DD)
    #[arg(long)]
    pub after: Option<String>,

    /// Filter by updated after (7d, 2w, or YYYY-MM-DD)
    #[arg(long = "updated-after")]
    pub updated_after: Option<String>,

    /// Maximum results (default 20, max 100)
    // NOTE(review): the documented "max 100" cap is not enforced by this
    // definition — confirm run_search clamps it downstream.
    #[arg(short = 'n', long = "limit", default_value = "20")]
    pub limit: usize,

    /// Show ranking explanation per result
    #[arg(long)]
    pub explain: bool,

    /// FTS query mode: safe (default) or raw
    // "safe" sanitizes user input for FTS; "raw" passes the query through verbatim.
    #[arg(long = "fts-mode", default_value = "safe")]
    pub fts_mode: String,
}

/// Arguments for `lore generate-docs`
#[derive(Parser)]
pub struct GenerateDocsArgs {
    /// Full rebuild: seed all entities into dirty queue, then drain
    #[arg(long)]
    pub full: bool,

    /// Filter to single project
    #[arg(short = 'p', long)]
    pub project: Option<String>,
}

/// Arguments for `lore sync`
#[derive(Parser)]
pub struct SyncArgs {
    /// Reset cursors, fetch everything
    #[arg(long)]
    pub full: bool,

    /// Override stale lock
    #[arg(long)]
    pub force: bool,

    /// Skip embedding step
    #[arg(long)]
    pub no_embed: bool,

    /// Skip document regeneration
    #[arg(long)]
    pub no_docs: bool,
}

/// Arguments for `lore embed`
#[derive(Parser)]
pub struct EmbedArgs {
    /// Retry previously failed embeddings
    #[arg(long)]
    pub retry_failed: bool,
}

// ---- fragment: src/main.rs import hunk (continues beyond this block) ----

use tracing_subscriber::util::SubscriberInitExt;

use lore::Config;
use lore::cli::commands::{
    InitInputs, InitOptions, InitResult, ListFilters, MrListFilters, SearchCliFilters, open_issue_in_browser,
    open_mr_in_browser, print_count, print_count_json, print_doctor_results, print_generate_docs,
    print_generate_docs_json, print_ingest_summary, print_ingest_summary_json, print_list_issues,
    print_list_issues_json, print_list_mrs, print_list_mrs_json, print_search_results,
    print_search_results_json, print_show_issue, print_show_issue_json, print_show_mr, print_stats,
    print_stats_json,
    print_embed, print_embed_json, print_sync, print_sync_json, print_show_mr_json,
print_sync_status, print_sync_status_json, run_auth_test, run_count, - run_doctor, run_ingest, run_init, run_list_issues, run_list_mrs, run_show_issue, run_show_mr, - run_sync_status, + run_doctor, run_embed, run_generate_docs, run_ingest, run_init, run_list_issues, run_list_mrs, + run_search, run_show_issue, run_show_mr, run_stats, run_sync, run_sync_status, SyncOptions, +}; +use lore::cli::{ + Cli, Commands, CountArgs, EmbedArgs, GenerateDocsArgs, IngestArgs, IssuesArgs, MrsArgs, + SearchArgs, StatsArgs, SyncArgs, }; -use lore::cli::{Cli, Commands, CountArgs, IngestArgs, IssuesArgs, MrsArgs}; use lore::core::db::{create_connection, get_schema_version, run_migrations}; -use lore::core::error::{GiError, RobotErrorOutput}; +use lore::core::error::{LoreError, RobotErrorOutput}; use lore::core::paths::get_config_path; use lore::core::paths::get_db_path; @@ -49,6 +55,10 @@ async fn main() { let result = match cli.command { Commands::Issues(args) => handle_issues(cli.config.as_deref(), args, robot_mode).await, Commands::Mrs(args) => handle_mrs(cli.config.as_deref(), args, robot_mode).await, + Commands::Search(args) => handle_search(cli.config.as_deref(), args, robot_mode).await, + Commands::Stats(args) => handle_stats(cli.config.as_deref(), args, robot_mode).await, + Commands::Embed(args) => handle_embed(cli.config.as_deref(), args, robot_mode).await, + Commands::Sync(args) => handle_sync_cmd(cli.config.as_deref(), args, robot_mode).await, Commands::Ingest(args) => handle_ingest(cli.config.as_deref(), args, robot_mode).await, Commands::Count(args) => { handle_count(cli.config.as_deref(), args, robot_mode).await @@ -60,10 +70,29 @@ async fn main() { Commands::Init { force, non_interactive, - } => handle_init(cli.config.as_deref(), force, non_interactive, robot_mode).await, + gitlab_url, + token_env_var, + projects, + } => { + handle_init( + cli.config.as_deref(), + force, + non_interactive, + robot_mode, + gitlab_url, + token_env_var, + projects, + ) + .await + } + 
Commands::GenerateDocs(args) => { + handle_generate_docs(cli.config.as_deref(), args, robot_mode).await + } Commands::Backup => handle_backup(robot_mode), Commands::Reset { yes: _ } => handle_reset(robot_mode), Commands::Migrate => handle_migrate(cli.config.as_deref(), robot_mode).await, + Commands::Health => handle_health(cli.config.as_deref(), robot_mode).await, + Commands::RobotDocs => handle_robot_docs(robot_mode), // --- Backward-compat: deprecated aliases --- Commands::List { @@ -159,7 +188,7 @@ async fn main() { } } -/// Fallback error output for non-GiError errors in robot mode. +/// Fallback error output for non-LoreError errors in robot mode. #[derive(Serialize)] struct FallbackErrorOutput { error: FallbackError, @@ -172,8 +201,8 @@ struct FallbackError { } fn handle_error(e: Box, robot_mode: bool) -> ! { - // Try to downcast to GiError for structured output - if let Some(gi_error) = e.downcast_ref::() { + // Try to downcast to LoreError for structured output + if let Some(gi_error) = e.downcast_ref::() { if robot_mode { let output = RobotErrorOutput::from(gi_error); // Use serde_json for safe serialization; fallback constructs JSON safely @@ -201,7 +230,7 @@ fn handle_error(e: Box, robot_mode: bool) -> ! { } } - // Fallback for non-GiError errors - use serde for proper JSON escaping + // Fallback for non-LoreError errors - use serde for proper JSON escaping if robot_mode { let output = FallbackErrorOutput { error: FallbackError { @@ -473,22 +502,123 @@ async fn handle_sync_status_cmd( Ok(()) } +/// JSON output for init command. 
+#[derive(Serialize)] +struct InitOutput { + ok: bool, + data: InitOutputData, +} + +#[derive(Serialize)] +struct InitOutputData { + config_path: String, + data_dir: String, + user: InitOutputUser, + projects: Vec, +} + +#[derive(Serialize)] +struct InitOutputUser { + username: String, + name: String, +} + +#[derive(Serialize)] +struct InitOutputProject { + path: String, + name: String, +} + +fn print_init_json(result: &InitResult) { + let output = InitOutput { + ok: true, + data: InitOutputData { + config_path: result.config_path.clone(), + data_dir: result.data_dir.clone(), + user: InitOutputUser { + username: result.user.username.clone(), + name: result.user.name.clone(), + }, + projects: result + .projects + .iter() + .map(|p| InitOutputProject { + path: p.path.clone(), + name: p.name.clone(), + }) + .collect(), + }, + }; + println!("{}", serde_json::to_string(&output).unwrap()); +} + async fn handle_init( config_override: Option<&str>, force: bool, non_interactive: bool, - _robot_mode: bool, // TODO: Add robot mode support for init (requires non-interactive implementation) + robot_mode: bool, + gitlab_url_flag: Option, + token_env_var_flag: Option, + projects_flag: Option, ) -> Result<(), Box> { + // Robot mode: require all inputs via flags, skip interactive prompts + if robot_mode { + let missing: Vec<&str> = [ + gitlab_url_flag.is_none().then_some("--gitlab-url"), + token_env_var_flag.is_none().then_some("--token-env-var"), + projects_flag.is_none().then_some("--projects"), + ] + .into_iter() + .flatten() + .collect(); + + if !missing.is_empty() { + let output = RobotErrorWithSuggestion { + error: RobotErrorSuggestionData { + code: "MISSING_FLAGS".to_string(), + message: format!("Robot mode requires flags: {}", missing.join(", ")), + suggestion: "lore --robot init --gitlab-url https://gitlab.com --token-env-var GITLAB_TOKEN --projects group/project".to_string(), + }, + }; + eprintln!("{}", serde_json::to_string(&output)?); + std::process::exit(2); + } + + 
let project_paths: Vec = projects_flag + .unwrap() + .split(',') + .map(|p| p.trim().to_string()) + .filter(|p| !p.is_empty()) + .collect(); + + let result = run_init( + InitInputs { + gitlab_url: gitlab_url_flag.unwrap(), + token_env_var: token_env_var_flag.unwrap(), + project_paths, + }, + InitOptions { + config_path: config_override.map(String::from), + force: true, + non_interactive: true, + }, + ) + .await?; + + print_init_json(&result); + return Ok(()); + } + + // Human mode: interactive prompts let config_path = get_config_path(config_override); let mut confirmed_overwrite = force; - // Check if config exists and handle overwrite - if config_path.exists() { + if config_path.exists() && !force { if non_interactive { eprintln!( "{}", style(format!( - "Config file exists at {}. Cannot proceed in non-interactive mode.", + "Config file exists at {}. Use --force to overwrite.", config_path.display() )) .red() @@ -496,59 +626,70 @@ async fn handle_init( std::process::exit(2); } - if !force { - let confirm = Confirm::new() - .with_prompt(format!( - "Config file exists at {}. Overwrite?", - config_path.display() - )) - .default(false) - .interact()?; + let confirm = Confirm::new() + .with_prompt(format!( + "Config file exists at {}. 
Overwrite?", + config_path.display() + )) + .default(false) + .interact()?; - if !confirm { - println!("{}", style("Cancelled.").yellow()); - std::process::exit(2); - } - confirmed_overwrite = true; + if !confirm { + println!("{}", style("Cancelled.").yellow()); + std::process::exit(2); } + confirmed_overwrite = true; } - // Prompt for GitLab URL - let gitlab_url: String = Input::new() - .with_prompt("GitLab URL") - .default("https://gitlab.com".to_string()) - .validate_with(|input: &String| -> Result<(), &str> { - if url::Url::parse(input).is_ok() { - Ok(()) - } else { - Err("Please enter a valid URL") - } - }) - .interact_text()?; + let gitlab_url: String = if let Some(url) = gitlab_url_flag { + url + } else { + Input::new() + .with_prompt("GitLab URL") + .default("https://gitlab.com".to_string()) + .validate_with(|input: &String| -> Result<(), &str> { + if url::Url::parse(input).is_ok() { + Ok(()) + } else { + Err("Please enter a valid URL") + } + }) + .interact_text()? + }; - // Prompt for token env var - let token_env_var: String = Input::new() - .with_prompt("Token environment variable name") - .default("GITLAB_TOKEN".to_string()) - .interact_text()?; + let token_env_var: String = if let Some(var) = token_env_var_flag { + var + } else { + Input::new() + .with_prompt("Token environment variable name") + .default("GITLAB_TOKEN".to_string()) + .interact_text()? 
+ }; - // Prompt for project paths - let project_paths_input: String = Input::new() - .with_prompt("Project paths (comma-separated, e.g., group/project)") - .validate_with(|input: &String| -> Result<(), &str> { - if input.trim().is_empty() { - Err("Please enter at least one project path") - } else { - Ok(()) - } - }) - .interact_text()?; + let project_paths: Vec = if let Some(projects) = projects_flag { + projects + .split(',') + .map(|p| p.trim().to_string()) + .filter(|p| !p.is_empty()) + .collect() + } else { + let project_paths_input: String = Input::new() + .with_prompt("Project paths (comma-separated, e.g., group/project)") + .validate_with(|input: &String| -> Result<(), &str> { + if input.trim().is_empty() { + Err("Please enter at least one project path") + } else { + Ok(()) + } + }) + .interact_text()?; - let project_paths: Vec = project_paths_input - .split(',') - .map(|p| p.trim().to_string()) - .filter(|p| !p.is_empty()) - .collect(); + project_paths_input + .split(',') + .map(|p| p.trim().to_string()) + .filter(|p| !p.is_empty()) + .collect() + }; println!("{}", style("\nValidating configuration...").blue()); @@ -840,6 +981,385 @@ async fn handle_migrate( Ok(()) } +async fn handle_stats( + config_override: Option<&str>, + args: StatsArgs, + robot_mode: bool, +) -> Result<(), Box> { + let config = Config::load(config_override)?; + let result = run_stats(&config, args.check, args.repair)?; + if robot_mode { + print_stats_json(&result); + } else { + print_stats(&result); + } + Ok(()) +} + +async fn handle_search( + config_override: Option<&str>, + args: SearchArgs, + robot_mode: bool, +) -> Result<(), Box> { + let config = Config::load(config_override)?; + + let fts_mode = match args.fts_mode.as_str() { + "raw" => lore::search::FtsQueryMode::Raw, + _ => lore::search::FtsQueryMode::Safe, + }; + + let cli_filters = SearchCliFilters { + source_type: args.source_type, + author: args.author, + project: args.project, + labels: args.label, + path: args.path, + 
after: args.after, + updated_after: args.updated_after, + limit: args.limit, + }; + + let start = std::time::Instant::now(); + let response = run_search(&config, &args.query, cli_filters, fts_mode, args.explain)?; + let elapsed_ms = start.elapsed().as_millis() as u64; + + if robot_mode { + print_search_results_json(&response, elapsed_ms); + } else { + print_search_results(&response); + } + Ok(()) +} + +async fn handle_generate_docs( + config_override: Option<&str>, + args: GenerateDocsArgs, + robot_mode: bool, +) -> Result<(), Box> { + let config = Config::load(config_override)?; + + let result = run_generate_docs(&config, args.full, args.project.as_deref())?; + if robot_mode { + print_generate_docs_json(&result); + } else { + print_generate_docs(&result); + } + Ok(()) +} + +async fn handle_embed( + config_override: Option<&str>, + args: EmbedArgs, + robot_mode: bool, +) -> Result<(), Box> { + let config = Config::load(config_override)?; + let result = run_embed(&config, args.retry_failed).await?; + if robot_mode { + print_embed_json(&result); + } else { + print_embed(&result); + } + Ok(()) +} + +async fn handle_sync_cmd( + config_override: Option<&str>, + args: SyncArgs, + robot_mode: bool, +) -> Result<(), Box> { + let config = Config::load(config_override)?; + let options = SyncOptions { + full: args.full, + force: args.force, + no_embed: args.no_embed, + no_docs: args.no_docs, + }; + + let start = std::time::Instant::now(); + let result = run_sync(&config, options).await?; + let elapsed = start.elapsed(); + + if robot_mode { + print_sync_json(&result, elapsed.as_millis() as u64); + } else { + print_sync(&result, elapsed); + } + Ok(()) +} + +// ============================================================================ +// Health + Robot-docs handlers +// ============================================================================ + +/// JSON output for health command. 
+#[derive(Serialize)] +struct HealthOutput { + ok: bool, + data: HealthData, +} + +#[derive(Serialize)] +struct HealthData { + healthy: bool, + config_found: bool, + db_found: bool, + schema_current: bool, + schema_version: i32, +} + +async fn handle_health( + config_override: Option<&str>, + robot_mode: bool, +) -> Result<(), Box> { + let config_path = get_config_path(config_override); + let config_found = config_path.exists(); + + let (db_found, schema_version, schema_current) = if config_found { + match Config::load(config_override) { + Ok(config) => { + let db_path = get_db_path(config.storage.db_path.as_deref()); + if db_path.exists() { + match create_connection(&db_path) { + Ok(conn) => { + let version = get_schema_version(&conn); + let latest = 9; // Number of embedded migrations + (true, version, version >= latest) + } + Err(_) => (true, 0, false), + } + } else { + (false, 0, false) + } + } + Err(_) => (false, 0, false), + } + } else { + (false, 0, false) + }; + + let healthy = config_found && db_found && schema_current; + + if robot_mode { + let output = HealthOutput { + ok: true, + data: HealthData { + healthy, + config_found, + db_found, + schema_current, + schema_version, + }, + }; + println!("{}", serde_json::to_string(&output)?); + } else { + let status = |ok: bool| { + if ok { + style("pass").green() + } else { + style("FAIL").red() + } + }; + println!("Config: {} ({})", status(config_found), config_path.display()); + println!("DB: {}", status(db_found)); + println!( + "Schema: {} (v{})", + status(schema_current), + schema_version + ); + println!(); + if healthy { + println!("{}", style("Healthy").green().bold()); + } else { + println!("{}", style("Unhealthy - run 'lore doctor' for details").red().bold()); + } + } + + if !healthy { + std::process::exit(1); + } + + Ok(()) +} + +/// JSON output for robot-docs command. 
#[derive(Serialize)]
struct RobotDocsOutput {
    ok: bool,
    data: RobotDocsData,
}

#[derive(Serialize)]
struct RobotDocsData {
    name: String,
    version: String,
    description: String,
    activation: RobotDocsActivation,
    // Free-form JSON sub-documents; kept as Values since their shape is
    // documentation, not a typed contract.
    commands: serde_json::Value,
    exit_codes: serde_json::Value,
    error_format: String,
    workflows: serde_json::Value,
}

#[derive(Serialize)]
struct RobotDocsActivation {
    flags: Vec<String>,
    env: String,
    auto: String,
}

/// Emit the agent self-discovery manifest: every command, its flags, an
/// example invocation, exit codes, and recommended workflows. Robot mode
/// prints compact JSON; human mode pretty-prints the same document.
fn handle_robot_docs(robot_mode: bool) -> Result<(), Box<dyn std::error::Error>> {
    let version = env!("CARGO_PKG_VERSION").to_string();

    // Per-command manifest: description, accepted flags, copy-paste example.
    // NOTE(review): several flag strings below (e.g. "--project ", "") look
    // truncated — likely "<path>"/"<entity>"-style placeholders lost in
    // transit; restore before release.
    let commands = serde_json::json!({
        "init": {
            "description": "Initialize configuration and database",
            "flags": ["--force", "--non-interactive", "--gitlab-url ", "--token-env-var ", "--projects "],
            "robot_flags": ["--gitlab-url", "--token-env-var", "--projects"],
            "example": "lore --robot init --gitlab-url https://gitlab.com --token-env-var GITLAB_TOKEN --projects group/project"
        },
        "health": {
            "description": "Quick pre-flight check: config, database, schema version",
            "flags": [],
            "example": "lore --robot health"
        },
        "auth": {
            "description": "Verify GitLab authentication",
            "flags": [],
            "example": "lore --robot auth"
        },
        "doctor": {
            "description": "Full environment health check (config, auth, DB, Ollama)",
            "flags": [],
            "example": "lore --robot doctor"
        },
        "ingest": {
            "description": "Sync data from GitLab",
            "flags": ["--project ", "--force", "--full", ""],
            "example": "lore --robot ingest issues --project group/repo"
        },
        "sync": {
            "description": "Full sync pipeline: ingest -> generate-docs -> embed",
            "flags": ["--full", "--force", "--no-embed", "--no-docs"],
            "example": "lore --robot sync"
        },
        "issues": {
            "description": "List or show issues",
            "flags": ["", "--limit", "--state", "--project", "--author", "--assignee", "--label", "--milestone", "--since", "--due-before", "--has-due", "--sort", "--asc"],
            "example": "lore --robot issues --state opened --limit 10"
        },
        "mrs": {
            "description": "List or show merge requests",
            "flags": ["", "--limit", "--state", "--project", "--author", "--assignee", "--reviewer", "--label", "--since", "--draft", "--no-draft", "--target", "--source", "--sort", "--asc"],
            "example": "lore --robot mrs --state opened"
        },
        "search": {
            "description": "Search indexed documents (lexical, hybrid, semantic)",
            "flags": ["", "--mode", "--type", "--author", "--project", "--label", "--path", "--after", "--updated-after", "--limit", "--explain", "--fts-mode"],
            "example": "lore --robot search 'authentication bug' --mode hybrid --limit 10"
        },
        "count": {
            "description": "Count entities in local database",
            "flags": ["", "--for "],
            "example": "lore --robot count issues"
        },
        "stats": {
            "description": "Show document and index statistics",
            "flags": ["--check", "--repair"],
            "example": "lore --robot stats"
        },
        "status": {
            "description": "Show sync state (cursors, last sync times)",
            "flags": [],
            "example": "lore --robot status"
        },
        "generate-docs": {
            "description": "Generate searchable documents from ingested data",
            "flags": ["--full", "--project "],
            "example": "lore --robot generate-docs --full"
        },
        "embed": {
            "description": "Generate vector embeddings for documents via Ollama",
            "flags": ["--retry-failed"],
            "example": "lore --robot embed"
        },
        "migrate": {
            "description": "Run pending database migrations",
            "flags": [],
            "example": "lore --robot migrate"
        },
        "version": {
            "description": "Show version information",
            "flags": [],
            "example": "lore --robot version"
        },
        "robot-docs": {
            "description": "This command (agent self-discovery manifest)",
            "flags": [],
            "example": "lore robot-docs"
        }
    });

    // Exit-code map mirrored from the error enum.
    // NOTE(review): duplicated by hand — keep in sync with LoreError's exit
    // codes, or generate from the enum to avoid drift.
    let exit_codes = serde_json::json!({
        "0": "Success",
        "1": "Internal error / health check failed",
        "2": "Config not found / missing flags",
        "3": "Config invalid",
        "4": "Token not set",
        "5": "GitLab auth failed",
        "6": "Resource not found",
        "7": "Rate limited",
        "8": "Network error",
        "9": "Database locked",
        "10": "Database error",
        "11": "Migration failed",
        "12": "I/O error",
        "13": "Transform error"
    });

    // Recommended command sequences for common agent tasks.
    let workflows = serde_json::json!({
        "first_setup": [
            "lore --robot init --gitlab-url https://gitlab.com --token-env-var GITLAB_TOKEN --projects group/project",
            "lore --robot doctor",
            "lore --robot sync"
        ],
        "daily_sync": [
            "lore --robot sync"
        ],
        "search": [
            "lore --robot search 'query' --mode hybrid"
        ],
        "pre_flight": [
            "lore --robot health"
        ]
    });

    let output = RobotDocsOutput {
        ok: true,
        data: RobotDocsData {
            name: "lore".to_string(),
            version,
            description: "Local GitLab data management with semantic search".to_string(),
            activation: RobotDocsActivation {
                flags: vec!["--robot".to_string(), "-J".to_string(), "--json".to_string()],
                env: "LORE_ROBOT=1".to_string(),
                auto: "Non-TTY stdout".to_string(),
            },
            commands,
            exit_codes,
            error_format: "stderr JSON: {\"error\":{\"code\":\"...\",\"message\":\"...\",\"suggestion\":\"...\"}}".to_string(),
            workflows,
        },
    };

    // Compact for machines, pretty for humans reading the manifest directly.
    if robot_mode {
        println!("{}", serde_json::to_string(&output)?);
    } else {
        println!("{}", serde_json::to_string_pretty(&output)?);
    }

    Ok(())
}

// ============================================================================
// Backward-compat handlers (deprecated, delegate to new handlers)
// ============================================================================