feat(cli): Add search, stats, embed, sync, health, and robot-docs commands

Extends the CLI with six new commands that complete the search pipeline: - lore search <QUERY>: Hybrid search with mode selection (lexical, hybrid, semantic), rich filtering (--type, --author, --project, --label, --path, --after, --updated-after), result limits, and optional explain mode showing RRF score breakdowns. Safe FTS mode sanitizes user input; raw mode passes through for power users. - lore stats: Document and index statistics with optional --check for integrity verification and --repair to fix inconsistencies (orphaned documents, missing FTS entries, stale dirty queue items). - lore embed: Generate vector embeddings via Ollama. Supports --retry-failed to re-attempt previously failed embeddings. - lore generate-docs: Drain the dirty queue to regenerate documents. --full seeds all entities for complete rebuild. --project scopes to a single project. - lore sync: Full pipeline orchestration (ingest issues + MRs, generate-docs, embed) with --no-embed and --no-docs flags for partial runs. Reports per-stage results and total elapsed time. - lore health: Quick pre-flight check (config exists, DB exists, schema current). Returns exit code 1 if unhealthy. Designed for agent pre-flight scripts. - lore robot-docs: Machine-readable command manifest for agent self-discovery. Returns all commands, flags, examples, exit codes, and recommended workflows as structured JSON. Also enhances lore init with --gitlab-url, --token-env-var, and --projects flags for fully non-interactive robot-mode initialization. Fixes init's force/non-interactive precedence logic and adds JSON output for robot mode. Updates all command files for the GiError -> LoreError rename. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 15:47:10 -05:00
parent 559f0702ad
commit daf5a73019
13 changed files with 1930 additions and 95 deletions
--- a/src/cli/commands/search.rs
+++ b/src/cli/commands/search.rs
@@ -0,0 +1,402 @@
+//! Search command: lexical (FTS5) search with filter support and single-query hydration.
+
+use console::style;
+use serde::Serialize;
+
+use crate::core::db::create_connection;
+use crate::core::error::{LoreError, Result};
+use crate::core::paths::get_db_path;
+use crate::core::project::resolve_project;
+use crate::core::time::{ms_to_iso, parse_since};
+use crate::documents::SourceType;
+use crate::search::{
+    apply_filters, get_result_snippet, rank_rrf, search_fts, FtsQueryMode, PathFilter,
+    SearchFilters,
+};
+use crate::Config;
+
+/// Display-ready search result with all fields hydrated.
+#[derive(Debug, Serialize)]
+pub struct SearchResultDisplay {
+    pub document_id: i64,
+    pub source_type: String,
+    pub title: String,
+    pub url: Option<String>,
+    pub author: Option<String>,
+    pub created_at: Option<String>,
+    pub updated_at: Option<String>,
+    pub project_path: String,
+    pub labels: Vec<String>,
+    pub paths: Vec<String>,
+    pub snippet: String,
+    pub score: f64,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub explain: Option<ExplainData>,
+}
+
+/// Ranking explanation for --explain output.
+#[derive(Debug, Serialize)]
+pub struct ExplainData {
+    pub vector_rank: Option<usize>,
+    pub fts_rank: Option<usize>,
+    pub rrf_score: f64,
+}
+
+/// Search response wrapper.
+#[derive(Debug, Serialize)]
+pub struct SearchResponse {
+    pub query: String,
+    pub mode: String,
+    pub total_results: usize,
+    pub results: Vec<SearchResultDisplay>,
+    pub warnings: Vec<String>,
+}
+
+/// Build SearchFilters from CLI args.
+pub struct SearchCliFilters {
+    pub source_type: Option<String>,
+    pub author: Option<String>,
+    pub project: Option<String>,
+    pub labels: Vec<String>,
+    pub path: Option<String>,
+    pub after: Option<String>,
+    pub updated_after: Option<String>,
+    pub limit: usize,
+}
+
+/// Run a lexical search query.
+pub fn run_search(
+    config: &Config,
+    query: &str,
+    cli_filters: SearchCliFilters,
+    fts_mode: FtsQueryMode,
+    explain: bool,
+) -> Result<SearchResponse> {
+    let db_path = get_db_path(config.storage.db_path.as_deref());
+    let conn = create_connection(&db_path)?;
+
+    // Check if any documents exist
+    let doc_count: i64 = conn
+        .query_row("SELECT COUNT(*) FROM documents", [], |row| row.get(0))
+        .unwrap_or(0);
+
+    if doc_count == 0 {
+        return Ok(SearchResponse {
+            query: query.to_string(),
+            mode: "lexical".to_string(),
+            total_results: 0,
+            results: vec![],
+            warnings: vec![
+                "No documents indexed. Run 'lore generate-docs' first.".to_string()
+            ],
+        });
+    }
+
+    // Build filters
+    let source_type = cli_filters
+        .source_type
+        .as_deref()
+        .and_then(SourceType::parse);
+
+    let project_id = cli_filters
+        .project
+        .as_deref()
+        .map(|p| resolve_project(&conn, p))
+        .transpose()?;
+
+    let after = cli_filters.after.as_deref().and_then(parse_since);
+    let updated_after = cli_filters.updated_after.as_deref().and_then(parse_since);
+
+    let path = cli_filters.path.as_deref().map(|p| {
+        if p.ends_with('/') {
+            PathFilter::Prefix(p.to_string())
+        } else {
+            PathFilter::Exact(p.to_string())
+        }
+    });
+
+    let filters = SearchFilters {
+        source_type,
+        author: cli_filters.author,
+        project_id,
+        after,
+        updated_after,
+        labels: cli_filters.labels,
+        path,
+        limit: cli_filters.limit,
+    };
+
+    // Adaptive recall: wider initial fetch when filters applied
+    let requested = filters.clamp_limit();
+    let top_k = if filters.has_any_filter() {
+        (requested * 50).max(200).min(1500)
+    } else {
+        (requested * 10).max(50).min(1500)
+    };
+
+    // FTS search
+    let fts_results = search_fts(&conn, query, top_k, fts_mode)?;
+    let fts_tuples: Vec<(i64, f64)> = fts_results
+        .iter()
+        .map(|r| (r.document_id, r.bm25_score))
+        .collect();
+
+    // Build snippet map before ranking
+    let snippet_map: std::collections::HashMap<i64, String> = fts_results
+        .iter()
+        .map(|r| (r.document_id, r.snippet.clone()))
+        .collect();
+
+    // RRF ranking (single-list for lexical mode)
+    let ranked = rank_rrf(&[], &fts_tuples);
+    let ranked_ids: Vec<i64> = ranked.iter().map(|r| r.document_id).collect();
+
+    // Apply post-retrieval filters
+    let filtered_ids = apply_filters(&conn, &ranked_ids, &filters)?;
+
+    if filtered_ids.is_empty() {
+        return Ok(SearchResponse {
+            query: query.to_string(),
+            mode: "lexical".to_string(),
+            total_results: 0,
+            results: vec![],
+            warnings: vec![],
+        });
+    }
+
+    // Hydrate results in single round-trip
+    let hydrated = hydrate_results(&conn, &filtered_ids)?;
+
+    // Build display results preserving filter order
+    let rrf_map: std::collections::HashMap<i64, &crate::search::RrfResult> = ranked
+        .iter()
+        .map(|r| (r.document_id, r))
+        .collect();
+
+    let mut results: Vec<SearchResultDisplay> = Vec::with_capacity(hydrated.len());
+    for row in &hydrated {
+        let rrf = rrf_map.get(&row.document_id);
+        let fts_snippet = snippet_map.get(&row.document_id).map(|s| s.as_str());
+        let snippet = get_result_snippet(fts_snippet, &row.content_text);
+
+        let explain_data = if explain {
+            rrf.map(|r| ExplainData {
+                vector_rank: r.vector_rank,
+                fts_rank: r.fts_rank,
+                rrf_score: r.rrf_score,
+            })
+        } else {
+            None
+        };
+
+        results.push(SearchResultDisplay {
+            document_id: row.document_id,
+            source_type: row.source_type.clone(),
+            title: row.title.clone(),
+            url: row.url.clone(),
+            author: row.author.clone(),
+            created_at: row.created_at.map(ms_to_iso),
+            updated_at: row.updated_at.map(ms_to_iso),
+            project_path: row.project_path.clone(),
+            labels: row.labels.clone(),
+            paths: row.paths.clone(),
+            snippet,
+            score: rrf.map(|r| r.normalized_score).unwrap_or(0.0),
+            explain: explain_data,
+        });
+    }
+
+    Ok(SearchResponse {
+        query: query.to_string(),
+        mode: "lexical".to_string(),
+        total_results: results.len(),
+        results,
+        warnings: vec![],
+    })
+}
+
+/// Raw row from hydration query.
+struct HydratedRow {
+    document_id: i64,
+    source_type: String,
+    title: String,
+    url: Option<String>,
+    author: Option<String>,
+    created_at: Option<i64>,
+    updated_at: Option<i64>,
+    content_text: String,
+    project_path: String,
+    labels: Vec<String>,
+    paths: Vec<String>,
+}
+
+/// Hydrate document IDs into full display rows in a single query.
+///
+/// Uses json_each() to pass ranked IDs and preserve ordering via ORDER BY j.key.
+/// Labels and paths fetched via correlated json_group_array subqueries.
+fn hydrate_results(
+    conn: &rusqlite::Connection,
+    document_ids: &[i64],
+) -> Result<Vec<HydratedRow>> {
+    if document_ids.is_empty() {
+        return Ok(Vec::new());
+    }
+
+    let ids_json = serde_json::to_string(document_ids)
+        .map_err(|e| LoreError::Other(e.to_string()))?;
+
+    let sql = r#"
+        SELECT d.id, d.source_type, d.title, d.url, d.author_username,
+               d.created_at, d.updated_at, d.content_text,
+               p.path_with_namespace AS project_path,
+               (SELECT json_group_array(dl.label_name)
+                FROM document_labels dl WHERE dl.document_id = d.id) AS labels_json,
+               (SELECT json_group_array(dp.path)
+                FROM document_paths dp WHERE dp.document_id = d.id) AS paths_json
+        FROM json_each(?1) AS j
+        JOIN documents d ON d.id = j.value
+        JOIN projects p ON p.id = d.project_id
+        ORDER BY j.key
+    "#;
+
+    let mut stmt = conn.prepare(sql)?;
+    let rows = stmt
+        .query_map([ids_json], |row| {
+            let labels_json: String = row.get(9)?;
+            let paths_json: String = row.get(10)?;
+
+            Ok(HydratedRow {
+                document_id: row.get(0)?,
+                source_type: row.get(1)?,
+                title: row.get(2)?,
+                url: row.get(3)?,
+                author: row.get(4)?,
+                created_at: row.get(5)?,
+                updated_at: row.get(6)?,
+                content_text: row.get(7)?,
+                project_path: row.get(8)?,
+                labels: parse_json_array(&labels_json),
+                paths: parse_json_array(&paths_json),
+            })
+        })?
+        .collect::<std::result::Result<Vec<_>, _>>()?;
+
+    Ok(rows)
+}
+
+/// Parse a JSON array string into a Vec<String>, filtering out null/empty.
+fn parse_json_array(json: &str) -> Vec<String> {
+    serde_json::from_str::<Vec<serde_json::Value>>(json)
+        .unwrap_or_default()
+        .into_iter()
+        .filter_map(|v| v.as_str().map(|s| s.to_string()))
+        .filter(|s| !s.is_empty())
+        .collect()
+}
+
+/// Print human-readable search results.
+pub fn print_search_results(response: &SearchResponse) {
+    if !response.warnings.is_empty() {
+        for w in &response.warnings {
+            eprintln!("{} {}", style("Warning:").yellow(), w);
+        }
+    }
+
+    if response.results.is_empty() {
+        println!(
+            "No results found for '{}'",
+            style(&response.query).bold()
+        );
+        return;
+    }
+
+    println!(
+        "{} results for '{}' ({})",
+        response.total_results,
+        style(&response.query).bold(),
+        response.mode
+    );
+    println!();
+
+    for (i, result) in response.results.iter().enumerate() {
+        let type_prefix = match result.source_type.as_str() {
+            "issue" => "Issue",
+            "merge_request" => "MR",
+            "discussion" => "Discussion",
+            _ => &result.source_type,
+        };
+
+        println!(
+            "[{}] {} - {} (score: {:.2})",
+            i + 1,
+            style(type_prefix).cyan(),
+            result.title,
+            result.score
+        );
+
+        if let Some(ref url) = result.url {
+            println!("    {}", style(url).dim());
+        }
+
+        println!(
+            "    {} | {}",
+            style(&result.project_path).dim(),
+            result
+                .author
+                .as_deref()
+                .map(|a| format!("@{}", a))
+                .unwrap_or_default()
+        );
+
+        if !result.labels.is_empty() {
+            println!(
+                "    Labels: {}",
+                result.labels.join(", ")
+            );
+        }
+
+        // Strip HTML tags from snippet for terminal display
+        let clean_snippet = result
+            .snippet
+            .replace("<mark>", "")
+            .replace("</mark>", "");
+        println!("    {}", style(clean_snippet).dim());
+
+        if let Some(ref explain) = result.explain {
+            println!(
+                "    {} fts_rank={} rrf_score={:.6}",
+                style("[explain]").magenta(),
+                explain
+                    .fts_rank
+                    .map(|r| r.to_string())
+                    .unwrap_or_else(|| "-".into()),
+                explain.rrf_score
+            );
+        }
+
+        println!();
+    }
+}
+
+/// JSON output structures.
+#[derive(Serialize)]
+struct SearchJsonOutput<'a> {
+    ok: bool,
+    data: &'a SearchResponse,
+    meta: SearchMeta,
+}
+
+#[derive(Serialize)]
+struct SearchMeta {
+    elapsed_ms: u64,
+}
+
+/// Print JSON robot-mode output.
+pub fn print_search_results_json(response: &SearchResponse, elapsed_ms: u64) {
+    let output = SearchJsonOutput {
+        ok: true,
+        data: response,
+        meta: SearchMeta { elapsed_ms },
+    };
+    println!("{}", serde_json::to_string(&output).unwrap());
+}