feat(cli): Add search, stats, embed, sync, health, and robot-docs commands

Extends the CLI with six new commands that complete the search pipeline:

- lore search <QUERY>: Hybrid search with mode selection (lexical, hybrid, semantic), rich filtering (--type, --author, --project, --label, --path, --after, --updated-after), result limits, and optional explain mode showing RRF score breakdowns. Safe FTS mode sanitizes user input; raw mode passes through for power users.
- lore stats: Document and index statistics with optional --check for integrity verification and --repair to fix inconsistencies (orphaned documents, missing FTS entries, stale dirty queue items).
- lore embed: Generate vector embeddings via Ollama. Supports --retry-failed to re-attempt previously failed embeddings.
- lore generate-docs: Drain the dirty queue to regenerate documents. --full seeds all entities for complete rebuild. --project scopes to a single project.
- lore sync: Full pipeline orchestration (ingest issues + MRs, generate-docs, embed) with --no-embed and --no-docs flags for partial runs. Reports per-stage results and total elapsed time.
- lore health: Quick pre-flight check (config exists, DB exists, schema current). Returns exit code 1 if unhealthy. Designed for agent pre-flight scripts.
- lore robot-docs: Machine-readable command manifest for agent self-discovery. Returns all commands, flags, examples, exit codes, and recommended workflows as structured JSON.

Also enhances lore init with --gitlab-url, --token-env-var, and --projects flags for fully non-interactive robot-mode initialization. Fixes init's force/non-interactive precedence logic and adds JSON output for robot mode.

Updates all command files for the GiError -> LoreError rename.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 15:47:10 -05:00
parent 559f0702ad
commit daf5a73019
13 changed files with 1930 additions and 95 deletions
--- a/src/cli/commands/stats.rs
+++ b/src/cli/commands/stats.rs
@@ -0,0 +1,348 @@
+//! Stats command: document counts, embedding coverage, queue status, integrity checks.
+
+use console::style;
+use rusqlite::Connection;
+use serde::Serialize;
+
+use crate::core::db::create_connection;
+use crate::core::error::Result;
+use crate::core::paths::get_db_path;
+use crate::Config;
+
+/// Result of the stats command.
+///
+/// Aggregates every section reported by `lore stats`; serialized as-is for robot mode.
+#[derive(Debug, Default, Serialize)]
+pub struct StatsResult {
+    /// Document counts, overall and per source type.
+    pub documents: DocumentStats,
+    /// Embedding coverage figures.
+    pub embeddings: EmbeddingStats,
+    /// Full-text index row count.
+    pub fts: FtsStats,
+    /// Pending/failed counts for the work queues.
+    pub queues: QueueStats,
+    /// Integrity report; `None` (and omitted from JSON) unless a check was requested.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub integrity: Option<IntegrityResult>,
+}
+
+/// Row counts from the `documents` table.
+#[derive(Debug, Default, Serialize)]
+pub struct DocumentStats {
+    /// All rows in `documents`.
+    pub total: i64,
+    /// Rows with `source_type = 'issue'`.
+    pub issues: i64,
+    /// Rows with `source_type = 'merge_request'`.
+    pub merge_requests: i64,
+    /// Rows with `source_type = 'discussion'`.
+    pub discussions: i64,
+    /// Rows with `is_truncated = 1`.
+    pub truncated: i64,
+}
+
+/// Embedding coverage derived from `embedding_metadata`.
+///
+/// All fields stay at their defaults (zero) when that table does not exist.
+#[derive(Debug, Default, Serialize)]
+pub struct EmbeddingStats {
+    /// Distinct `document_id`s that have metadata rows with no recorded error.
+    pub embedded_documents: i64,
+    /// Metadata rows with no recorded error.
+    pub total_chunks: i64,
+    /// `embedded_documents` as a percentage of all documents; `0.0` when there are none.
+    pub coverage_pct: f64,
+}
+
+/// Full-text search index statistics.
+#[derive(Debug, Default, Serialize)]
+pub struct FtsStats {
+    /// Row count of `documents_fts`.
+    pub indexed: i64,
+}
+
+/// Queue depths, split by whether the row carries a `last_error`.
+#[derive(Debug, Default, Serialize)]
+pub struct QueueStats {
+    /// `dirty_sources` rows with no recorded error.
+    pub dirty_sources: i64,
+    /// `dirty_sources` rows with a recorded error.
+    pub dirty_sources_failed: i64,
+    /// `pending_discussion_fetches` rows with no recorded error (zero if the table is absent).
+    pub pending_discussion_fetches: i64,
+    /// `pending_discussion_fetches` rows with a recorded error (zero if the table is absent).
+    pub pending_discussion_fetches_failed: i64,
+}
+
+/// Outcome of the integrity checks run by `run_stats` when `check` is set.
+#[derive(Debug, Default, Serialize)]
+pub struct IntegrityResult {
+    /// True when none of the problems below were detected.
+    pub ok: bool,
+    /// True when the FTS row count differs from the document count.
+    pub fts_doc_mismatch: bool,
+    /// `embedding_metadata` rows whose `document_id` has no matching `documents` row.
+    pub orphan_embeddings: i64,
+    /// Chunk-0 metadata rows whose `document_hash` differs from the document's `content_hash`.
+    pub stale_metadata: i64,
+    /// Repair summary; `None` (and omitted from JSON) unless a repair was requested.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub repair: Option<RepairResult>,
+}
+
+/// What a repair pass actually changed.
+#[derive(Debug, Default, Serialize)]
+pub struct RepairResult {
+    /// True when the FTS `rebuild` command was issued.
+    pub fts_rebuilt: bool,
+    /// Number of orphaned `embedding_metadata` rows deleted.
+    pub orphans_deleted: i64,
+    /// Number of `embedding_metadata` rows deleted because their document content changed.
+    pub stale_cleared: i64,
+}
+
+/// Run the stats command.
+///
+/// Opens the database at the path resolved from `config.storage.db_path`, then collects
+/// document counts, embedding coverage, the FTS row count, and queue depths.
+///
+/// * `check` - also run the integrity checks and attach an [`IntegrityResult`].
+/// * `repair` - only honoured together with `check`; rebuilds the FTS index and deletes
+///   orphaned or stale embedding metadata for whichever problems the check found.
+///
+/// The integrity flags describe the state found *before* any repair ran.
+///
+/// # Errors
+/// Returns an error if the connection cannot be created or if a propagated query or
+/// repair statement fails.
+pub fn run_stats(
+    config: &Config,
+    check: bool,
+    repair: bool,
+) -> Result<StatsResult> {
+    let db_path = get_db_path(config.storage.db_path.as_deref());
+    let conn = create_connection(&db_path)?;
+
+    // Probe the optional table once: nothing below creates or drops tables.
+    let has_embedding_metadata = table_exists(&conn, "embedding_metadata");
+
+    let mut result = StatsResult::default();
+
+    // Document counts
+    result.documents.total = count_query(&conn, "SELECT COUNT(*) FROM documents")?;
+    result.documents.issues =
+        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'issue'")?;
+    result.documents.merge_requests =
+        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'merge_request'")?;
+    result.documents.discussions =
+        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'discussion'")?;
+    result.documents.truncated =
+        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE is_truncated = 1")?;
+
+    // Embedding stats — skip gracefully if table doesn't exist (Gate A only)
+    if has_embedding_metadata {
+        let embedded = count_query(
+            &conn,
+            "SELECT COUNT(DISTINCT document_id) FROM embedding_metadata WHERE last_error IS NULL",
+        )?;
+        let chunks = count_query(
+            &conn,
+            "SELECT COUNT(*) FROM embedding_metadata WHERE last_error IS NULL",
+        )?;
+        result.embeddings.embedded_documents = embedded;
+        result.embeddings.total_chunks = chunks;
+        // Guard the division so an empty database reports 0% instead of NaN.
+        result.embeddings.coverage_pct = if result.documents.total > 0 {
+            (embedded as f64 / result.documents.total as f64) * 100.0
+        } else {
+            0.0
+        };
+    }
+
+    // FTS stats
+    result.fts.indexed = count_query(&conn, "SELECT COUNT(*) FROM documents_fts")?;
+
+    // Queue stats
+    result.queues.dirty_sources =
+        count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NULL")?;
+    result.queues.dirty_sources_failed =
+        count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NOT NULL")?;
+
+    if table_exists(&conn, "pending_discussion_fetches") {
+        result.queues.pending_discussion_fetches = count_query(
+            &conn,
+            "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NULL",
+        )?;
+        result.queues.pending_discussion_fetches_failed = count_query(
+            &conn,
+            "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NOT NULL",
+        )?;
+    }
+
+    // Integrity check
+    if check {
+        let mut integrity = IntegrityResult::default();
+
+        // FTS/doc count mismatch
+        integrity.fts_doc_mismatch = result.fts.indexed != result.documents.total;
+
+        if has_embedding_metadata {
+            // Orphan embeddings: metadata rows pointing at a document that no longer exists.
+            // The guard must be on `embedding_metadata`, the table this query reads.
+            integrity.orphan_embeddings = count_query(
+                &conn,
+                "SELECT COUNT(*) FROM embedding_metadata em
+                 WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = em.document_id)",
+            )?;
+
+            // Stale metadata (document_hash != current content_hash)
+            integrity.stale_metadata = count_query(
+                &conn,
+                "SELECT COUNT(*) FROM embedding_metadata em
+                 JOIN documents d ON d.id = em.document_id
+                 WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash",
+            )?;
+        }
+
+        integrity.ok = !integrity.fts_doc_mismatch
+            && integrity.orphan_embeddings == 0
+            && integrity.stale_metadata == 0;
+
+        // Repair
+        if repair {
+            let mut repair_result = RepairResult::default();
+
+            if integrity.fts_doc_mismatch {
+                // FTS5 special command: rebuild the index from the content table.
+                conn.execute(
+                    "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')",
+                    [],
+                )?;
+                repair_result.fts_rebuilt = true;
+            }
+
+            // A non-zero count can only come from the guarded checks above, so the
+            // metadata table is known to exist in both branches below.
+            if integrity.orphan_embeddings > 0 {
+                let deleted = conn.execute(
+                    "DELETE FROM embedding_metadata
+                     WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = embedding_metadata.document_id)",
+                    [],
+                )?;
+                repair_result.orphans_deleted = deleted as i64;
+
+                // Also clean orphaned vectors if vec0 table exists. Best-effort: a failure
+                // here is ignored so it cannot mask the metadata repair that already succeeded.
+                if table_exists(&conn, "embeddings") {
+                    let _ = conn.execute(
+                        "DELETE FROM embeddings
+                         WHERE rowid / 1000 NOT IN (SELECT id FROM documents)",
+                        [],
+                    );
+                }
+            }
+
+            if integrity.stale_metadata > 0 {
+                // Staleness is detected on chunk 0, but every chunk of the document is removed.
+                let cleared = conn.execute(
+                    "DELETE FROM embedding_metadata
+                     WHERE document_id IN (
+                         SELECT em.document_id FROM embedding_metadata em
+                         JOIN documents d ON d.id = em.document_id
+                         WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash
+                     )",
+                    [],
+                )?;
+                repair_result.stale_cleared = cleared as i64;
+            }
+
+            integrity.repair = Some(repair_result);
+        }
+
+        result.integrity = Some(integrity);
+    }
+
+    Ok(result)
+}
+
+/// Run a query that yields a single integer (typically `SELECT COUNT(*) ...`) and return it.
+///
+/// # Errors
+/// Propagates the SQLite error (missing table, malformed SQL, non-integer first column)
+/// instead of reporting it as a count of zero, so a broken schema cannot pass for an
+/// empty one. Callers that want to tolerate an absent table should check for it first.
+fn count_query(conn: &Connection, sql: &str) -> Result<i64> {
+    let count: i64 = conn.query_row(sql, [], |row| row.get(0))?;
+    Ok(count)
+}
+
+/// Report whether `sqlite_master` lists a table called `table`.
+///
+/// A failed lookup is treated the same as "no such table".
+fn table_exists(conn: &Connection, table: &str) -> bool {
+    let lookup = conn.query_row(
+        "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
+        [table],
+        |row| row.get::<_, i64>(0),
+    );
+    matches!(lookup, Ok(found) if found > 0)
+}
+
+/// Print human-readable stats.
+///
+/// Writes the documents, search index, and queue sections to stdout. When the result
+/// carries an integrity report, the check verdict follows, and after it the repair
+/// summary if one is present.
+pub fn print_stats(result: &StatsResult) {
+    let heading = |title: &'static str| println!("{}", style(title).cyan().bold());
+    let StatsResult {
+        documents,
+        embeddings,
+        fts,
+        queues,
+        integrity,
+    } = result;
+
+    heading("Documents");
+    println!("  Total:           {}", documents.total);
+    println!("  Issues:          {}", documents.issues);
+    println!("  Merge Requests:  {}", documents.merge_requests);
+    println!("  Discussions:     {}", documents.discussions);
+    if documents.truncated > 0 {
+        println!("  Truncated:       {}", style(documents.truncated).yellow());
+    }
+    println!();
+
+    heading("Search Index");
+    println!("  FTS indexed:     {}", fts.indexed);
+    println!(
+        "  Embedding coverage: {:.1}% ({}/{})",
+        embeddings.coverage_pct, embeddings.embedded_documents, documents.total
+    );
+    if embeddings.total_chunks > 0 {
+        println!("  Total chunks:    {}", embeddings.total_chunks);
+    }
+    println!();
+
+    heading("Queues");
+    println!(
+        "  Dirty sources:   {} pending, {} failed",
+        queues.dirty_sources, queues.dirty_sources_failed
+    );
+    println!(
+        "  Discussion fetch: {} pending, {} failed",
+        queues.pending_discussion_fetches, queues.pending_discussion_fetches_failed
+    );
+
+    // Everything below only applies when an integrity check was run.
+    let Some(report) = integrity else {
+        return;
+    };
+
+    println!();
+    let verdict = if report.ok {
+        style("OK").green()
+    } else {
+        style("ISSUES FOUND").red()
+    };
+    println!(
+        "{} Integrity: {}",
+        style("Check").cyan().bold(),
+        verdict.bold()
+    );
+
+    let bang = || style("!").red();
+    if report.fts_doc_mismatch {
+        println!("  {} FTS/document count mismatch", bang());
+    }
+    if report.orphan_embeddings > 0 {
+        println!("  {} {} orphan embeddings", bang(), report.orphan_embeddings);
+    }
+    if report.stale_metadata > 0 {
+        println!("  {} {} stale embedding metadata", bang(), report.stale_metadata);
+    }
+
+    let Some(fix) = &report.repair else {
+        return;
+    };
+
+    println!();
+    heading("Repair");
+    let fixed = || style("fixed").green();
+    if fix.fts_rebuilt {
+        println!("  {} FTS index rebuilt", fixed());
+    }
+    if fix.orphans_deleted > 0 {
+        println!("  {} {} orphan embeddings deleted", fixed(), fix.orphans_deleted);
+    }
+    if fix.stale_cleared > 0 {
+        println!("  {} {} stale metadata entries cleared", fixed(), fix.stale_cleared);
+    }
+
+    let anything_changed = fix.fts_rebuilt || fix.orphans_deleted != 0 || fix.stale_cleared != 0;
+    if !anything_changed {
+        println!("  No issues to repair.");
+    }
+}
+
+/// JSON envelope for robot mode: an `ok` flag plus the stats payload under `data`.
+///
+/// Borrows the result so printing never has to copy the stats field by field.
+#[derive(Serialize)]
+struct StatsJsonOutput<'a> {
+    ok: bool,
+    data: &'a StatsResult,
+}
+
+/// Print JSON robot-mode output.
+///
+/// Emits a single line of compact JSON on stdout with `ok` set to `true` and the
+/// serialized `result` under `data`.
+///
+/// # Panics
+/// Panics only if serialization fails, which would indicate a bug in the `Serialize`
+/// implementations of the stats structs.
+pub fn print_stats_json(result: &StatsResult) {
+    let output = StatsJsonOutput {
+        ok: true,
+        data: result,
+    };
+    let json = serde_json::to_string(&output)
+        .expect("stats structs hold only plain numeric/bool fields and always serialize");
+    println!("{}", json);
+}