feat(cli): Add search, stats, embed, sync, health, and robot-docs commands

Extends the CLI with six new commands that complete the search pipeline:

- lore search <QUERY>: Hybrid search with mode selection (lexical,
  hybrid, semantic), rich filtering (--type, --author, --project,
  --label, --path, --after, --updated-after), result limits, and
  optional explain mode showing RRF score breakdowns. Safe FTS mode
  sanitizes user input; raw mode passes through for power users.

- lore stats: Document and index statistics with optional --check
  for integrity verification and --repair to fix inconsistencies
  (orphaned documents, missing FTS entries, stale dirty queue items).

- lore embed: Generate vector embeddings via Ollama. Supports
  --retry-failed to re-attempt previously failed embeddings.

- lore generate-docs: Drain the dirty queue to regenerate documents.
  --full seeds all entities for complete rebuild. --project scopes
  to a single project.

- lore sync: Full pipeline orchestration (ingest issues + MRs,
  generate-docs, embed) with --no-embed and --no-docs flags for
  partial runs. Reports per-stage results and total elapsed time.

- lore health: Quick pre-flight check (config exists, DB exists,
  schema current). Returns exit code 1 if unhealthy. Designed for
  agent pre-flight scripts.

- lore robot-docs: Machine-readable command manifest for agent
  self-discovery. Returns all commands, flags, examples, exit codes,
  and recommended workflows as structured JSON.

Also enhances lore init with --gitlab-url, --token-env-var, and
--projects flags for fully non-interactive robot-mode initialization.
Fixes init's force/non-interactive precedence logic and adds JSON
output for robot mode.

Updates all command files for the GiError -> LoreError rename.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:47:10 -05:00
parent 559f0702ad
commit daf5a73019
13 changed files with 1930 additions and 95 deletions

View File

@@ -0,0 +1,205 @@
//! Generate searchable documents from ingested GitLab data.
use console::style;
use rusqlite::Connection;
use serde::Serialize;
use tracing::info;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::paths::get_db_path;
use crate::documents::{regenerate_dirty_documents, SourceType};
use crate::Config;
const FULL_MODE_CHUNK_SIZE: i64 = 2000;
/// Result of a generate-docs run.
#[derive(Debug, Default)]
pub struct GenerateDocsResult {
pub regenerated: usize,
pub unchanged: usize,
pub errored: usize,
pub seeded: usize,
pub full_mode: bool,
}
/// Run the generate-docs pipeline.
///
/// Default mode: process only existing dirty_sources entries.
/// Full mode: seed dirty_sources with ALL entities, then drain.
pub fn run_generate_docs(
config: &Config,
full: bool,
project_filter: Option<&str>,
) -> Result<GenerateDocsResult> {
let db_path = get_db_path(config.storage.db_path.as_deref());
let conn = create_connection(&db_path)?;
let mut result = GenerateDocsResult {
full_mode: full,
..Default::default()
};
if full {
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
}
let regen = regenerate_dirty_documents(&conn)?;
result.regenerated = regen.regenerated;
result.unchanged = regen.unchanged;
result.errored = regen.errored;
if full {
// Optimize FTS index after bulk rebuild
let _ = conn.execute(
"INSERT INTO documents_fts(documents_fts) VALUES('optimize')",
[],
);
info!("FTS index optimized after full rebuild");
}
Ok(result)
}
/// Seed dirty_sources with all entities of the given type using keyset pagination.
fn seed_dirty(
conn: &Connection,
source_type: SourceType,
project_filter: Option<&str>,
) -> Result<usize> {
let table = match source_type {
SourceType::Issue => "issues",
SourceType::MergeRequest => "merge_requests",
SourceType::Discussion => "discussions",
};
let type_str = source_type.as_str();
let now = chrono::Utc::now().timestamp_millis();
let mut total_seeded: usize = 0;
let mut last_id: i64 = 0;
loop {
let inserted = if let Some(project) = project_filter {
// Resolve project to ID for filtering
let project_id: Option<i64> = conn
.query_row(
"SELECT id FROM projects WHERE path_with_namespace = ?1 COLLATE NOCASE",
[project],
|row| row.get(0),
)
.ok();
let Some(pid) = project_id else {
break;
};
conn.execute(
&format!(
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
FROM {table} WHERE id > ?3 AND project_id = ?4 ORDER BY id LIMIT ?5
ON CONFLICT(source_type, source_id) DO NOTHING"
),
rusqlite::params![type_str, now, last_id, pid, FULL_MODE_CHUNK_SIZE],
)?
} else {
conn.execute(
&format!(
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
FROM {table} WHERE id > ?3 ORDER BY id LIMIT ?4
ON CONFLICT(source_type, source_id) DO NOTHING"
),
rusqlite::params![type_str, now, last_id, FULL_MODE_CHUNK_SIZE],
)?
};
if inserted == 0 {
break;
}
// Advance keyset cursor to the max id within the chunk window
let max_id: i64 = conn.query_row(
&format!(
"SELECT MAX(id) FROM (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)",
table = table
),
rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
|row| row.get(0),
)?;
total_seeded += inserted;
last_id = max_id;
}
info!(
source_type = type_str,
seeded = total_seeded,
"Seeded dirty_sources"
);
Ok(total_seeded)
}
/// Print human-readable output.
pub fn print_generate_docs(result: &GenerateDocsResult) {
let mode = if result.full_mode { "full" } else { "incremental" };
println!(
"{} Document generation complete ({})",
style("done").green().bold(),
mode
);
if result.full_mode {
println!(" Seeded: {}", result.seeded);
}
println!(" Regenerated: {}", result.regenerated);
println!(" Unchanged: {}", result.unchanged);
if result.errored > 0 {
println!(
" Errored: {}",
style(result.errored).red()
);
}
}
/// JSON output structures.
#[derive(Serialize)]
struct GenerateDocsJsonOutput {
ok: bool,
data: GenerateDocsJsonData,
}
#[derive(Serialize)]
struct GenerateDocsJsonData {
mode: String,
#[serde(skip_serializing_if = "Option::is_none")]
seeded: Option<usize>,
regenerated: usize,
unchanged: usize,
errored: usize,
}
/// Print JSON robot-mode output.
pub fn print_generate_docs_json(result: &GenerateDocsResult) {
let output = GenerateDocsJsonOutput {
ok: true,
data: GenerateDocsJsonData {
mode: if result.full_mode {
"full".to_string()
} else {
"incremental".to_string()
},
seeded: if result.full_mode {
Some(result.seeded)
} else {
None
},
regenerated: result.regenerated,
unchanged: result.unchanged,
errored: result.errored,
},
};
println!("{}", serde_json::to_string(&output).unwrap());
}