feat(cli): Add search, stats, embed, sync, health, and robot-docs commands
Extends the CLI with seven new commands that complete the search pipeline: - lore search <QUERY>: Hybrid search with mode selection (lexical, hybrid, semantic), rich filtering (--type, --author, --project, --label, --path, --after, --updated-after), result limits, and optional explain mode showing RRF score breakdowns. Safe FTS mode sanitizes user input; raw mode passes through for power users. - lore stats: Document and index statistics with optional --check for integrity verification and --repair to fix inconsistencies (orphaned documents, missing FTS entries, stale dirty queue items). - lore embed: Generate vector embeddings via Ollama. Supports --retry-failed to re-attempt previously failed embeddings. - lore generate-docs: Drain the dirty queue to regenerate documents. --full seeds all entities for a complete rebuild. --project scopes to a single project. - lore sync: Full pipeline orchestration (ingest issues + MRs, generate-docs, embed) with --no-embed and --no-docs flags for partial runs. Reports per-stage results and total elapsed time. - lore health: Quick pre-flight check (config exists, DB exists, schema current). Returns exit code 1 if unhealthy. Designed for agent pre-flight scripts. - lore robot-docs: Machine-readable command manifest for agent self-discovery. Returns all commands, flags, examples, exit codes, and recommended workflows as structured JSON. Also enhances lore init with --gitlab-url, --token-env-var, and --projects flags for fully non-interactive robot-mode initialization. Fixes init's force/non-interactive precedence logic and adds JSON output for robot mode. Updates all command files for the GiError -> LoreError rename. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
205
src/cli/commands/generate_docs.rs
Normal file
205
src/cli/commands/generate_docs.rs
Normal file
@@ -0,0 +1,205 @@
|
||||
//! Generate searchable documents from ingested GitLab data.
|
||||
|
||||
use console::style;
|
||||
use rusqlite::Connection;
|
||||
use serde::Serialize;
|
||||
use tracing::info;
|
||||
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::documents::{regenerate_dirty_documents, SourceType};
|
||||
use crate::Config;
|
||||
|
||||
/// Maximum number of source rows selected per seeding batch in full mode
/// (bound as the SQL `LIMIT` of each chunk in `seed_dirty`).
const FULL_MODE_CHUNK_SIZE: i64 = 2000;
|
||||
|
||||
/// Outcome counters for a single generate-docs invocation.
#[derive(Debug, Default)]
pub struct GenerateDocsResult {
    /// Number of documents reported as regenerated by the drain step.
    pub regenerated: usize,
    /// Number of documents reported as unchanged by the drain step.
    pub unchanged: usize,
    /// Number of documents reported as errored by the drain step.
    pub errored: usize,
    /// Number of rows newly queued into `dirty_sources` during full-mode seeding.
    pub seeded: usize,
    /// Whether this run was started in full mode.
    pub full_mode: bool,
}
|
||||
|
||||
/// Run the generate-docs pipeline.
|
||||
///
|
||||
/// Default mode: process only existing dirty_sources entries.
|
||||
/// Full mode: seed dirty_sources with ALL entities, then drain.
|
||||
pub fn run_generate_docs(
|
||||
config: &Config,
|
||||
full: bool,
|
||||
project_filter: Option<&str>,
|
||||
) -> Result<GenerateDocsResult> {
|
||||
let db_path = get_db_path(config.storage.db_path.as_deref());
|
||||
let conn = create_connection(&db_path)?;
|
||||
let mut result = GenerateDocsResult {
|
||||
full_mode: full,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
if full {
|
||||
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
|
||||
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
|
||||
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
|
||||
}
|
||||
|
||||
let regen = regenerate_dirty_documents(&conn)?;
|
||||
result.regenerated = regen.regenerated;
|
||||
result.unchanged = regen.unchanged;
|
||||
result.errored = regen.errored;
|
||||
|
||||
if full {
|
||||
// Optimize FTS index after bulk rebuild
|
||||
let _ = conn.execute(
|
||||
"INSERT INTO documents_fts(documents_fts) VALUES('optimize')",
|
||||
[],
|
||||
);
|
||||
info!("FTS index optimized after full rebuild");
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Seed dirty_sources with all entities of the given type using keyset pagination.
|
||||
fn seed_dirty(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
project_filter: Option<&str>,
|
||||
) -> Result<usize> {
|
||||
let table = match source_type {
|
||||
SourceType::Issue => "issues",
|
||||
SourceType::MergeRequest => "merge_requests",
|
||||
SourceType::Discussion => "discussions",
|
||||
};
|
||||
let type_str = source_type.as_str();
|
||||
let now = chrono::Utc::now().timestamp_millis();
|
||||
|
||||
let mut total_seeded: usize = 0;
|
||||
let mut last_id: i64 = 0;
|
||||
|
||||
loop {
|
||||
let inserted = if let Some(project) = project_filter {
|
||||
// Resolve project to ID for filtering
|
||||
let project_id: Option<i64> = conn
|
||||
.query_row(
|
||||
"SELECT id FROM projects WHERE path_with_namespace = ?1 COLLATE NOCASE",
|
||||
[project],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.ok();
|
||||
|
||||
let Some(pid) = project_id else {
|
||||
break;
|
||||
};
|
||||
|
||||
conn.execute(
|
||||
&format!(
|
||||
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
|
||||
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
|
||||
FROM {table} WHERE id > ?3 AND project_id = ?4 ORDER BY id LIMIT ?5
|
||||
ON CONFLICT(source_type, source_id) DO NOTHING"
|
||||
),
|
||||
rusqlite::params![type_str, now, last_id, pid, FULL_MODE_CHUNK_SIZE],
|
||||
)?
|
||||
} else {
|
||||
conn.execute(
|
||||
&format!(
|
||||
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
|
||||
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
|
||||
FROM {table} WHERE id > ?3 ORDER BY id LIMIT ?4
|
||||
ON CONFLICT(source_type, source_id) DO NOTHING"
|
||||
),
|
||||
rusqlite::params![type_str, now, last_id, FULL_MODE_CHUNK_SIZE],
|
||||
)?
|
||||
};
|
||||
|
||||
if inserted == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
// Advance keyset cursor to the max id within the chunk window
|
||||
let max_id: i64 = conn.query_row(
|
||||
&format!(
|
||||
"SELECT MAX(id) FROM (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)",
|
||||
table = table
|
||||
),
|
||||
rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
total_seeded += inserted;
|
||||
last_id = max_id;
|
||||
}
|
||||
|
||||
info!(
|
||||
source_type = type_str,
|
||||
seeded = total_seeded,
|
||||
"Seeded dirty_sources"
|
||||
);
|
||||
|
||||
Ok(total_seeded)
|
||||
}
|
||||
|
||||
/// Print human-readable output.
|
||||
pub fn print_generate_docs(result: &GenerateDocsResult) {
|
||||
let mode = if result.full_mode { "full" } else { "incremental" };
|
||||
println!(
|
||||
"{} Document generation complete ({})",
|
||||
style("done").green().bold(),
|
||||
mode
|
||||
);
|
||||
|
||||
if result.full_mode {
|
||||
println!(" Seeded: {}", result.seeded);
|
||||
}
|
||||
println!(" Regenerated: {}", result.regenerated);
|
||||
println!(" Unchanged: {}", result.unchanged);
|
||||
if result.errored > 0 {
|
||||
println!(
|
||||
" Errored: {}",
|
||||
style(result.errored).red()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// JSON output structures.
|
||||
#[derive(Serialize)]
|
||||
struct GenerateDocsJsonOutput {
|
||||
ok: bool,
|
||||
data: GenerateDocsJsonData,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct GenerateDocsJsonData {
|
||||
mode: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
seeded: Option<usize>,
|
||||
regenerated: usize,
|
||||
unchanged: usize,
|
||||
errored: usize,
|
||||
}
|
||||
|
||||
/// Print JSON robot-mode output.
|
||||
pub fn print_generate_docs_json(result: &GenerateDocsResult) {
|
||||
let output = GenerateDocsJsonOutput {
|
||||
ok: true,
|
||||
data: GenerateDocsJsonData {
|
||||
mode: if result.full_mode {
|
||||
"full".to_string()
|
||||
} else {
|
||||
"incremental".to_string()
|
||||
},
|
||||
seeded: if result.full_mode {
|
||||
Some(result.seeded)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
regenerated: result.regenerated,
|
||||
unchanged: result.unchanged,
|
||||
errored: result.errored,
|
||||
},
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
Reference in New Issue
Block a user