feat(cli): Add search, stats, embed, sync, health, and robot-docs commands
Extends the CLI with six new commands that complete the search pipeline: - lore search <QUERY>: Hybrid search with mode selection (lexical, hybrid, semantic), rich filtering (--type, --author, --project, --label, --path, --after, --updated-after), result limits, and optional explain mode showing RRF score breakdowns. Safe FTS mode sanitizes user input; raw mode passes through for power users. - lore stats: Document and index statistics with optional --check for integrity verification and --repair to fix inconsistencies (orphaned documents, missing FTS entries, stale dirty queue items). - lore embed: Generate vector embeddings via Ollama. Supports --retry-failed to re-attempt previously failed embeddings. - lore generate-docs: Drain the dirty queue to regenerate documents. --full seeds all entities for complete rebuild. --project scopes to a single project. - lore sync: Full pipeline orchestration (ingest issues + MRs, generate-docs, embed) with --no-embed and --no-docs flags for partial runs. Reports per-stage results and total elapsed time. - lore health: Quick pre-flight check (config exists, DB exists, schema current). Returns exit code 1 if unhealthy. Designed for agent pre-flight scripts. - lore robot-docs: Machine-readable command manifest for agent self-discovery. Returns all commands, flags, examples, exit codes, and recommended workflows as structured JSON. Also enhances lore init with --gitlab-url, --token-env-var, and --projects flags for fully non-interactive robot-mode initialization. Fixes init's force/non-interactive precedence logic and adds JSON output for robot mode. Updates all command files for the GiError -> LoreError rename. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
402
src/cli/commands/search.rs
Normal file
402
src/cli/commands/search.rs
Normal file
@@ -0,0 +1,402 @@
|
||||
//! Search command: lexical (FTS5) search with filter support and single-query hydration.
|
||||
|
||||
use console::style;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::core::project::resolve_project;
|
||||
use crate::core::time::{ms_to_iso, parse_since};
|
||||
use crate::documents::SourceType;
|
||||
use crate::search::{
|
||||
apply_filters, get_result_snippet, rank_rrf, search_fts, FtsQueryMode, PathFilter,
|
||||
SearchFilters,
|
||||
};
|
||||
use crate::Config;
|
||||
|
||||
/// Display-ready search result with all fields hydrated.
///
/// Serialized as-is for robot-mode JSON output; field order is the JSON key
/// order, so do not reorder fields casually.
#[derive(Debug, Serialize)]
pub struct SearchResultDisplay {
    /// Rowid of the document in the `documents` table.
    pub document_id: i64,
    /// Source kind as stored (e.g. "issue", "merge_request", "discussion").
    pub source_type: String,
    /// Document title.
    pub title: String,
    /// Web URL of the source item, when available.
    pub url: Option<String>,
    /// Author username, when available.
    pub author: Option<String>,
    /// Creation time as an ISO-8601 string (converted from the stored
    /// integer timestamp via `ms_to_iso`).
    pub created_at: Option<String>,
    /// Last-update time as an ISO-8601 string (same conversion).
    pub updated_at: Option<String>,
    /// Project path with namespace (from `projects.path_with_namespace`).
    pub project_path: String,
    /// Label names attached to the document.
    pub labels: Vec<String>,
    /// File paths associated with the document.
    pub paths: Vec<String>,
    /// Match snippet; may still contain `<mark>` highlight tags from FTS
    /// (stripped for terminal display, kept for JSON consumers).
    pub snippet: String,
    /// Normalized RRF score; 0.0 when the document was not ranked.
    pub score: f64,
    /// Ranking breakdown, populated only when --explain was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub explain: Option<ExplainData>,
}
|
||||
|
||||
/// Ranking explanation for --explain output.
#[derive(Debug, Serialize)]
pub struct ExplainData {
    /// Position in the vector (semantic) ranking; `None` when the document
    /// did not appear in that list (always `None` in lexical-only mode,
    /// where `rank_rrf` receives an empty vector list).
    pub vector_rank: Option<usize>,
    /// Position in the FTS (lexical) ranking, if the document matched FTS.
    pub fts_rank: Option<usize>,
    /// Combined reciprocal-rank-fusion score.
    pub rrf_score: f64,
}
|
||||
|
||||
/// Search response wrapper.
#[derive(Debug, Serialize)]
pub struct SearchResponse {
    /// The query string as supplied by the user.
    pub query: String,
    /// Search mode that produced these results (e.g. "lexical").
    pub mode: String,
    /// Number of entries in `results`.
    pub total_results: usize,
    /// Ranked, hydrated results in display order.
    pub results: Vec<SearchResultDisplay>,
    /// Non-fatal notices for the caller (e.g. "no documents indexed").
    pub warnings: Vec<String>,
}
|
||||
|
||||
/// Build SearchFilters from CLI args.
///
/// Raw, unvalidated values exactly as supplied on the command line; parsing
/// into typed `SearchFilters` happens inside `run_search`.
pub struct SearchCliFilters {
    /// --type: source type name, parsed via `SourceType::parse`.
    pub source_type: Option<String>,
    /// --author: author username to match.
    pub author: Option<String>,
    /// --project: project identifier, resolved via `resolve_project`.
    pub project: Option<String>,
    /// --label (repeatable): label names to filter on.
    pub labels: Vec<String>,
    /// --path: file path; a trailing '/' selects prefix matching.
    pub path: Option<String>,
    /// --after: creation-time lower bound, parsed via `parse_since`.
    pub after: Option<String>,
    /// --updated-after: update-time lower bound, parsed via `parse_since`.
    pub updated_after: Option<String>,
    /// Maximum number of results requested (clamped by `SearchFilters`).
    pub limit: usize,
}
|
||||
|
||||
/// Run a lexical search query.
|
||||
pub fn run_search(
|
||||
config: &Config,
|
||||
query: &str,
|
||||
cli_filters: SearchCliFilters,
|
||||
fts_mode: FtsQueryMode,
|
||||
explain: bool,
|
||||
) -> Result<SearchResponse> {
|
||||
let db_path = get_db_path(config.storage.db_path.as_deref());
|
||||
let conn = create_connection(&db_path)?;
|
||||
|
||||
// Check if any documents exist
|
||||
let doc_count: i64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM documents", [], |row| row.get(0))
|
||||
.unwrap_or(0);
|
||||
|
||||
if doc_count == 0 {
|
||||
return Ok(SearchResponse {
|
||||
query: query.to_string(),
|
||||
mode: "lexical".to_string(),
|
||||
total_results: 0,
|
||||
results: vec![],
|
||||
warnings: vec![
|
||||
"No documents indexed. Run 'lore generate-docs' first.".to_string()
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
// Build filters
|
||||
let source_type = cli_filters
|
||||
.source_type
|
||||
.as_deref()
|
||||
.and_then(SourceType::parse);
|
||||
|
||||
let project_id = cli_filters
|
||||
.project
|
||||
.as_deref()
|
||||
.map(|p| resolve_project(&conn, p))
|
||||
.transpose()?;
|
||||
|
||||
let after = cli_filters.after.as_deref().and_then(parse_since);
|
||||
let updated_after = cli_filters.updated_after.as_deref().and_then(parse_since);
|
||||
|
||||
let path = cli_filters.path.as_deref().map(|p| {
|
||||
if p.ends_with('/') {
|
||||
PathFilter::Prefix(p.to_string())
|
||||
} else {
|
||||
PathFilter::Exact(p.to_string())
|
||||
}
|
||||
});
|
||||
|
||||
let filters = SearchFilters {
|
||||
source_type,
|
||||
author: cli_filters.author,
|
||||
project_id,
|
||||
after,
|
||||
updated_after,
|
||||
labels: cli_filters.labels,
|
||||
path,
|
||||
limit: cli_filters.limit,
|
||||
};
|
||||
|
||||
// Adaptive recall: wider initial fetch when filters applied
|
||||
let requested = filters.clamp_limit();
|
||||
let top_k = if filters.has_any_filter() {
|
||||
(requested * 50).max(200).min(1500)
|
||||
} else {
|
||||
(requested * 10).max(50).min(1500)
|
||||
};
|
||||
|
||||
// FTS search
|
||||
let fts_results = search_fts(&conn, query, top_k, fts_mode)?;
|
||||
let fts_tuples: Vec<(i64, f64)> = fts_results
|
||||
.iter()
|
||||
.map(|r| (r.document_id, r.bm25_score))
|
||||
.collect();
|
||||
|
||||
// Build snippet map before ranking
|
||||
let snippet_map: std::collections::HashMap<i64, String> = fts_results
|
||||
.iter()
|
||||
.map(|r| (r.document_id, r.snippet.clone()))
|
||||
.collect();
|
||||
|
||||
// RRF ranking (single-list for lexical mode)
|
||||
let ranked = rank_rrf(&[], &fts_tuples);
|
||||
let ranked_ids: Vec<i64> = ranked.iter().map(|r| r.document_id).collect();
|
||||
|
||||
// Apply post-retrieval filters
|
||||
let filtered_ids = apply_filters(&conn, &ranked_ids, &filters)?;
|
||||
|
||||
if filtered_ids.is_empty() {
|
||||
return Ok(SearchResponse {
|
||||
query: query.to_string(),
|
||||
mode: "lexical".to_string(),
|
||||
total_results: 0,
|
||||
results: vec![],
|
||||
warnings: vec![],
|
||||
});
|
||||
}
|
||||
|
||||
// Hydrate results in single round-trip
|
||||
let hydrated = hydrate_results(&conn, &filtered_ids)?;
|
||||
|
||||
// Build display results preserving filter order
|
||||
let rrf_map: std::collections::HashMap<i64, &crate::search::RrfResult> = ranked
|
||||
.iter()
|
||||
.map(|r| (r.document_id, r))
|
||||
.collect();
|
||||
|
||||
let mut results: Vec<SearchResultDisplay> = Vec::with_capacity(hydrated.len());
|
||||
for row in &hydrated {
|
||||
let rrf = rrf_map.get(&row.document_id);
|
||||
let fts_snippet = snippet_map.get(&row.document_id).map(|s| s.as_str());
|
||||
let snippet = get_result_snippet(fts_snippet, &row.content_text);
|
||||
|
||||
let explain_data = if explain {
|
||||
rrf.map(|r| ExplainData {
|
||||
vector_rank: r.vector_rank,
|
||||
fts_rank: r.fts_rank,
|
||||
rrf_score: r.rrf_score,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
results.push(SearchResultDisplay {
|
||||
document_id: row.document_id,
|
||||
source_type: row.source_type.clone(),
|
||||
title: row.title.clone(),
|
||||
url: row.url.clone(),
|
||||
author: row.author.clone(),
|
||||
created_at: row.created_at.map(ms_to_iso),
|
||||
updated_at: row.updated_at.map(ms_to_iso),
|
||||
project_path: row.project_path.clone(),
|
||||
labels: row.labels.clone(),
|
||||
paths: row.paths.clone(),
|
||||
snippet,
|
||||
score: rrf.map(|r| r.normalized_score).unwrap_or(0.0),
|
||||
explain: explain_data,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(SearchResponse {
|
||||
query: query.to_string(),
|
||||
mode: "lexical".to_string(),
|
||||
total_results: results.len(),
|
||||
results,
|
||||
warnings: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
/// Raw row from hydration query.
///
/// Field order mirrors the SELECT column order in `hydrate_results`.
/// Timestamps are kept as raw integers here (converted via `ms_to_iso`
/// when building display results).
struct HydratedRow {
    /// documents.id
    document_id: i64,
    /// documents.source_type
    source_type: String,
    /// documents.title
    title: String,
    /// documents.url
    url: Option<String>,
    /// documents.author_username
    author: Option<String>,
    /// documents.created_at (raw integer timestamp)
    created_at: Option<i64>,
    /// documents.updated_at (raw integer timestamp)
    updated_at: Option<i64>,
    /// documents.content_text, used for snippet fallback.
    content_text: String,
    /// projects.path_with_namespace
    project_path: String,
    /// Decoded from the labels_json aggregate column.
    labels: Vec<String>,
    /// Decoded from the paths_json aggregate column.
    paths: Vec<String>,
}
|
||||
|
||||
/// Hydrate document IDs into full display rows in a single query.
///
/// Uses json_each() to pass ranked IDs and preserve ordering via ORDER BY j.key.
/// Labels and paths fetched via correlated json_group_array subqueries.
///
/// # Errors
/// Returns an error if ID serialization or any SQLite operation fails.
fn hydrate_results(
    conn: &rusqlite::Connection,
    document_ids: &[i64],
) -> Result<Vec<HydratedRow>> {
    // Nothing to hydrate; avoid preparing the statement at all.
    if document_ids.is_empty() {
        return Ok(Vec::new());
    }

    // IDs are passed as one JSON array parameter rather than N placeholders,
    // keeping this a single statement regardless of result count.
    let ids_json = serde_json::to_string(document_ids)
        .map_err(|e| LoreError::Other(e.to_string()))?;

    // NOTE: the SELECT column order is load-bearing — it must stay in sync
    // with the row.get(0..=10) indices in the mapping closure below.
    // json_each key (j.key) is the array index, so ORDER BY j.key returns
    // rows in the same order as the ranked `document_ids` input.
    let sql = r#"
        SELECT d.id, d.source_type, d.title, d.url, d.author_username,
               d.created_at, d.updated_at, d.content_text,
               p.path_with_namespace AS project_path,
               (SELECT json_group_array(dl.label_name)
                FROM document_labels dl WHERE dl.document_id = d.id) AS labels_json,
               (SELECT json_group_array(dp.path)
                FROM document_paths dp WHERE dp.document_id = d.id) AS paths_json
        FROM json_each(?1) AS j
        JOIN documents d ON d.id = j.value
        JOIN projects p ON p.id = d.project_id
        ORDER BY j.key
    "#;

    let mut stmt = conn.prepare(sql)?;
    let rows = stmt
        .query_map([ids_json], |row| {
            // Columns 9 and 10 are the JSON-encoded aggregate arrays;
            // json_group_array always yields a (possibly empty) JSON array.
            let labels_json: String = row.get(9)?;
            let paths_json: String = row.get(10)?;

            Ok(HydratedRow {
                document_id: row.get(0)?,
                source_type: row.get(1)?,
                title: row.get(2)?,
                url: row.get(3)?,
                author: row.get(4)?,
                created_at: row.get(5)?,
                updated_at: row.get(6)?,
                content_text: row.get(7)?,
                project_path: row.get(8)?,
                labels: parse_json_array(&labels_json),
                paths: parse_json_array(&paths_json),
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(rows)
}
|
||||
|
||||
/// Parse a JSON array string into a Vec<String>, filtering out null/empty.
|
||||
fn parse_json_array(json: &str) -> Vec<String> {
|
||||
serde_json::from_str::<Vec<serde_json::Value>>(json)
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|v| v.as_str().map(|s| s.to_string()))
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Print human-readable search results.
|
||||
pub fn print_search_results(response: &SearchResponse) {
|
||||
if !response.warnings.is_empty() {
|
||||
for w in &response.warnings {
|
||||
eprintln!("{} {}", style("Warning:").yellow(), w);
|
||||
}
|
||||
}
|
||||
|
||||
if response.results.is_empty() {
|
||||
println!(
|
||||
"No results found for '{}'",
|
||||
style(&response.query).bold()
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
println!(
|
||||
"{} results for '{}' ({})",
|
||||
response.total_results,
|
||||
style(&response.query).bold(),
|
||||
response.mode
|
||||
);
|
||||
println!();
|
||||
|
||||
for (i, result) in response.results.iter().enumerate() {
|
||||
let type_prefix = match result.source_type.as_str() {
|
||||
"issue" => "Issue",
|
||||
"merge_request" => "MR",
|
||||
"discussion" => "Discussion",
|
||||
_ => &result.source_type,
|
||||
};
|
||||
|
||||
println!(
|
||||
"[{}] {} - {} (score: {:.2})",
|
||||
i + 1,
|
||||
style(type_prefix).cyan(),
|
||||
result.title,
|
||||
result.score
|
||||
);
|
||||
|
||||
if let Some(ref url) = result.url {
|
||||
println!(" {}", style(url).dim());
|
||||
}
|
||||
|
||||
println!(
|
||||
" {} | {}",
|
||||
style(&result.project_path).dim(),
|
||||
result
|
||||
.author
|
||||
.as_deref()
|
||||
.map(|a| format!("@{}", a))
|
||||
.unwrap_or_default()
|
||||
);
|
||||
|
||||
if !result.labels.is_empty() {
|
||||
println!(
|
||||
" Labels: {}",
|
||||
result.labels.join(", ")
|
||||
);
|
||||
}
|
||||
|
||||
// Strip HTML tags from snippet for terminal display
|
||||
let clean_snippet = result
|
||||
.snippet
|
||||
.replace("<mark>", "")
|
||||
.replace("</mark>", "");
|
||||
println!(" {}", style(clean_snippet).dim());
|
||||
|
||||
if let Some(ref explain) = result.explain {
|
||||
println!(
|
||||
" {} fts_rank={} rrf_score={:.6}",
|
||||
style("[explain]").magenta(),
|
||||
explain
|
||||
.fts_rank
|
||||
.map(|r| r.to_string())
|
||||
.unwrap_or_else(|| "-".into()),
|
||||
explain.rrf_score
|
||||
);
|
||||
}
|
||||
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
/// JSON output structures.
///
/// Robot-mode envelope: `{"ok": ..., "data": ..., "meta": ...}`.
#[derive(Serialize)]
struct SearchJsonOutput<'a> {
    /// Set to `true` unconditionally by `print_search_results_json`.
    ok: bool,
    /// Borrowed response payload; avoids cloning just to serialize.
    data: &'a SearchResponse,
    /// Timing metadata for the request.
    meta: SearchMeta,
}
|
||||
|
||||
/// Timing metadata for robot-mode JSON output.
#[derive(Serialize)]
struct SearchMeta {
    /// Wall-clock duration of the search, in milliseconds.
    elapsed_ms: u64,
}
|
||||
|
||||
/// Print JSON robot-mode output.
|
||||
pub fn print_search_results_json(response: &SearchResponse, elapsed_ms: u64) {
|
||||
let output = SearchJsonOutput {
|
||||
ok: true,
|
||||
data: response,
|
||||
meta: SearchMeta { elapsed_ms },
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
Reference in New Issue
Block a user