feat(cli): Add search, stats, embed, generate-docs, sync, health, and robot-docs commands

Extends the CLI with seven new commands that complete the search pipeline:

- lore search <QUERY>: Hybrid search with mode selection (lexical,
  hybrid, semantic), rich filtering (--type, --author, --project,
  --label, --path, --after, --updated-after), result limits, and
  optional explain mode showing RRF score breakdowns. Safe FTS mode
  sanitizes user input; raw mode passes through for power users.

- lore stats: Document and index statistics with optional --check
  for integrity verification and --repair to fix inconsistencies
  (orphaned documents, missing FTS entries, stale dirty queue items).

- lore embed: Generate vector embeddings via Ollama. Supports
  --retry-failed to re-attempt previously failed embeddings.

- lore generate-docs: Drain the dirty queue to regenerate documents.
  --full seeds all entities for complete rebuild. --project scopes
  to a single project.

- lore sync: Full pipeline orchestration (ingest issues + MRs,
  generate-docs, embed) with --no-embed and --no-docs flags for
  partial runs. Reports per-stage results and total elapsed time.

- lore health: Quick pre-flight check (config exists, DB exists,
  schema current). Returns exit code 1 if unhealthy. Designed for
  agent pre-flight scripts.

- lore robot-docs: Machine-readable command manifest for agent
  self-discovery. Returns all commands, flags, examples, exit codes,
  and recommended workflows as structured JSON.

Also enhances lore init with --gitlab-url, --token-env-var, and
--projects flags for fully non-interactive robot-mode initialization.
Fixes init's force/non-interactive precedence logic and adds JSON
output for robot mode.

Updates all command files for the GiError -> LoreError rename.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:47:10 -05:00
parent 559f0702ad
commit daf5a73019
13 changed files with 1930 additions and 95 deletions

View File

@@ -1,7 +1,7 @@
//! Auth test command - verify GitLab authentication.
use crate::core::config::Config;
use crate::core::error::{GiError, Result};
use crate::core::error::{LoreError, Result};
use crate::gitlab::GitLabClient;
/// Result of successful auth test.
@@ -19,12 +19,12 @@ pub async fn run_auth_test(config_path: Option<&str>) -> Result<AuthTestResult>
// 2. Get token from environment
let token = std::env::var(&config.gitlab.token_env_var)
.map(|t| t.trim().to_string())
.map_err(|_| GiError::TokenNotSet {
.map_err(|_| LoreError::TokenNotSet {
env_var: config.gitlab.token_env_var.clone(),
})?;
if token.is_empty() {
return Err(GiError::TokenNotSet {
return Err(LoreError::TokenNotSet {
env_var: config.gitlab.token_env_var.clone(),
});
}

View File

@@ -5,7 +5,7 @@ use serde::Serialize;
use crate::core::config::Config;
use crate::core::db::{create_connection, get_schema_version, verify_pragmas};
use crate::core::error::GiError;
use crate::core::error::LoreError;
use crate::core::paths::{get_config_path, get_db_path};
use crate::gitlab::GitLabClient;
@@ -137,7 +137,7 @@ fn check_config(config_path: &str) -> (ConfigCheck, Option<Config>) {
},
Some(config),
),
Err(GiError::ConfigNotFound { path }) => (
Err(LoreError::ConfigNotFound { path }) => (
ConfigCheck {
result: CheckResult {
status: CheckStatus::Error,
@@ -264,7 +264,7 @@ async fn check_gitlab(config: Option<&Config>) -> GitLabCheck {
url: Some(config.gitlab.base_url.clone()),
username: Some(user.username),
},
Err(GiError::GitLabAuthFailed) => GitLabCheck {
Err(LoreError::GitLabAuthFailed) => GitLabCheck {
result: CheckResult {
status: CheckStatus::Error,
message: Some("Authentication failed. Check your token.".to_string()),

88
src/cli/commands/embed.rs Normal file
View File

@@ -0,0 +1,88 @@
//! Embed command: generate vector embeddings for documents via Ollama.
use console::style;
use serde::Serialize;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::paths::get_db_path;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::embedding::pipeline::embed_documents;
use crate::Config;
/// Result of the embed command.
///
/// Counters are copied directly from the `embed_documents` pipeline result.
#[derive(Debug, Default, Serialize)]
pub struct EmbedCommandResult {
    /// Documents successfully embedded in this run.
    pub embedded: usize,
    /// Documents whose embedding attempt failed.
    pub failed: usize,
    /// Documents the pipeline skipped (criteria determined by `embed_documents`).
    pub skipped: usize,
}
/// Run the embed command.
///
/// Opens the configured database, verifies Ollama is reachable, optionally
/// resets previously failed embedding rows, then embeds all pending
/// documents.
///
/// # Errors
/// Fails when the database cannot be opened, the Ollama health check fails,
/// or the embedding pipeline itself errors.
pub async fn run_embed(
    config: &Config,
    retry_failed: bool,
) -> Result<EmbedCommandResult> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

    // Build Ollama config from user settings.
    let ollama_config = OllamaConfig {
        base_url: config.embedding.base_url.clone(),
        model: config.embedding.model.clone(),
        ..OllamaConfig::default()
    };
    let client = OllamaClient::new(ollama_config);

    // Health check — fail fast if Ollama is down or the model is missing.
    client.health_check().await?;

    // On --retry-failed, clear recorded errors so those rows become pending again.
    if retry_failed {
        conn.execute(
            "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
             WHERE last_error IS NOT NULL",
            [],
        )?;
    }

    let outcome = embed_documents(&conn, &client, &config.embedding.model, None).await?;

    Ok(EmbedCommandResult {
        embedded: outcome.embedded,
        failed: outcome.failed,
        skipped: outcome.skipped,
    })
}
/// Print human-readable output.
pub fn print_embed(result: &EmbedCommandResult) {
    let done = style("done").green().bold();
    println!("{} Embedding complete", done);
    println!(" Embedded: {}", result.embedded);
    if result.failed > 0 {
        println!(" Failed: {}", style(result.failed).red());
    }
    if result.skipped > 0 {
        println!(" Skipped: {}", result.skipped);
    }
}
/// JSON output envelope for robot mode.
#[derive(Serialize)]
struct EmbedJsonOutput<'a> {
    // Always true here: this envelope is only constructed on success.
    ok: bool,
    // Borrowed counters from the completed embed run.
    data: &'a EmbedCommandResult,
}
/// Print JSON robot-mode output.
pub fn print_embed_json(result: &EmbedCommandResult) {
    let payload = EmbedJsonOutput {
        ok: true,
        data: result,
    };
    println!("{}", serde_json::to_string(&payload).unwrap());
}

View File

@@ -0,0 +1,205 @@
//! Generate searchable documents from ingested GitLab data.
use console::style;
use rusqlite::Connection;
use serde::Serialize;
use tracing::info;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::paths::get_db_path;
use crate::documents::{regenerate_dirty_documents, SourceType};
use crate::Config;
/// Number of source rows seeded per batch in full mode (keyset-pagination
/// window size used by `seed_dirty`).
const FULL_MODE_CHUNK_SIZE: i64 = 2000;

/// Result of a generate-docs run.
#[derive(Debug, Default)]
pub struct GenerateDocsResult {
    /// Dirty sources whose documents were regenerated.
    pub regenerated: usize,
    /// Dirty sources reported unchanged by `regenerate_dirty_documents`.
    pub unchanged: usize,
    /// Dirty sources that errored during regeneration.
    pub errored: usize,
    /// Entities queued into dirty_sources (populated in full mode only).
    pub seeded: usize,
    /// Whether this run seeded all entities (--full) before draining.
    pub full_mode: bool,
}
/// Run the generate-docs pipeline.
///
/// Default mode: process only existing dirty_sources entries.
/// Full mode: seed dirty_sources with ALL entities, then drain.
pub fn run_generate_docs(
config: &Config,
full: bool,
project_filter: Option<&str>,
) -> Result<GenerateDocsResult> {
let db_path = get_db_path(config.storage.db_path.as_deref());
let conn = create_connection(&db_path)?;
let mut result = GenerateDocsResult {
full_mode: full,
..Default::default()
};
if full {
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
}
let regen = regenerate_dirty_documents(&conn)?;
result.regenerated = regen.regenerated;
result.unchanged = regen.unchanged;
result.errored = regen.errored;
if full {
// Optimize FTS index after bulk rebuild
let _ = conn.execute(
"INSERT INTO documents_fts(documents_fts) VALUES('optimize')",
[],
);
info!("FTS index optimized after full rebuild");
}
Ok(result)
}
/// Seed dirty_sources with all entities of the given type using keyset pagination.
///
/// The source table is walked in `FULL_MODE_CHUNK_SIZE` windows ordered by id.
/// For each window the upper bound is computed FIRST (with the same WHERE
/// clause the INSERT uses), then everything in `(last_id, chunk_max]` is
/// queued. This fixes two bugs in the previous version:
///
/// 1. Terminating when `execute` reported 0 inserts: with ON CONFLICT DO
///    NOTHING, a window whose rows are all already queued reports 0 and the
///    scan stopped early, leaving later ids unseeded. The scan now ends only
///    when the window itself is empty (MAX(id) is NULL).
/// 2. With a project filter, the INSERT selected project rows while the
///    cursor advanced over unfiltered rows, desynchronizing the windows.
///    Both queries now share the same predicate.
///
/// Returns the number of rows actually inserted (conflicts excluded).
fn seed_dirty(
    conn: &Connection,
    source_type: SourceType,
    project_filter: Option<&str>,
) -> Result<usize> {
    let table = match source_type {
        SourceType::Issue => "issues",
        SourceType::MergeRequest => "merge_requests",
        SourceType::Discussion => "discussions",
    };
    let type_str = source_type.as_str();
    let now = chrono::Utc::now().timestamp_millis();

    // Resolve the project filter to an id once, outside the loop. An unknown
    // project means there is nothing to seed.
    let project_id: Option<i64> = match project_filter {
        Some(project) => {
            let pid: Option<i64> = conn
                .query_row(
                    "SELECT id FROM projects WHERE path_with_namespace = ?1 COLLATE NOCASE",
                    [project],
                    |row| row.get(0),
                )
                .ok();
            match pid {
                Some(id) => Some(id),
                None => return Ok(0),
            }
        }
        None => None,
    };

    let mut total_seeded: usize = 0;
    let mut last_id: i64 = 0;
    loop {
        // Upper bound of the next chunk, using the same predicate as the
        // INSERT below. MAX(id) over an empty window is NULL -> scan is done.
        let max_id: Option<i64> = if let Some(pid) = project_id {
            conn.query_row(
                &format!(
                    "SELECT MAX(id) FROM (SELECT id FROM {table}
                     WHERE id > ?1 AND project_id = ?2 ORDER BY id LIMIT ?3)"
                ),
                rusqlite::params![last_id, pid, FULL_MODE_CHUNK_SIZE],
                |row| row.get(0),
            )?
        } else {
            conn.query_row(
                &format!(
                    "SELECT MAX(id) FROM (SELECT id FROM {table}
                     WHERE id > ?1 ORDER BY id LIMIT ?2)"
                ),
                rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
                |row| row.get(0),
            )?
        };
        let Some(chunk_max) = max_id else {
            break;
        };

        // Queue everything in (last_id, chunk_max]; rows already queued are
        // left untouched by ON CONFLICT DO NOTHING.
        let inserted = if let Some(pid) = project_id {
            conn.execute(
                &format!(
                    "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                     SELECT ?1, id, ?2, 0, NULL, NULL, NULL
                     FROM {table} WHERE id > ?3 AND id <= ?4 AND project_id = ?5
                     ON CONFLICT(source_type, source_id) DO NOTHING"
                ),
                rusqlite::params![type_str, now, last_id, chunk_max, pid],
            )?
        } else {
            conn.execute(
                &format!(
                    "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                     SELECT ?1, id, ?2, 0, NULL, NULL, NULL
                     FROM {table} WHERE id > ?3 AND id <= ?4
                     ON CONFLICT(source_type, source_id) DO NOTHING"
                ),
                rusqlite::params![type_str, now, last_id, chunk_max],
            )?
        };

        total_seeded += inserted;
        last_id = chunk_max;
    }

    info!(
        source_type = type_str,
        seeded = total_seeded,
        "Seeded dirty_sources"
    );
    Ok(total_seeded)
}
/// Print human-readable output.
pub fn print_generate_docs(result: &GenerateDocsResult) {
    let mode_label = if result.full_mode { "full" } else { "incremental" };
    let done = style("done").green().bold();
    println!("{} Document generation complete ({})", done, mode_label);
    if result.full_mode {
        println!(" Seeded: {}", result.seeded);
    }
    println!(" Regenerated: {}", result.regenerated);
    println!(" Unchanged: {}", result.unchanged);
    if result.errored > 0 {
        println!(" Errored: {}", style(result.errored).red());
    }
}
/// JSON output structures for robot mode.
#[derive(Serialize)]
struct GenerateDocsJsonOutput {
    // Always true here: this envelope is only constructed on success.
    ok: bool,
    data: GenerateDocsJsonData,
}

#[derive(Serialize)]
struct GenerateDocsJsonData {
    // "full" or "incremental".
    mode: String,
    // Present only for full-mode runs; omitted from JSON otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    seeded: Option<usize>,
    regenerated: usize,
    unchanged: usize,
    errored: usize,
}
/// Print JSON robot-mode output.
pub fn print_generate_docs_json(result: &GenerateDocsResult) {
    // "seeded" is only meaningful for full-mode runs; elide it otherwise.
    let mode = if result.full_mode { "full" } else { "incremental" };
    let seeded = result.full_mode.then_some(result.seeded);
    let output = GenerateDocsJsonOutput {
        ok: true,
        data: GenerateDocsJsonData {
            mode: mode.to_string(),
            seeded,
            regenerated: result.regenerated,
            unchanged: result.unchanged,
            errored: result.errored,
        },
    };
    println!("{}", serde_json::to_string(&output).unwrap());
}

View File

@@ -7,7 +7,7 @@ use serde::Serialize;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::{GiError, Result};
use crate::core::error::{LoreError, Result};
use crate::core::lock::{AppLock, LockOptions};
use crate::core::paths::get_db_path;
use crate::gitlab::GitLabClient;
@@ -51,7 +51,7 @@ pub async fn run_ingest(
) -> Result<IngestResult> {
// Validate resource type early
if resource_type != "issues" && resource_type != "mrs" {
return Err(GiError::Other(format!(
return Err(LoreError::Other(format!(
"Invalid resource type '{}'. Valid types: issues, mrs",
resource_type
)));
@@ -74,7 +74,7 @@ pub async fn run_ingest(
lock.acquire(force)?;
// Get token from environment
let token = std::env::var(&config.gitlab.token_env_var).map_err(|_| GiError::TokenNotSet {
let token = std::env::var(&config.gitlab.token_env_var).map_err(|_| LoreError::TokenNotSet {
env_var: config.gitlab.token_env_var.clone(),
})?;
@@ -119,12 +119,12 @@ pub async fn run_ingest(
if projects.is_empty() {
if let Some(filter) = project_filter {
return Err(GiError::Other(format!(
return Err(LoreError::Other(format!(
"Project '{}' not found in configuration",
filter
)));
}
return Err(GiError::Other(
return Err(LoreError::Other(
"No projects configured. Run 'lore init' first.".to_string(),
));
}

View File

@@ -4,7 +4,7 @@ use std::fs;
use crate::core::config::{MinimalConfig, MinimalGitLabConfig, ProjectConfig};
use crate::core::db::{create_connection, run_migrations};
use crate::core::error::{GiError, Result};
use crate::core::error::{LoreError, Result};
use crate::core::paths::{get_config_path, get_data_dir};
use crate::gitlab::{GitLabClient, GitLabProject};
@@ -45,32 +45,30 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result<InitRe
let config_path = get_config_path(options.config_path.as_deref());
let data_dir = get_data_dir();
// 1. Check if config exists
if config_path.exists() {
// 1. Check if config exists (force takes precedence over non_interactive)
if config_path.exists() && !options.force {
if options.non_interactive {
return Err(GiError::Other(format!(
"Config file exists at {}. Cannot proceed in non-interactive mode.",
return Err(LoreError::Other(format!(
"Config file exists at {}. Use --force to overwrite.",
config_path.display()
)));
}
if !options.force {
return Err(GiError::Other(
"User cancelled config overwrite.".to_string(),
));
}
return Err(LoreError::Other(
"User cancelled config overwrite.".to_string(),
));
}
// 2. Validate GitLab URL format
if url::Url::parse(&inputs.gitlab_url).is_err() {
return Err(GiError::Other(format!(
return Err(LoreError::Other(format!(
"Invalid GitLab URL: {}",
inputs.gitlab_url
)));
}
// 3. Check token is set in environment
let token = std::env::var(&inputs.token_env_var).map_err(|_| GiError::TokenNotSet {
let token = std::env::var(&inputs.token_env_var).map_err(|_| LoreError::TokenNotSet {
env_var: inputs.token_env_var.clone(),
})?;
@@ -78,8 +76,8 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result<InitRe
let client = GitLabClient::new(&inputs.gitlab_url, &token, None);
let gitlab_user = client.get_current_user().await.map_err(|e| {
if matches!(e, GiError::GitLabAuthFailed) {
GiError::Other(format!("Authentication failed for {}", inputs.gitlab_url))
if matches!(e, LoreError::GitLabAuthFailed) {
LoreError::Other(format!("Authentication failed for {}", inputs.gitlab_url))
} else {
e
}
@@ -95,8 +93,8 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result<InitRe
for project_path in &inputs.project_paths {
let project = client.get_project(project_path).await.map_err(|e| {
if matches!(e, GiError::GitLabNotFound { .. }) {
GiError::Other(format!("Project not found: {project_path}"))
if matches!(e, LoreError::GitLabNotFound { .. }) {
LoreError::Other(format!("Project not found: {project_path}"))
} else {
e
}

View File

@@ -3,21 +3,33 @@
pub mod auth_test;
pub mod count;
pub mod doctor;
pub mod embed;
pub mod generate_docs;
pub mod ingest;
pub mod init;
pub mod list;
pub mod search;
pub mod show;
pub mod stats;
pub mod sync;
pub mod sync_status;
pub use auth_test::run_auth_test;
pub use count::{print_count, print_count_json, run_count};
pub use doctor::{print_doctor_results, run_doctor};
pub use embed::{print_embed, print_embed_json, run_embed};
pub use generate_docs::{print_generate_docs, print_generate_docs_json, run_generate_docs};
pub use stats::{print_stats, print_stats_json, run_stats};
pub use search::{
print_search_results, print_search_results_json, run_search, SearchCliFilters, SearchResponse,
};
pub use ingest::{print_ingest_summary, print_ingest_summary_json, run_ingest};
pub use init::{InitInputs, InitOptions, InitResult, run_init};
pub use list::{
ListFilters, MrListFilters, open_issue_in_browser, open_mr_in_browser, print_list_issues,
print_list_issues_json, print_list_mrs, print_list_mrs_json, run_list_issues, run_list_mrs,
};
pub use sync::{print_sync, print_sync_json, run_sync, SyncOptions, SyncResult};
pub use show::{
print_show_issue, print_show_issue_json, print_show_mr, print_show_mr_json, run_show_issue,
run_show_mr,

402
src/cli/commands/search.rs Normal file
View File

@@ -0,0 +1,402 @@
//! Search command: lexical (FTS5) search with filter support and single-query hydration.
use console::style;
use serde::Serialize;
use crate::core::db::create_connection;
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::core::time::{ms_to_iso, parse_since};
use crate::documents::SourceType;
use crate::search::{
apply_filters, get_result_snippet, rank_rrf, search_fts, FtsQueryMode, PathFilter,
SearchFilters,
};
use crate::Config;
/// Display-ready search result with all fields hydrated.
#[derive(Debug, Serialize)]
pub struct SearchResultDisplay {
    pub document_id: i64,
    // Raw source_type string from the documents table
    // (e.g. "issue", "merge_request", "discussion").
    pub source_type: String,
    pub title: String,
    pub url: Option<String>,
    // Author username, without the leading '@'.
    pub author: Option<String>,
    // ISO-8601 timestamps (converted from epoch-millis via ms_to_iso).
    pub created_at: Option<String>,
    pub updated_at: Option<String>,
    pub project_path: String,
    pub labels: Vec<String>,
    pub paths: Vec<String>,
    // Snippet text; may contain <mark> highlight tags from FTS.
    pub snippet: String,
    // Normalized RRF score (0.0 when the document was not ranked).
    pub score: f64,
    // Populated only when --explain is requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub explain: Option<ExplainData>,
}

/// Ranking explanation for --explain output.
#[derive(Debug, Serialize)]
pub struct ExplainData {
    // Rank in the vector result list (always None in lexical-only mode).
    pub vector_rank: Option<usize>,
    // Rank in the FTS result list, if the document appeared there.
    pub fts_rank: Option<usize>,
    // Raw (un-normalized) RRF score.
    pub rrf_score: f64,
}

/// Search response wrapper.
#[derive(Debug, Serialize)]
pub struct SearchResponse {
    pub query: String,
    // Search mode used ("lexical" for this code path).
    pub mode: String,
    pub total_results: usize,
    pub results: Vec<SearchResultDisplay>,
    // Non-fatal notices, e.g. "no documents indexed".
    pub warnings: Vec<String>,
}

/// Build SearchFilters from CLI args.
pub struct SearchCliFilters {
    pub source_type: Option<String>,
    pub author: Option<String>,
    // Project path; resolved to an id via resolve_project.
    pub project: Option<String>,
    pub labels: Vec<String>,
    // A trailing '/' selects everything under the directory (prefix match);
    // otherwise the path must match exactly.
    pub path: Option<String>,
    // Human-friendly time expressions, parsed by parse_since.
    pub after: Option<String>,
    pub updated_after: Option<String>,
    // Requested result count; clamped by SearchFilters::clamp_limit.
    pub limit: usize,
}
/// Run a lexical search query.
///
/// Pipeline: FTS5 retrieval over a wide candidate pool -> RRF ranking
/// (single-list in lexical mode) -> post-retrieval filtering -> single-query
/// hydration of the surviving ids, preserving rank order.
///
/// # Errors
/// Fails when the database cannot be opened, the project filter cannot be
/// resolved, or any underlying query fails.
pub fn run_search(
    config: &Config,
    query: &str,
    cli_filters: SearchCliFilters,
    fts_mode: FtsQueryMode,
    explain: bool,
) -> Result<SearchResponse> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

    // Bail out early (with a hint) if nothing has been indexed yet.
    let doc_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM documents", [], |row| row.get(0))
        .unwrap_or(0);
    if doc_count == 0 {
        return Ok(empty_response(
            query,
            vec!["No documents indexed. Run 'lore generate-docs' first.".to_string()],
        ));
    }

    // Build filters from CLI arguments.
    let source_type = cli_filters
        .source_type
        .as_deref()
        .and_then(SourceType::parse);
    let project_id = cli_filters
        .project
        .as_deref()
        .map(|p| resolve_project(&conn, p))
        .transpose()?;
    let after = cli_filters.after.as_deref().and_then(parse_since);
    let updated_after = cli_filters.updated_after.as_deref().and_then(parse_since);
    // Trailing '/' means "everything under this directory"; otherwise exact.
    let path = cli_filters.path.as_deref().map(|p| {
        if p.ends_with('/') {
            PathFilter::Prefix(p.to_string())
        } else {
            PathFilter::Exact(p.to_string())
        }
    });
    let filters = SearchFilters {
        source_type,
        author: cli_filters.author,
        project_id,
        after,
        updated_after,
        labels: cli_filters.labels,
        path,
        limit: cli_filters.limit,
    };

    // Adaptive recall: post-retrieval filtering can discard many candidates,
    // so fetch a much wider pool when any filter is active.
    let requested = filters.clamp_limit();
    let top_k = if filters.has_any_filter() {
        (requested * 50).clamp(200, 1500)
    } else {
        (requested * 10).clamp(50, 1500)
    };

    // FTS search.
    let fts_results = search_fts(&conn, query, top_k, fts_mode)?;
    let fts_tuples: Vec<(i64, f64)> = fts_results
        .iter()
        .map(|r| (r.document_id, r.bm25_score))
        .collect();
    // Keep the FTS snippets around; ranking returns only ids and scores.
    let snippet_map: std::collections::HashMap<i64, String> = fts_results
        .iter()
        .map(|r| (r.document_id, r.snippet.clone()))
        .collect();

    // RRF ranking (single-list for lexical mode).
    let ranked = rank_rrf(&[], &fts_tuples);
    let ranked_ids: Vec<i64> = ranked.iter().map(|r| r.document_id).collect();

    // Apply post-retrieval filters.
    let filtered_ids = apply_filters(&conn, &ranked_ids, &filters)?;
    if filtered_ids.is_empty() {
        return Ok(empty_response(query, vec![]));
    }

    // Hydrate results in a single round-trip.
    let hydrated = hydrate_results(&conn, &filtered_ids)?;

    // Build display results preserving filter order.
    let rrf_map: std::collections::HashMap<i64, &crate::search::RrfResult> =
        ranked.iter().map(|r| (r.document_id, r)).collect();
    let mut results: Vec<SearchResultDisplay> = Vec::with_capacity(hydrated.len());
    for row in &hydrated {
        let rrf = rrf_map.get(&row.document_id);
        let fts_snippet = snippet_map.get(&row.document_id).map(|s| s.as_str());
        let snippet = get_result_snippet(fts_snippet, &row.content_text);
        let explain_data = if explain {
            rrf.map(|r| ExplainData {
                vector_rank: r.vector_rank,
                fts_rank: r.fts_rank,
                rrf_score: r.rrf_score,
            })
        } else {
            None
        };
        results.push(SearchResultDisplay {
            document_id: row.document_id,
            source_type: row.source_type.clone(),
            title: row.title.clone(),
            url: row.url.clone(),
            author: row.author.clone(),
            created_at: row.created_at.map(ms_to_iso),
            updated_at: row.updated_at.map(ms_to_iso),
            project_path: row.project_path.clone(),
            labels: row.labels.clone(),
            paths: row.paths.clone(),
            snippet,
            score: rrf.map(|r| r.normalized_score).unwrap_or(0.0),
            explain: explain_data,
        });
    }

    Ok(SearchResponse {
        query: query.to_string(),
        mode: "lexical".to_string(),
        total_results: results.len(),
        results,
        warnings: vec![],
    })
}

/// Build an empty lexical-mode response for `query`, carrying `warnings`.
fn empty_response(query: &str, warnings: Vec<String>) -> SearchResponse {
    SearchResponse {
        query: query.to_string(),
        mode: "lexical".to_string(),
        total_results: 0,
        results: vec![],
        warnings,
    }
}
/// Raw row from hydration query.
struct HydratedRow {
    document_id: i64,
    source_type: String,
    title: String,
    url: Option<String>,
    // author_username column.
    author: Option<String>,
    // Epoch milliseconds; converted to ISO strings at display time.
    created_at: Option<i64>,
    updated_at: Option<i64>,
    // Full document text, used as snippet fallback.
    content_text: String,
    project_path: String,
    // Decoded from json_group_array subquery results.
    labels: Vec<String>,
    paths: Vec<String>,
}
/// Hydrate document IDs into full display rows in a single query.
///
/// Uses json_each() to pass ranked IDs and preserve ordering via ORDER BY j.key.
/// Labels and paths fetched via correlated json_group_array subqueries.
///
/// # Errors
/// Returns `LoreError::Other` if the id list cannot be serialized, or a
/// database error from statement preparation/execution.
fn hydrate_results(
    conn: &rusqlite::Connection,
    document_ids: &[i64],
) -> Result<Vec<HydratedRow>> {
    // Empty id list: skip the query entirely.
    if document_ids.is_empty() {
        return Ok(Vec::new());
    }
    // The ids are sent as one JSON-array parameter; json_each() unpacks it
    // server-side, and j.key (the array index) carries the ranking order.
    let ids_json = serde_json::to_string(document_ids)
        .map_err(|e| LoreError::Other(e.to_string()))?;
    let sql = r#"
        SELECT d.id, d.source_type, d.title, d.url, d.author_username,
               d.created_at, d.updated_at, d.content_text,
               p.path_with_namespace AS project_path,
               (SELECT json_group_array(dl.label_name)
                FROM document_labels dl WHERE dl.document_id = d.id) AS labels_json,
               (SELECT json_group_array(dp.path)
                FROM document_paths dp WHERE dp.document_id = d.id) AS paths_json
        FROM json_each(?1) AS j
        JOIN documents d ON d.id = j.value
        JOIN projects p ON p.id = d.project_id
        ORDER BY j.key
    "#;
    let mut stmt = conn.prepare(sql)?;
    let rows = stmt
        .query_map([ids_json], |row| {
            // Columns 9 and 10 hold the JSON-array subquery results.
            let labels_json: String = row.get(9)?;
            let paths_json: String = row.get(10)?;
            Ok(HydratedRow {
                document_id: row.get(0)?,
                source_type: row.get(1)?,
                title: row.get(2)?,
                url: row.get(3)?,
                author: row.get(4)?,
                created_at: row.get(5)?,
                updated_at: row.get(6)?,
                content_text: row.get(7)?,
                project_path: row.get(8)?,
                labels: parse_json_array(&labels_json),
                paths: parse_json_array(&paths_json),
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;
    Ok(rows)
}
/// Parse a JSON array string into a Vec<String>, filtering out null/empty.
///
/// Malformed JSON and non-string elements are silently dropped rather than
/// surfaced as errors.
fn parse_json_array(json: &str) -> Vec<String> {
    let values: Vec<serde_json::Value> = serde_json::from_str(json).unwrap_or_default();
    let mut out = Vec::with_capacity(values.len());
    for value in values {
        if let Some(s) = value.as_str() {
            if !s.is_empty() {
                out.push(s.to_string());
            }
        }
    }
    out
}
/// Print human-readable search results.
pub fn print_search_results(response: &SearchResponse) {
    // Warnings go to stderr so piped stdout stays clean.
    for w in &response.warnings {
        eprintln!("{} {}", style("Warning:").yellow(), w);
    }
    if response.results.is_empty() {
        println!("No results found for '{}'", style(&response.query).bold());
        return;
    }
    println!(
        "{} results for '{}' ({})",
        response.total_results,
        style(&response.query).bold(),
        response.mode
    );
    println!();
    for (idx, hit) in response.results.iter().enumerate() {
        let kind = match hit.source_type.as_str() {
            "issue" => "Issue",
            "merge_request" => "MR",
            "discussion" => "Discussion",
            other => other,
        };
        println!(
            "[{}] {} - {} (score: {:.2})",
            idx + 1,
            style(kind).cyan(),
            hit.title,
            hit.score
        );
        if let Some(url) = hit.url.as_deref() {
            println!(" {}", style(url).dim());
        }
        let author = hit
            .author
            .as_deref()
            .map(|a| format!("@{}", a))
            .unwrap_or_default();
        println!(" {} | {}", style(&hit.project_path).dim(), author);
        if !hit.labels.is_empty() {
            println!(" Labels: {}", hit.labels.join(", "));
        }
        // Strip HTML highlight tags from the snippet for terminal display.
        let clean_snippet = hit.snippet.replace("<mark>", "").replace("</mark>", "");
        println!(" {}", style(clean_snippet).dim());
        if let Some(explain) = hit.explain.as_ref() {
            let fts_rank = explain
                .fts_rank
                .map(|r| r.to_string())
                .unwrap_or_else(|| "-".into());
            println!(
                " {} fts_rank={} rrf_score={:.6}",
                style("[explain]").magenta(),
                fts_rank,
                explain.rrf_score
            );
        }
        println!();
    }
}
/// JSON output structures for robot mode.
#[derive(Serialize)]
struct SearchJsonOutput<'a> {
    // Always true here: this envelope is only constructed on success.
    ok: bool,
    // Borrowed response payload.
    data: &'a SearchResponse,
    meta: SearchMeta,
}

#[derive(Serialize)]
struct SearchMeta {
    // Wall-clock time of the search, measured by the caller.
    elapsed_ms: u64,
}
/// Print JSON robot-mode output.
pub fn print_search_results_json(response: &SearchResponse, elapsed_ms: u64) {
    let meta = SearchMeta { elapsed_ms };
    let payload = SearchJsonOutput {
        ok: true,
        data: response,
        meta,
    };
    println!("{}", serde_json::to_string(&payload).unwrap());
}

View File

@@ -6,7 +6,7 @@ use serde::Serialize;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::{GiError, Result};
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::time::ms_to_iso;
@@ -188,11 +188,11 @@ fn find_issue(conn: &Connection, iid: i64, project_filter: Option<&str>) -> Resu
.collect::<std::result::Result<Vec<_>, _>>()?;
match issues.len() {
0 => Err(GiError::NotFound(format!("Issue #{} not found", iid))),
0 => Err(LoreError::NotFound(format!("Issue #{} not found", iid))),
1 => Ok(issues.into_iter().next().unwrap()),
_ => {
let projects: Vec<String> = issues.iter().map(|i| i.project_path.clone()).collect();
Err(GiError::Ambiguous(format!(
Err(LoreError::Ambiguous(format!(
"Issue #{} exists in multiple projects: {}. Use --project to specify.",
iid,
projects.join(", ")
@@ -386,11 +386,11 @@ fn find_mr(conn: &Connection, iid: i64, project_filter: Option<&str>) -> Result<
.collect::<std::result::Result<Vec<_>, _>>()?;
match mrs.len() {
0 => Err(GiError::NotFound(format!("MR !{} not found", iid))),
0 => Err(LoreError::NotFound(format!("MR !{} not found", iid))),
1 => Ok(mrs.into_iter().next().unwrap()),
_ => {
let projects: Vec<String> = mrs.iter().map(|m| m.project_path.clone()).collect();
Err(GiError::Ambiguous(format!(
Err(LoreError::Ambiguous(format!(
"MR !{} exists in multiple projects: {}. Use --project to specify.",
iid,
projects.join(", ")

348
src/cli/commands/stats.rs Normal file
View File

@@ -0,0 +1,348 @@
//! Stats command: document counts, embedding coverage, queue status, integrity checks.
use console::style;
use rusqlite::Connection;
use serde::Serialize;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::paths::get_db_path;
use crate::Config;
/// Result of the stats command.
#[derive(Debug, Default, Serialize)]
pub struct StatsResult {
    pub documents: DocumentStats,
    pub embeddings: EmbeddingStats,
    pub fts: FtsStats,
    pub queues: QueueStats,
    // Present only when --check was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub integrity: Option<IntegrityResult>,
}

#[derive(Debug, Default, Serialize)]
pub struct DocumentStats {
    pub total: i64,
    pub issues: i64,
    pub merge_requests: i64,
    pub discussions: i64,
    // Documents flagged is_truncated = 1.
    pub truncated: i64,
}

#[derive(Debug, Default, Serialize)]
pub struct EmbeddingStats {
    // Distinct documents with at least one successfully embedded chunk.
    pub embedded_documents: i64,
    // Successfully embedded chunks across all documents.
    pub total_chunks: i64,
    // embedded_documents / documents.total * 100 (0.0 when no documents).
    pub coverage_pct: f64,
}

#[derive(Debug, Default, Serialize)]
pub struct FtsStats {
    // Rows in the documents_fts index.
    pub indexed: i64,
}

#[derive(Debug, Default, Serialize)]
pub struct QueueStats {
    // Pending vs. failed split is by last_error IS NULL / IS NOT NULL.
    pub dirty_sources: i64,
    pub dirty_sources_failed: i64,
    pub pending_discussion_fetches: i64,
    pub pending_discussion_fetches_failed: i64,
}

#[derive(Debug, Default, Serialize)]
pub struct IntegrityResult {
    // True when all individual checks below are clean.
    pub ok: bool,
    // FTS row count differs from the documents row count.
    pub fts_doc_mismatch: bool,
    // embedding_metadata rows pointing at deleted documents.
    pub orphan_embeddings: i64,
    // embedding_metadata rows whose stored hash no longer matches the document.
    pub stale_metadata: i64,
    // Present only when --repair was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repair: Option<RepairResult>,
}

#[derive(Debug, Default, Serialize)]
pub struct RepairResult {
    pub fts_rebuilt: bool,
    pub orphans_deleted: i64,
    pub stale_cleared: i64,
}
/// Run the stats command.
///
/// Collects document, FTS-index, embedding, and queue counters. When `check`
/// is set, integrity checks are also run; when `repair` is additionally set,
/// detected inconsistencies are fixed in place. `repair` has no effect
/// without `check`.
///
/// Count queries deliberately treat missing tables as zero (via
/// `count_query` / `table_exists`) so the command works across schema
/// generations.
pub fn run_stats(
    config: &Config,
    check: bool,
    repair: bool,
) -> Result<StatsResult> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;
    let mut result = StatsResult::default();

    // Document counts
    result.documents.total = count_query(&conn, "SELECT COUNT(*) FROM documents")?;
    result.documents.issues =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'issue'")?;
    result.documents.merge_requests =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'merge_request'")?;
    result.documents.discussions =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'discussion'")?;
    result.documents.truncated =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE is_truncated = 1")?;

    // Embedding stats — skip gracefully if table doesn't exist (Gate A only)
    if table_exists(&conn, "embedding_metadata") {
        let embedded = count_query(
            &conn,
            "SELECT COUNT(DISTINCT document_id) FROM embedding_metadata WHERE last_error IS NULL",
        )?;
        let chunks = count_query(
            &conn,
            "SELECT COUNT(*) FROM embedding_metadata WHERE last_error IS NULL",
        )?;
        result.embeddings.embedded_documents = embedded;
        result.embeddings.total_chunks = chunks;
        // Avoid divide-by-zero when no documents exist.
        result.embeddings.coverage_pct = if result.documents.total > 0 {
            (embedded as f64 / result.documents.total as f64) * 100.0
        } else {
            0.0
        };
    }

    // FTS stats
    result.fts.indexed = count_query(&conn, "SELECT COUNT(*) FROM documents_fts")?;

    // Queue stats
    result.queues.dirty_sources =
        count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NULL")?;
    result.queues.dirty_sources_failed =
        count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NOT NULL")?;
    if table_exists(&conn, "pending_discussion_fetches") {
        result.queues.pending_discussion_fetches = count_query(
            &conn,
            "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NULL",
        )?;
        result.queues.pending_discussion_fetches_failed = count_query(
            &conn,
            "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NOT NULL",
        )?;
    }

    // Integrity check
    if check {
        let mut integrity = IntegrityResult::default();

        // FTS/doc count mismatch
        integrity.fts_doc_mismatch = result.fts.indexed != result.documents.total;

        // Orphan embeddings: metadata rows pointing at deleted documents.
        // Guard on embedding_metadata — the table this query actually reads.
        // (Previously guarded on "embeddings", which skipped the check on
        // databases that have metadata but no vec0 table.)
        if table_exists(&conn, "embedding_metadata") {
            integrity.orphan_embeddings = count_query(
                &conn,
                "SELECT COUNT(*) FROM embedding_metadata em
                 WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = em.document_id)",
            )?;
        }

        // Stale metadata (document_hash != current content_hash)
        if table_exists(&conn, "embedding_metadata") {
            integrity.stale_metadata = count_query(
                &conn,
                "SELECT COUNT(*) FROM embedding_metadata em
                 JOIN documents d ON d.id = em.document_id
                 WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash",
            )?;
        }

        integrity.ok = !integrity.fts_doc_mismatch
            && integrity.orphan_embeddings == 0
            && integrity.stale_metadata == 0;

        // Repair
        if repair {
            let mut repair_result = RepairResult::default();
            if integrity.fts_doc_mismatch {
                conn.execute(
                    "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')",
                    [],
                )?;
                repair_result.fts_rebuilt = true;
            }
            if integrity.orphan_embeddings > 0 && table_exists(&conn, "embedding_metadata") {
                let deleted = conn.execute(
                    "DELETE FROM embedding_metadata
                     WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = embedding_metadata.document_id)",
                    [],
                )?;
                repair_result.orphans_deleted = deleted as i64;
                // Also clean orphaned vectors if the vec0 table exists
                // (best-effort: vec0 virtual tables may not support this).
                if table_exists(&conn, "embeddings") {
                    let _ = conn.execute(
                        "DELETE FROM embeddings
                         WHERE rowid / 1000 NOT IN (SELECT id FROM documents)",
                        [],
                    );
                }
            }
            if integrity.stale_metadata > 0 && table_exists(&conn, "embedding_metadata") {
                let cleared = conn.execute(
                    "DELETE FROM embedding_metadata
                     WHERE document_id IN (
                         SELECT em.document_id FROM embedding_metadata em
                         JOIN documents d ON d.id = em.document_id
                         WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash
                     )",
                    [],
                )?;
                repair_result.stale_cleared = cleared as i64;
            }
            integrity.repair = Some(repair_result);
        }
        result.integrity = Some(integrity);
    }

    Ok(result)
}
/// Execute a single-value `COUNT(*)` query and return the count.
///
/// Any query failure (e.g. the table does not exist) is mapped to 0
/// rather than propagated, so callers can count optional tables
/// without a prior existence check. The `Result` return keeps the
/// signature uniform with other query helpers.
fn count_query(conn: &Connection, sql: &str) -> Result<i64> {
    Ok(conn.query_row(sql, [], |row| row.get(0)).unwrap_or(0))
}
/// Report whether a table named `table` exists in the open database.
///
/// Looks the name up in `sqlite_master`; on any query error the table
/// is treated as absent (returns `false`).
fn table_exists(conn: &Connection, table: &str) -> bool {
    let hits: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
            [table],
            |row| row.get(0),
        )
        .unwrap_or(0);
    hits > 0
}
/// Print human-readable stats.
///
/// Sections: document counts, search-index coverage, queue depths,
/// then (when present) the integrity check and repair summaries.
pub fn print_stats(result: &StatsResult) {
    let docs = &result.documents;

    // Document counts
    println!("{}", style("Documents").cyan().bold());
    println!(" Total: {}", docs.total);
    println!(" Issues: {}", docs.issues);
    println!(" Merge Requests: {}", docs.merge_requests);
    println!(" Discussions: {}", docs.discussions);
    if docs.truncated > 0 {
        println!(" Truncated: {}", style(docs.truncated).yellow());
    }
    println!();

    // Search-index coverage
    println!("{}", style("Search Index").cyan().bold());
    println!(" FTS indexed: {}", result.fts.indexed);
    println!(
        " Embedding coverage: {:.1}% ({}/{})",
        result.embeddings.coverage_pct, result.embeddings.embedded_documents, docs.total
    );
    if result.embeddings.total_chunks > 0 {
        println!(" Total chunks: {}", result.embeddings.total_chunks);
    }
    println!();

    // Queue depths
    println!("{}", style("Queues").cyan().bold());
    println!(
        " Dirty sources: {} pending, {} failed",
        result.queues.dirty_sources, result.queues.dirty_sources_failed
    );
    println!(
        " Discussion fetch: {} pending, {} failed",
        result.queues.pending_discussion_fetches, result.queues.pending_discussion_fetches_failed
    );

    // Integrity section is only present when --check was requested.
    let Some(integrity) = result.integrity.as_ref() else {
        return;
    };
    println!();
    let status = if integrity.ok {
        style("OK").green().bold()
    } else {
        style("ISSUES FOUND").red().bold()
    };
    println!("{} Integrity: {}", style("Check").cyan().bold(), status);
    if integrity.fts_doc_mismatch {
        println!(" {} FTS/document count mismatch", style("!").red());
    }
    if integrity.orphan_embeddings > 0 {
        println!(
            " {} {} orphan embeddings",
            style("!").red(),
            integrity.orphan_embeddings
        );
    }
    if integrity.stale_metadata > 0 {
        println!(
            " {} {} stale embedding metadata",
            style("!").red(),
            integrity.stale_metadata
        );
    }

    // Repair section is only present when --repair was requested.
    let Some(repair) = integrity.repair.as_ref() else {
        return;
    };
    println!();
    println!("{}", style("Repair").cyan().bold());
    let mut fixed_any = false;
    if repair.fts_rebuilt {
        println!(" {} FTS index rebuilt", style("fixed").green());
        fixed_any = true;
    }
    if repair.orphans_deleted > 0 {
        println!(
            " {} {} orphan embeddings deleted",
            style("fixed").green(),
            repair.orphans_deleted
        );
        fixed_any = true;
    }
    if repair.stale_cleared > 0 {
        println!(
            " {} {} stale metadata entries cleared",
            style("fixed").green(),
            repair.stale_cleared
        );
        fixed_any = true;
    }
    if !fixed_any {
        println!(" No issues to repair.");
    }
}
/// JSON output structures.
///
/// Borrows the stats result instead of owning a copy, matching the
/// `SyncJsonOutput<'a>` pattern used by the sync command.
#[derive(Serialize)]
struct StatsJsonOutput<'a> {
    ok: bool,
    data: &'a StatsResult,
}

/// Print JSON robot-mode output.
///
/// Serializes the envelope `{"ok":true,"data":{...}}` straight through
/// a reference to `result`. The previous implementation rebuilt every
/// nested struct field-by-field (`Foo { ..*&result.foo }`) purely to
/// obtain an owned `StatsResult`; serde serializes `&StatsResult`
/// identically, so the copy was pure overhead and a maintenance hazard
/// (new fields had to be threaded through by hand).
pub fn print_stats_json(result: &StatsResult) {
    let output = StatsJsonOutput {
        ok: true,
        data: result,
    };
    // Serialization of plain data structs does not fail in practice;
    // unwrap here matches print_sync_json.
    println!("{}", serde_json::to_string(&output).unwrap());
}

124
src/cli/commands/sync.rs Normal file
View File

@@ -0,0 +1,124 @@
//! Sync command: unified orchestrator for ingest -> generate-docs -> embed.
use console::style;
use serde::Serialize;
use tracing::{info, warn};
use crate::Config;
use crate::core::error::Result;
use super::embed::run_embed;
use super::generate_docs::run_generate_docs;
use super::ingest::run_ingest;
/// Options for the sync command.
#[derive(Debug, Default)]
pub struct SyncOptions {
    /// Passed through to both ingest stages (issues and MRs); see
    /// `run_ingest` for the exact semantics of a full run.
    pub full: bool,
    /// Passed through to both ingest stages; forces re-ingestion.
    pub force: bool,
    /// Skip stage 4 (embedding) entirely (`--no-embed`).
    pub no_embed: bool,
    /// Skip stage 3 (document generation) entirely (`--no-docs`).
    pub no_docs: bool,
}
/// Result of the sync command.
///
/// Counters accumulated across the four pipeline stages; stages that
/// are skipped or fail gracefully leave their counters at 0.
#[derive(Debug, Default, Serialize)]
pub struct SyncResult {
    /// Issues upserted by stage 1.
    pub issues_updated: usize,
    /// Merge requests upserted by stage 2.
    pub mrs_updated: usize,
    /// Discussions fetched, summed over both ingest stages.
    pub discussions_fetched: usize,
    /// Documents regenerated by stage 3 (0 when `--no-docs`).
    pub documents_regenerated: usize,
    /// Documents embedded by stage 4 (0 when `--no-embed` or embedding failed).
    pub documents_embedded: usize,
}
/// Run the full sync pipeline: ingest -> generate-docs -> embed.
///
/// Stages 1-2 (ingest) are mandatory and abort the pipeline on error;
/// stage 3 can be skipped with `--no-docs`; stage 4 can be skipped
/// with `--no-embed` and degrades to a warning if embedding fails
/// (e.g. Ollama is not running).
pub async fn run_sync(config: &Config, options: SyncOptions) -> Result<SyncResult> {
    let mut summary = SyncResult::default();

    // Stage 1: Ingest issues
    info!("Sync stage 1/4: ingesting issues");
    let issues = run_ingest(config, "issues", None, options.force, options.full, true).await?;
    summary.issues_updated = issues.issues_upserted;
    summary.discussions_fetched += issues.discussions_fetched;

    // Stage 2: Ingest MRs
    info!("Sync stage 2/4: ingesting merge requests");
    let mrs = run_ingest(config, "mrs", None, options.force, options.full, true).await?;
    summary.mrs_updated = mrs.mrs_upserted;
    summary.discussions_fetched += mrs.discussions_fetched;

    // Stage 3: Generate documents (unless --no-docs)
    if !options.no_docs {
        info!("Sync stage 3/4: generating documents");
        summary.documents_regenerated = run_generate_docs(config, false, None)?.regenerated;
    } else {
        info!("Sync stage 3/4: skipping document generation (--no-docs)");
    }

    // Stage 4: Embed documents (unless --no-embed)
    if !options.no_embed {
        info!("Sync stage 4/4: embedding documents");
        match run_embed(config, false).await {
            Ok(embed) => summary.documents_embedded = embed.embedded,
            Err(e) => {
                // Graceful degradation: Ollama down is a warning, not an error
                warn!(error = %e, "Embedding stage failed (Ollama may be unavailable), continuing");
            }
        }
    } else {
        info!("Sync stage 4/4: skipping embedding (--no-embed)");
    }

    info!(
        issues = summary.issues_updated,
        mrs = summary.mrs_updated,
        discussions = summary.discussions_fetched,
        docs = summary.documents_regenerated,
        embedded = summary.documents_embedded,
        "Sync pipeline complete"
    );
    Ok(summary)
}
/// Print human-readable sync summary.
///
/// One line per pipeline counter, followed by wall-clock time.
pub fn print_sync(result: &SyncResult, elapsed: std::time::Duration) {
    println!("{} Sync complete:", style("done").green().bold());
    let counters = [
        ("Issues updated", result.issues_updated),
        ("MRs updated", result.mrs_updated),
        ("Discussions fetched", result.discussions_fetched),
        ("Documents regenerated", result.documents_regenerated),
        ("Documents embedded", result.documents_embedded),
    ];
    for (label, value) in counters {
        println!(" {}: {}", label, value);
    }
    println!(" Elapsed: {:.1}s", elapsed.as_secs_f64());
}
/// JSON output for sync.
///
/// Envelope serialized as `{"ok":..., "data":..., "meta":...}`;
/// borrows the result to avoid copying it.
#[derive(Serialize)]
struct SyncJsonOutput<'a> {
    ok: bool,
    data: &'a SyncResult,
    meta: SyncMeta,
}

/// Timing metadata attached to the JSON envelope.
#[derive(Serialize)]
struct SyncMeta {
    /// Total pipeline wall-clock time in milliseconds.
    elapsed_ms: u64,
}
/// Print JSON robot-mode sync output.
///
/// Wraps the result in the `{ok, data, meta}` envelope and writes one
/// JSON line to stdout.
pub fn print_sync_json(result: &SyncResult, elapsed_ms: u64) {
    let payload = SyncJsonOutput {
        ok: true,
        data: result,
        meta: SyncMeta { elapsed_ms },
    };
    // Plain data structs serialize infallibly in practice.
    let line = serde_json::to_string(&payload).unwrap();
    println!("{}", line);
}