Files
gitlore/src/cli/commands/search.rs
teernisse 83cd16c918 feat: implement per-note search and document pipeline
- Add SourceType::Note with extract_note_document() and ParentMetadataCache
- Migration 022: composite indexes for notes queries + author_id column
- Migration 024: table rebuild adding 'note' to CHECK constraints, defense triggers
- Migration 025: backfill existing non-system notes into dirty queue
- Add lore notes CLI command with 17 filter options (author, path, resolution, etc.)
- Support table/json/jsonl/csv output formats with field selection
- Wire note dirty tracking through discussion and MR discussion ingestion
- Fix test_migration_024_preserves_existing_data off-by-one (tested wrong migration)
- Fix upsert_document_inner returning false for label/path-only changes
2026-02-12 13:31:24 -05:00

419 lines
12 KiB
Rust

use std::collections::HashMap;
use console::style;
use serde::Serialize;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::core::time::{ms_to_iso, parse_since};
use crate::documents::SourceType;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::search::{
FtsQueryMode, HybridResult, PathFilter, SearchFilters, SearchMode, get_result_snippet,
search_fts, search_hybrid,
};
/// One search hit in display form, as built by `run_search` and consumed by the
/// human-readable and JSON printers in this file.
#[derive(Debug, Serialize)]
pub struct SearchResultDisplay {
/// `id` of the matching row in the `documents` table.
pub document_id: i64,
/// Raw `source_type` column value; the text printer special-cases
/// "issue", "merge_request", "discussion" and "note".
pub source_type: String,
/// Document title; an empty string when the stored title is NULL.
pub title: String,
/// Link to the source item, when the document has one.
pub url: Option<String>,
/// `author_username` of the document, when present.
pub author: Option<String>,
/// Creation time rendered through `ms_to_iso` (presumably from epoch milliseconds — confirm).
pub created_at: Option<String>,
/// Last-update time rendered through `ms_to_iso` (presumably from epoch milliseconds — confirm).
pub updated_at: Option<String>,
/// `path_with_namespace` of the project that owns the document.
pub project_path: String,
/// Non-empty label names attached to the document.
pub labels: Vec<String>,
/// Non-empty file paths attached to the document.
pub paths: Vec<String>,
/// Snippet chosen by `get_result_snippet` from the FTS snippet (if any) and the document text.
/// May contain `<mark>` tags; the text printer strips them.
pub snippet: String,
/// Score taken from the hybrid search result; 0.0 if no hybrid result matched this document id.
pub score: f64,
/// Ranking details, present only when explain output was requested; left out of the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub explain: Option<ExplainData>,
}
/// Ranking diagnostics for one result, copied field-for-field from the matching `HybridResult`.
#[derive(Debug, Serialize)]
pub struct ExplainData {
/// Rank reported for the vector side of the search, if any; printed as "-" when absent.
pub vector_rank: Option<usize>,
/// Rank reported for the FTS side of the search, if any; printed as "-" when absent.
pub fts_rank: Option<usize>,
/// `rrf_score` of the hybrid result (presumably the reciprocal-rank-fusion score — confirm in `search_hybrid`).
pub rrf_score: f64,
}
/// Complete outcome of one `run_search` call.
#[derive(Debug, Serialize)]
pub struct SearchResponse {
/// The query string exactly as passed to `run_search`.
pub query: String,
/// String form (`as_str`) of the search mode that was actually used.
pub mode: String,
/// Number of entries in `results`.
pub total_results: usize,
/// Hits in the order returned by result hydration (ranked order of the hybrid search).
pub results: Vec<SearchResultDisplay>,
/// Non-fatal messages collected during the search; the text printer writes them to stderr.
pub warnings: Vec<String>,
}
/// Filter options in their raw command-line form; `run_search` converts them into `SearchFilters`.
pub struct SearchCliFilters {
/// Source type name, parsed with `SourceType::parse`; a value it rejects results in no type filter.
pub source_type: Option<String>,
/// Author filter, forwarded unchanged.
pub author: Option<String>,
/// Project reference, resolved to a project id with `resolve_project`.
pub project: Option<String>,
/// Label filters, forwarded unchanged.
pub labels: Vec<String>,
/// Path filter: a value ending in '/' is treated as a prefix, anything else as an exact path.
pub path: Option<String>,
/// Lower bound handed to `parse_since`; the error text names relative (7d, 2w, 1m)
/// and absolute (YYYY-MM-DD) forms.
pub since: Option<String>,
/// Same format as `since`, forwarded as the `updated_since` filter.
pub updated_since: Option<String>,
/// Requested maximum number of results, forwarded as `SearchFilters::limit`.
pub limit: usize,
}
/// Runs a search against the local document index and returns display-ready results.
///
/// The function
/// 1. opens the database and returns early (with a warning) when no documents are indexed,
/// 2. converts the raw CLI filters into [`SearchFilters`],
/// 3. runs FTS once purely to collect snippets,
/// 4. runs [`search_hybrid`] for ranking and filtering, and
/// 5. hydrates the ranked document ids into [`SearchResultDisplay`] values.
///
/// `requested_mode` is parsed with `SearchMode::parse`; a value it does not accept falls
/// back to hybrid mode. When `explain` is true every result carries its ranking details.
///
/// A `source_type` filter that `SourceType::parse` does not recognise is ignored, and a
/// warning naming the rejected value is added to the response so the caller can tell the
/// results are unfiltered.
///
/// # Errors
/// Fails when the database cannot be opened, the project filter cannot be resolved, a
/// `--since` / `--updated-since` value cannot be parsed, or the FTS query, hybrid search or
/// result hydration reports an error.
pub async fn run_search(
    config: &Config,
    query: &str,
    cli_filters: SearchCliFilters,
    fts_mode: FtsQueryMode,
    requested_mode: &str,
    explain: bool,
) -> Result<SearchResponse> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;
    let mut warnings: Vec<String> = Vec::new();
    let actual_mode = SearchMode::parse(requested_mode).unwrap_or(SearchMode::Hybrid);

    // A failing count query is treated like an empty index (best effort).
    let doc_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM documents", [], |row| row.get(0))
        .unwrap_or(0);
    if doc_count == 0 {
        warnings.push("No documents indexed. Run 'lore generate-docs' first.".to_string());
        return Ok(empty_search_response(
            query,
            actual_mode.as_str(),
            warnings,
        ));
    }

    let source_type = cli_filters
        .source_type
        .as_deref()
        .and_then(SourceType::parse);
    // Surface a rejected type instead of silently returning unfiltered results.
    if let (Some(raw), None) = (cli_filters.source_type.as_deref(), source_type.as_ref()) {
        warnings.push(format!(
            "Unknown source type '{}'; the type filter was ignored.",
            raw
        ));
    }

    let project_id = cli_filters
        .project
        .as_deref()
        .map(|p| resolve_project(&conn, p))
        .transpose()?;

    // Shared parser for both time filters; `flag` only affects the error text.
    let parse_time_filter = |flag: &str, raw: Option<&str>| {
        raw.map(|value| {
            parse_since(value).ok_or_else(|| {
                LoreError::Other(format!(
                    "Invalid {} value '{}'. Use relative (7d, 2w, 1m) or absolute (YYYY-MM-DD) format.",
                    flag, value
                ))
            })
        })
        .transpose()
    };
    let since = parse_time_filter("--since", cli_filters.since.as_deref())?;
    let updated_since =
        parse_time_filter("--updated-since", cli_filters.updated_since.as_deref())?;

    // A trailing slash selects everything below a directory; otherwise match one path.
    let path = cli_filters.path.as_deref().map(|p| {
        if p.ends_with('/') {
            PathFilter::Prefix(p.to_string())
        } else {
            PathFilter::Exact(p.to_string())
        }
    });

    let filters = SearchFilters {
        source_type,
        author: cli_filters.author,
        project_id,
        since,
        updated_since,
        labels: cli_filters.labels,
        path,
        limit: cli_filters.limit,
    };

    // Run FTS separately for snippet extraction (search_hybrid doesn't return snippets).
    // Ten times the clamped limit, kept between 50 and 1500 rows.
    let snippet_top_k = filters
        .clamp_limit()
        .checked_mul(10)
        .unwrap_or(500)
        .clamp(50, 1500);
    let snippet_map: HashMap<i64, String> = search_fts(&conn, query, snippet_top_k, fts_mode)?
        .into_iter()
        .map(|r| (r.document_id, r.snippet))
        .collect();

    // The embedding client is only needed once we actually search, and never in lexical mode.
    let client = if actual_mode != SearchMode::Lexical {
        let ollama_cfg = &config.embedding;
        Some(OllamaClient::new(OllamaConfig {
            base_url: ollama_cfg.base_url.clone(),
            model: ollama_cfg.model.clone(),
            ..OllamaConfig::default()
        }))
    } else {
        None
    };

    // search_hybrid handles recall sizing, RRF ranking, and filter application internally.
    let (hybrid_results, mut hybrid_warnings) = search_hybrid(
        &conn,
        client.as_ref(),
        query,
        actual_mode,
        &filters,
        fts_mode,
    )
    .await?;
    warnings.append(&mut hybrid_warnings);

    if hybrid_results.is_empty() {
        return Ok(empty_search_response(
            query,
            actual_mode.as_str(),
            warnings,
        ));
    }

    let ranked_ids: Vec<i64> = hybrid_results.iter().map(|r| r.document_id).collect();
    let hydrated = hydrate_results(&conn, &ranked_ids)?;
    let hybrid_map: HashMap<i64, &HybridResult> =
        hybrid_results.iter().map(|r| (r.document_id, r)).collect();

    // Hydrated rows are consumed by value so their strings move into the output.
    let results: Vec<SearchResultDisplay> = hydrated
        .into_iter()
        .map(|row| {
            let hr = hybrid_map.get(&row.document_id);
            let fts_snippet = snippet_map.get(&row.document_id).map(String::as_str);
            let snippet = get_result_snippet(fts_snippet, &row.content_text);
            let explain_data = if explain {
                hr.map(|r| ExplainData {
                    vector_rank: r.vector_rank,
                    fts_rank: r.fts_rank,
                    rrf_score: r.rrf_score,
                })
            } else {
                None
            };
            SearchResultDisplay {
                document_id: row.document_id,
                source_type: row.source_type,
                title: row.title.unwrap_or_default(),
                url: row.url,
                author: row.author,
                created_at: row.created_at.map(ms_to_iso),
                updated_at: row.updated_at.map(ms_to_iso),
                project_path: row.project_path,
                labels: row.labels,
                paths: row.paths,
                snippet,
                score: hr.map(|r| r.score).unwrap_or(0.0),
                explain: explain_data,
            }
        })
        .collect();

    Ok(SearchResponse {
        query: query.to_string(),
        mode: actual_mode.as_str().to_string(),
        total_results: results.len(),
        results,
        warnings,
    })
}

/// Builds a response that carries no results, only the query, mode label and warnings.
fn empty_search_response(query: &str, mode: &str, warnings: Vec<String>) -> SearchResponse {
    SearchResponse {
        query: query.to_string(),
        mode: mode.to_string(),
        total_results: 0,
        results: Vec::new(),
        warnings,
    }
}
/// One document row as loaded by `hydrate_results`, before conversion to `SearchResultDisplay`.
struct HydratedRow {
// `documents.id`.
document_id: i64,
// `documents.source_type`.
source_type: String,
// `documents.title`; may be NULL.
title: Option<String>,
// `documents.url`; may be NULL.
url: Option<String>,
// `documents.author_username`; may be NULL.
author: Option<String>,
// `documents.created_at`; later formatted with `ms_to_iso`.
created_at: Option<i64>,
// `documents.updated_at`; later formatted with `ms_to_iso`.
updated_at: Option<i64>,
// `documents.content_text`; used as input for snippet generation.
content_text: String,
// `projects.path_with_namespace` of the owning project.
project_path: String,
// Non-empty `label_name` values from `document_labels`.
labels: Vec<String>,
// Non-empty `path` values from `document_paths`.
paths: Vec<String>,
}
/// Loads the display data for `document_ids`, keeping the order of the input slice.
///
/// The ids travel to SQLite as a single JSON array that `json_each` expands; rows are
/// ordered by their position in that array. Ids without a matching `documents` row, or
/// whose project row is missing, produce no output row because both joins are inner joins.
/// Labels and paths are aggregated per document as JSON arrays and decoded with
/// `parse_json_array`.
///
/// Returns an empty vector without touching the database when `document_ids` is empty.
///
/// # Errors
/// Fails if the id list cannot be JSON-encoded, the statement cannot be prepared, or any
/// row fails to load.
fn hydrate_results(conn: &rusqlite::Connection, document_ids: &[i64]) -> Result<Vec<HydratedRow>> {
    if document_ids.is_empty() {
        return Ok(Vec::new());
    }

    let ids_param = match serde_json::to_string(document_ids) {
        Ok(encoded) => encoded,
        Err(err) => return Err(LoreError::Other(err.to_string())),
    };

    // The query text is kept exactly as it was; only the surrounding Rust changed.
    let sql = r#"
SELECT d.id, d.source_type, d.title, d.url, d.author_username,
d.created_at, d.updated_at, d.content_text,
p.path_with_namespace AS project_path,
(SELECT json_group_array(dl.label_name)
FROM document_labels dl WHERE dl.document_id = d.id) AS labels_json,
(SELECT json_group_array(dp.path)
FROM document_paths dp WHERE dp.document_id = d.id) AS paths_json
FROM json_each(?1) AS j
JOIN documents d ON d.id = j.value
JOIN projects p ON p.id = d.project_id
ORDER BY j.key
"#;

    let mut stmt = conn.prepare(sql)?;
    let mapped = stmt.query_map([ids_param], |row| {
        // Aggregated JSON columns first, then the scalar columns in select order.
        let raw_labels: String = row.get(9)?;
        let raw_paths: String = row.get(10)?;
        let document_id: i64 = row.get(0)?;
        let source_type: String = row.get(1)?;
        let title: Option<String> = row.get(2)?;
        let url: Option<String> = row.get(3)?;
        let author: Option<String> = row.get(4)?;
        let created_at: Option<i64> = row.get(5)?;
        let updated_at: Option<i64> = row.get(6)?;
        let content_text: String = row.get(7)?;
        let project_path: String = row.get(8)?;
        let labels = parse_json_array(&raw_labels);
        let paths = parse_json_array(&raw_paths);
        Ok(HydratedRow {
            document_id,
            source_type,
            title,
            url,
            author,
            created_at,
            updated_at,
            content_text,
            project_path,
            labels,
            paths,
        })
    })?;

    // Stop at the first row that fails to load.
    let mut hydrated = Vec::new();
    for item in mapped {
        hydrated.push(item?);
    }
    Ok(hydrated)
}
/// Decodes a JSON array and returns its non-empty string elements, in order.
///
/// Input that is not a JSON array yields an empty vector. Elements that are not strings
/// (for example `null`) and empty strings are skipped.
fn parse_json_array(json: &str) -> Vec<String> {
    let values = match serde_json::from_str::<Vec<serde_json::Value>>(json) {
        Ok(values) => values,
        Err(_) => return Vec::new(),
    };

    let mut strings = Vec::new();
    for value in values {
        if let serde_json::Value::String(text) = value {
            if !text.is_empty() {
                strings.push(text);
            }
        }
    }
    strings
}
/// Prints a search response for a human reader.
///
/// Warnings are written to stderr. On stdout the function prints either a "no results"
/// line, or a summary line followed by one numbered entry per result: type and title with
/// score, URL (if any), project and author, labels (if any), the snippet with `<mark>` tags
/// removed, and the explain line when ranking details are present.
pub fn print_search_results(response: &SearchResponse) {
    for warning in &response.warnings {
        eprintln!("{} {}", style("Warning:").yellow(), warning);
    }

    if response.results.is_empty() {
        println!("No results found for '{}'", style(&response.query).bold());
        return;
    }

    println!(
        "{} results for '{}' ({})",
        response.total_results,
        style(&response.query).bold(),
        response.mode
    );
    println!();

    // A missing rank is shown as a dash.
    let rank_text = |rank: Option<usize>| -> String {
        match rank {
            Some(position) => position.to_string(),
            None => String::from("-"),
        }
    };

    for (index, hit) in response.results.iter().enumerate() {
        // Known source types get a short label; anything else is shown verbatim.
        let kind_label: &str = match hit.source_type.as_str() {
            "issue" => "Issue",
            "merge_request" => "MR",
            "discussion" => "Discussion",
            "note" => "Note",
            other => other,
        };
        println!(
            "[{}] {} - {} (score: {:.2})",
            index + 1,
            style(kind_label).cyan(),
            hit.title,
            hit.score
        );

        if let Some(url) = hit.url.as_ref() {
            println!(" {}", style(url).dim());
        }

        let author_text = match hit.author.as_deref() {
            Some(name) => format!("@{}", name),
            None => String::new(),
        };
        println!(" {} | {}", style(&hit.project_path).dim(), author_text);

        if !hit.labels.is_empty() {
            println!(" Labels: {}", hit.labels.join(", "));
        }

        let plain_snippet = hit.snippet.replace("<mark>", "").replace("</mark>", "");
        println!(" {}", style(plain_snippet).dim());

        if let Some(details) = hit.explain.as_ref() {
            println!(
                " {} vector_rank={} fts_rank={} rrf_score={:.6}",
                style("[explain]").magenta(),
                rank_text(details.vector_rank),
                rank_text(details.fts_rank),
                details.rrf_score
            );
        }

        println!();
    }
}
/// JSON envelope written by `print_search_results_json`.
#[derive(Serialize)]
struct SearchJsonOutput<'a> {
// Set to `true` by `print_search_results_json`.
ok: bool,
// The search response being reported, borrowed from the caller.
data: &'a SearchResponse,
// Run metadata that accompanies the response.
meta: SearchMeta,
}
/// Metadata section of the JSON envelope.
#[derive(Serialize)]
struct SearchMeta {
// Elapsed time in milliseconds, as supplied by the caller of `print_search_results_json`.
elapsed_ms: u64,
}
/// Prints a search response to stdout as a single line of JSON.
///
/// The output is an envelope with `ok: true`, the response under `data`, and `elapsed_ms`
/// under `meta`. When `fields` is given, it is expanded with the "search" preset and the
/// "results" entries are filtered down to the expanded field list.
///
/// # Panics
/// Panics if the envelope cannot be converted to JSON.
pub fn print_search_results_json(
    response: &SearchResponse,
    elapsed_ms: u64,
    fields: Option<&[String]>,
) {
    let meta = SearchMeta { elapsed_ms };
    let envelope = SearchJsonOutput {
        ok: true,
        data: response,
        meta,
    };

    let mut json = serde_json::to_value(&envelope).unwrap();

    // Field selection is applied to the generic JSON value, not to the typed response.
    if let Some(requested) = fields {
        let selected = crate::cli::robot::expand_fields_preset(requested, "search");
        crate::cli::robot::filter_fields(&mut json, "results", &selected);
    }

    let rendered = serde_json::to_string(&json).unwrap();
    println!("{}", rendered);
}