RobotMeta previously required direct struct literal construction with only elapsed_ms. This made it impossible to add optional fields without updating every call site to include them. Introduce two constructors: - RobotMeta::new(elapsed_ms) — standard meta with timing only - RobotMeta::with_base_url(elapsed_ms, base_url) — meta enriched with the GitLab instance URL, enabling consumers to construct entity links without needing config access The gitlab_base_url field uses #[serde(skip_serializing_if = "Option::is_none")] so existing JSON envelopes are byte-identical — no breaking change for any robot mode consumer. All 22 call sites across handlers, count, cron, drift, embed, generate_docs, ingest, list (mrs/notes), related, show, stats, sync_status, and who are updated from struct literals to RobotMeta::new(). Three tests verify the new constructors and trailing-slash normalization. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
638 lines
19 KiB
Rust
638 lines
19 KiB
Rust
//! Semantic similarity discovery: find related entities via vector search.
|
|
|
|
use std::collections::HashSet;
|
|
|
|
use rusqlite::Connection;
|
|
use serde::Serialize;
|
|
|
|
use crate::cli::render::{Icons, Theme};
|
|
use crate::cli::robot::RobotMeta;
|
|
use crate::core::config::Config;
|
|
use crate::core::db::create_connection;
|
|
use crate::core::error::{LoreError, Result};
|
|
use crate::core::paths::get_db_path;
|
|
use crate::core::project::resolve_project;
|
|
use crate::core::time::ms_to_iso;
|
|
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
|
|
use crate::search::search_vector;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Response types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Envelope for `lore related` output, covering both entity and query modes.
#[derive(Debug, Serialize)]
pub struct RelatedResponse {
    /// "entity" or "query", depending on how the command was invoked.
    pub mode: String,
    /// The anchoring entity (entity mode only; omitted from JSON when absent).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source: Option<RelatedSource>,
    /// The free-text query (query mode only; omitted from JSON when absent).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub query: Option<String>,
    /// Related entities, ordered by vector-search rank.
    pub results: Vec<RelatedResult>,
    /// Non-fatal notices (short query, uniformly low scores); omitted from
    /// JSON when empty.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub warnings: Vec<String>,
}
|
|
|
|
/// The entity a related-search was anchored on (entity mode only).
#[derive(Debug, Serialize)]
pub struct RelatedSource {
    /// "issue" or "merge_request".
    pub source_type: String,
    /// Project-scoped IID of the source entity.
    pub iid: i64,
    /// Title of the source entity.
    pub title: String,
    /// `path_with_namespace` of the owning project.
    pub project_path: String,
}
|
|
|
|
/// One related entity plus ranking and display metadata.
#[derive(Debug, Serialize)]
pub struct RelatedResult {
    /// "issue" or "merge_request".
    pub source_type: String,
    /// Project-scoped IID of the related entity.
    pub iid: i64,
    /// Title of the related entity.
    pub title: String,
    /// GitLab web URL (empty string when the document row had none).
    pub url: String,
    /// 0-1 score derived from L2 distance via `distance_to_similarity`.
    pub similarity_score: f64,
    /// `path_with_namespace` of the owning project.
    pub project_path: String,
    /// Labels present on both the source and this result; omitted from JSON
    /// when empty.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub shared_labels: Vec<String>,
    /// Author username, when known.
    pub author: Option<String>,
    /// ISO-8601 last-update timestamp (empty string when unknown).
    pub updated_at: String,
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Internal row types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Raw row from the `documents` table: embedded content plus denormalized
/// entity metadata, as selected by `find_entity_document`/`hydrate_result`.
struct DocumentRow {
    /// `documents.id` primary key (matched against vector-search hits).
    id: i64,
    /// Document kind: "issue", "merge_request", "discussion", "note", ...
    source_type: String,
    /// Primary key of the backing entity row (NOT the project-scoped IID).
    source_id: i64,
    #[allow(dead_code)]
    project_id: i64,
    #[allow(dead_code)]
    title: Option<String>,
    /// GitLab web URL of the entity, when known.
    url: Option<String>,
    /// The text that was embedded for this document.
    content_text: String,
    /// JSON-encoded array of label names (decoded by `parse_label_names`).
    label_names: Option<String>,
    /// Author username, when known.
    author_username: Option<String>,
    /// Last-update timestamp; fed to `ms_to_iso`, so presumably epoch
    /// milliseconds — confirm against the ingest layer.
    updated_at: Option<i64>,
}
|
|
|
|
/// Display metadata for a single issue/MR, joined with its project path.
struct EntityInfo {
    #[allow(dead_code)]
    iid: i64,
    /// Entity title.
    title: String,
    /// `path_with_namespace` of the owning project.
    project_path: String,
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Main entry point
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Run the related command.
|
|
///
|
|
/// Modes:
|
|
/// - Entity mode: `lore related issues 42` or `lore related mrs 99`
|
|
/// - Query mode: `lore related 'search terms'`
|
|
pub async fn run_related(
|
|
config: &Config,
|
|
query_or_type: &str,
|
|
iid: Option<i64>,
|
|
limit: usize,
|
|
project: Option<&str>,
|
|
) -> Result<RelatedResponse> {
|
|
let db_path = get_db_path(config.storage.db_path.as_deref());
|
|
let conn = create_connection(&db_path)?;
|
|
|
|
// Check if embeddings exist
|
|
let embedding_count: i64 = conn
|
|
.query_row("SELECT COUNT(*) FROM embedding_metadata", [], |row| {
|
|
row.get(0)
|
|
})
|
|
.unwrap_or(0);
|
|
|
|
if embedding_count == 0 {
|
|
return Err(LoreError::Other(
|
|
"No embeddings found. Run 'lore embed' first to generate vector embeddings.".into(),
|
|
));
|
|
}
|
|
|
|
// Validate input
|
|
if query_or_type.trim().is_empty() {
|
|
return Err(LoreError::Other(
|
|
"Query cannot be empty. Provide an entity type (issues/mrs) and IID, or a search query.".into(),
|
|
));
|
|
}
|
|
|
|
// Determine mode: entity vs query
|
|
let entity_type = match query_or_type.to_lowercase().as_str() {
|
|
"issues" | "issue" | "i" => Some("issue"),
|
|
"mrs" | "mr" | "m" | "merge_request" => Some("merge_request"),
|
|
_ => None,
|
|
};
|
|
|
|
if let Some(etype) = entity_type {
|
|
// Entity mode
|
|
let iid = iid.ok_or_else(|| {
|
|
LoreError::Other("Entity mode requires an IID (e.g., 'lore related issues 42')".into())
|
|
})?;
|
|
run_related_entity(&conn, config, etype, iid, limit, project).await
|
|
} else {
|
|
// Query mode - treat query_or_type as free text
|
|
run_related_query(&conn, config, query_or_type, limit, project).await
|
|
}
|
|
}
|
|
|
|
async fn run_related_entity(
|
|
conn: &Connection,
|
|
config: &Config,
|
|
entity_type: &str,
|
|
iid: i64,
|
|
limit: usize,
|
|
project_filter: Option<&str>,
|
|
) -> Result<RelatedResponse> {
|
|
// Find the source document
|
|
let source_doc = find_entity_document(conn, entity_type, iid, project_filter)?;
|
|
let source_info = get_entity_info(conn, entity_type, source_doc.source_id)?;
|
|
|
|
// Embed the source content
|
|
let embedding = embed_text(config, &source_doc.content_text).await?;
|
|
|
|
// Search for similar documents (limit + 1 to account for filtering self)
|
|
let vector_results = search_vector(conn, &embedding, limit.saturating_add(1))?;
|
|
|
|
// Filter out self and hydrate results
|
|
let source_labels = parse_label_names(&source_doc.label_names);
|
|
let mut results = Vec::new();
|
|
let mut warnings = Vec::new();
|
|
|
|
for vr in vector_results {
|
|
// Skip self
|
|
if vr.document_id == source_doc.id {
|
|
continue;
|
|
}
|
|
|
|
if let Some(result) = hydrate_result(conn, vr.document_id, vr.distance, &source_labels)? {
|
|
results.push(result);
|
|
}
|
|
|
|
if results.len() >= limit {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Check for low similarity
|
|
if !results.is_empty() && results.iter().all(|r| r.similarity_score < 0.3) {
|
|
warnings.push("No strongly related entities found (all scores < 0.3)".to_string());
|
|
}
|
|
|
|
Ok(RelatedResponse {
|
|
mode: "entity".to_string(),
|
|
source: Some(RelatedSource {
|
|
source_type: entity_type.to_string(),
|
|
iid,
|
|
title: source_info.title,
|
|
project_path: source_info.project_path,
|
|
}),
|
|
query: None,
|
|
results,
|
|
warnings,
|
|
})
|
|
}
|
|
|
|
async fn run_related_query(
|
|
conn: &Connection,
|
|
config: &Config,
|
|
query: &str,
|
|
limit: usize,
|
|
project_filter: Option<&str>,
|
|
) -> Result<RelatedResponse> {
|
|
let mut warnings = Vec::new();
|
|
|
|
// Warn if query is very short
|
|
if query.split_whitespace().count() <= 2 {
|
|
warnings.push("Short queries may produce noisy results".to_string());
|
|
}
|
|
|
|
// Embed the query
|
|
let embedding = embed_text(config, query).await?;
|
|
|
|
// Search for similar documents (fetch extra to allow for project filtering)
|
|
let vector_results = search_vector(conn, &embedding, limit.saturating_mul(2))?;
|
|
|
|
// Filter by project if specified and hydrate
|
|
let project_id = project_filter
|
|
.map(|p| resolve_project(conn, p))
|
|
.transpose()?;
|
|
|
|
let mut results = Vec::new();
|
|
let empty_labels: HashSet<String> = HashSet::new();
|
|
|
|
for vr in vector_results {
|
|
// Check project filter
|
|
if let Some(pid) = project_id {
|
|
let doc_project_id: Option<i64> = conn
|
|
.query_row(
|
|
"SELECT project_id FROM documents WHERE id = ?1",
|
|
[vr.document_id],
|
|
|row| row.get(0),
|
|
)
|
|
.ok();
|
|
|
|
if doc_project_id != Some(pid) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if let Some(result) = hydrate_result(conn, vr.document_id, vr.distance, &empty_labels)? {
|
|
results.push(result);
|
|
}
|
|
|
|
if results.len() >= limit {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Check for low similarity
|
|
if !results.is_empty() && results.iter().all(|r| r.similarity_score < 0.3) {
|
|
warnings.push("No strongly related entities found (all scores < 0.3)".to_string());
|
|
}
|
|
|
|
Ok(RelatedResponse {
|
|
mode: "query".to_string(),
|
|
source: None,
|
|
query: Some(query.to_string()),
|
|
results,
|
|
warnings,
|
|
})
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// DB helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Find the `documents` row for an issue or merge request, addressed by IID.
///
/// Joins `documents` against the entity table so the embedded content and
/// display metadata come back in one query. When `project_filter` is given
/// it is resolved to a project id and added to the WHERE clause, which
/// disambiguates IIDs shared across projects.
///
/// # Errors
/// - `LoreError::Other` for an unrecognized `entity_type`
/// - `LoreError::NotFound` when nothing matches
/// - `LoreError::Ambiguous` when the IID exists in several projects and no
///   project filter was supplied
fn find_entity_document(
    conn: &Connection,
    entity_type: &str,
    iid: i64,
    project_filter: Option<&str>,
) -> Result<DocumentRow> {
    // Map the logical type to its table name. Only these two literals ever
    // reach the `format!` SQL below, so the interpolation is injection-safe.
    let table = match entity_type {
        "issue" => "issues",
        "merge_request" => "merge_requests",
        _ => {
            return Err(LoreError::Other(format!(
                "Unknown entity type: {entity_type}"
            )));
        }
    };

    // Build SQL and boxed parameters together so both branches feed the same
    // prepare/query path below.
    let (sql, params): (String, Vec<Box<dyn rusqlite::ToSql>>) = match project_filter {
        Some(project) => {
            let project_id = resolve_project(conn, project)?;
            (
                format!(
                    "SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url,
                     d.content_text, d.label_names, d.author_username, d.updated_at
                     FROM documents d
                     JOIN {table} e ON d.source_id = e.id
                     WHERE d.source_type = ?1 AND e.iid = ?2 AND e.project_id = ?3"
                ),
                vec![
                    Box::new(entity_type.to_string()),
                    Box::new(iid),
                    Box::new(project_id),
                ],
            )
        }
        None => (
            format!(
                "SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url,
                 d.content_text, d.label_names, d.author_username, d.updated_at
                 FROM documents d
                 JOIN {table} e ON d.source_id = e.id
                 WHERE d.source_type = ?1 AND e.iid = ?2"
            ),
            vec![Box::new(entity_type.to_string()), Box::new(iid)],
        ),
    };

    // rusqlite wants `&[&dyn ToSql]`; borrow the boxed params.
    let param_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect();

    let mut stmt = conn.prepare(&sql)?;
    let rows: Vec<DocumentRow> = stmt
        .query_map(param_refs.as_slice(), |row| {
            Ok(DocumentRow {
                id: row.get(0)?,
                source_type: row.get(1)?,
                source_id: row.get(2)?,
                project_id: row.get(3)?,
                title: row.get(4)?,
                url: row.get(5)?,
                content_text: row.get(6)?,
                label_names: row.get(7)?,
                author_username: row.get(8)?,
                updated_at: row.get(9)?,
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    // Exactly one match is required; zero and many are distinct errors.
    match rows.len() {
        0 => Err(LoreError::NotFound(format!(
            "{entity_type} #{iid} not found (run 'lore sync' first?)"
        ))),
        1 => Ok(rows.into_iter().next().unwrap()),
        _ => Err(LoreError::Ambiguous(format!(
            "{entity_type} #{iid} exists in multiple projects. Use --project to specify."
        ))),
    }
}
|
|
|
|
/// Fetch display metadata (IID, title, project path) for an entity row.
///
/// `entity_id` is the entity table's primary key (`issues.id` /
/// `merge_requests.id`), NOT the project-scoped IID.
///
/// NOTE(review): every `query_row` failure — including genuine DB errors,
/// not just missing rows — is mapped to `LoreError::NotFound`; confirm that
/// collapsing is acceptable for callers.
fn get_entity_info(conn: &Connection, entity_type: &str, entity_id: i64) -> Result<EntityInfo> {
    // Same injection-safe table mapping as `find_entity_document`.
    let table = match entity_type {
        "issue" => "issues",
        "merge_request" => "merge_requests",
        _ => {
            return Err(LoreError::Other(format!(
                "Unknown entity type: {entity_type}"
            )));
        }
    };

    let sql = format!(
        "SELECT e.iid, e.title, p.path_with_namespace
         FROM {table} e
         JOIN projects p ON e.project_id = p.id
         WHERE e.id = ?1"
    );

    conn.query_row(&sql, [entity_id], |row| {
        Ok(EntityInfo {
            iid: row.get(0)?,
            title: row.get(1)?,
            project_path: row.get(2)?,
        })
    })
    .map_err(|e| LoreError::NotFound(format!("Entity not found: {e}")))
}
|
|
|
|
/// Turn a vector-search hit (document id + L2 distance) into a user-facing
/// `RelatedResult`, or `Ok(None)` when the hit should be silently skipped.
///
/// Skips: documents that no longer exist, discussion/note documents (only
/// issues and MRs are surfaced), and orphaned documents whose backing entity
/// row has been deleted.
///
/// `source_labels` is the label set of the search source; its intersection
/// with this document's labels becomes `shared_labels`.
fn hydrate_result(
    conn: &Connection,
    document_id: i64,
    distance: f64,
    source_labels: &HashSet<String>,
) -> Result<Option<RelatedResult>> {
    // Load the full document row; `.ok()` turns a lookup failure into a skip
    // rather than an error.
    let doc: Option<DocumentRow> = conn
        .query_row(
            "SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url,
             d.content_text, d.label_names, d.author_username, d.updated_at
             FROM documents d
             WHERE d.id = ?1",
            [document_id],
            |row| {
                Ok(DocumentRow {
                    id: row.get(0)?,
                    source_type: row.get(1)?,
                    source_id: row.get(2)?,
                    project_id: row.get(3)?,
                    title: row.get(4)?,
                    url: row.get(5)?,
                    content_text: row.get(6)?,
                    label_names: row.get(7)?,
                    author_username: row.get(8)?,
                    updated_at: row.get(9)?,
                })
            },
        )
        .ok();

    let Some(doc) = doc else {
        return Ok(None);
    };

    // Skip discussion/note documents - we want entities only
    if doc.source_type == "discussion" || doc.source_type == "note" {
        return Ok(None);
    }

    // Resolve the entity table; any other source type is skipped.
    let table = match doc.source_type.as_str() {
        "issue" => "issues",
        "merge_request" => "merge_requests",
        _ => return Ok(None),
    };

    // Get IID and title from the source entity - skip gracefully if not found
    // (this handles orphaned documents where the entity was deleted)
    let entity_info: Option<(i64, String, String)> = conn
        .query_row(
            &format!(
                "SELECT e.iid, e.title, p.path_with_namespace
                 FROM {table} e
                 JOIN projects p ON e.project_id = p.id
                 WHERE e.id = ?1"
            ),
            [doc.source_id],
            |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
        )
        .ok();

    let Some((iid, title, project_path)) = entity_info else {
        // Entity not found in database - skip this result
        return Ok(None);
    };

    // Labels present on both the source and this result.
    let result_labels = parse_label_names(&doc.label_names);
    let shared_labels: Vec<String> = source_labels
        .intersection(&result_labels)
        .cloned()
        .collect();

    Ok(Some(RelatedResult {
        source_type: doc.source_type,
        iid,
        title,
        // Empty string when the document row had no URL.
        url: doc.url.unwrap_or_default(),
        similarity_score: distance_to_similarity(distance),
        project_path,
        shared_labels,
        author: doc.author_username,
        // Millisecond epoch -> ISO-8601; empty string when absent.
        updated_at: doc.updated_at.map(ms_to_iso).unwrap_or_default(),
    }))
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Embedding helper
|
|
// ---------------------------------------------------------------------------
|
|
|
|
async fn embed_text(config: &Config, text: &str) -> Result<Vec<f32>> {
|
|
let ollama = OllamaClient::new(OllamaConfig {
|
|
base_url: config.embedding.base_url.clone(),
|
|
model: config.embedding.model.clone(),
|
|
timeout_secs: 60,
|
|
});
|
|
|
|
let embeddings = ollama.embed_batch(&[text]).await?;
|
|
embeddings
|
|
.into_iter()
|
|
.next()
|
|
.ok_or_else(|| LoreError::EmbeddingFailed {
|
|
document_id: 0,
|
|
reason: "No embedding returned".to_string(),
|
|
})
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Utilities
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Map an L2 distance onto a (0, 1] similarity score.
///
/// Inverse relationship: distance 0 yields 1.0, and larger distances decay
/// toward 0 asymptotically.
fn distance_to_similarity(distance: f64) -> f64 {
    (1.0 + distance).recip()
}
|
|
|
|
fn parse_label_names(label_names_json: &Option<String>) -> HashSet<String> {
|
|
label_names_json
|
|
.as_deref()
|
|
.and_then(|s| serde_json::from_str::<Vec<String>>(s).ok())
|
|
.unwrap_or_default()
|
|
.into_iter()
|
|
.collect()
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Printers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Render a `RelatedResponse` for human consumption on stdout.
///
/// Layout: a bold header (source entity or quoted query), a divider, then a
/// numbered entry per result with a percentage score and a block-character
/// bar, followed by any warnings.
pub fn print_related_human(response: &RelatedResponse) {
    // Header: entity mode shows "type #iid: title"; query mode echoes the
    // query text.
    let header = match &response.source {
        Some(src) => format!("Related to {} #{}: {}", src.source_type, src.iid, src.title),
        None => format!(
            "Related to query: \"{}\"",
            response.query.as_deref().unwrap_or("")
        ),
    };
    println!("{}", Theme::bold().render(&header));
    // Divider capped at 70 chars. NOTE(review): `len()` counts bytes, so a
    // multi-byte header yields an over-long divider — cosmetic only.
    println!("{}", "-".repeat(header.len().min(70)));
    println!();

    if response.results.is_empty() {
        println!("No related entities found.");
        return;
    }

    for (i, result) in response.results.iter().enumerate() {
        // Icon per entity type; unknown types get a blank placeholder.
        let type_icon = match result.source_type.as_str() {
            "issue" => Icons::issue_opened(),
            "merge_request" => Icons::mr_opened(),
            _ => " ",
        };

        // Similarity bar: one full block (U+2588) per 10% of the score.
        let score_bar_len = (result.similarity_score * 10.0) as usize;
        let score_bar: String = "\u{2588}".repeat(score_bar_len);

        println!(
            "{:>2}. {} {} #{} ({:.0}%) {}",
            i + 1,
            type_icon,
            result.source_type,
            result.iid,
            result.similarity_score * 100.0,
            score_bar
        );
        println!("    {}", result.title);
        println!(
            "    {} | @{}",
            result.project_path,
            result.author.as_deref().unwrap_or("?")
        );

        if !result.shared_labels.is_empty() {
            println!("    Labels shared: {}", result.shared_labels.join(", "));
        }
        println!();
    }

    // Warnings
    for warning in &response.warnings {
        println!("{} {}", Theme::warning().render(Icons::warning()), warning);
    }
}
|
|
|
|
pub fn print_related_json(response: &RelatedResponse, elapsed_ms: u64) {
|
|
let meta = RobotMeta::new(elapsed_ms);
|
|
let output = serde_json::json!({
|
|
"ok": true,
|
|
"data": response,
|
|
"meta": meta,
|
|
});
|
|
match serde_json::to_string(&output) {
|
|
Ok(json) => println!("{json}"),
|
|
Err(e) => eprintln!("Error serializing to JSON: {e}"),
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Tests
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_distance_to_similarity_identical() {
        // Zero distance maps to a perfect score.
        let sim = distance_to_similarity(0.0);
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_distance_to_similarity_midpoint() {
        // Distance 1.0 sits exactly halfway.
        let sim = distance_to_similarity(1.0);
        assert!((sim - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn test_distance_to_similarity_large() {
        let sim = distance_to_similarity(2.0);
        assert!(sim > 0.0);
        assert!(sim < 0.5);
        assert!((sim - 1.0 / 3.0).abs() < 0.001);
    }

    #[test]
    fn test_distance_to_similarity_range() {
        // Scores must stay in (0, 1] for any non-negative distance.
        let distances = [0.0, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0];
        for d in distances {
            let sim = distance_to_similarity(d);
            assert!(
                sim > 0.0 && sim <= 1.0,
                "score {sim} out of range for distance {d}"
            );
        }
    }

    #[test]
    fn test_parse_label_names_valid() {
        let json = Some(String::from(r#"["bug", "priority::high"]"#));
        let labels = parse_label_names(&json);
        assert_eq!(labels.len(), 2);
        assert!(labels.contains("bug"));
        assert!(labels.contains("priority::high"));
    }

    #[test]
    fn test_parse_label_names_empty() {
        assert!(parse_label_names(&None).is_empty());
    }

    #[test]
    fn test_parse_label_names_invalid_json() {
        let json = Some("not valid json".to_string());
        assert!(parse_label_names(&json).is_empty());
    }

    #[test]
    fn test_parse_label_names_empty_array() {
        let json = Some("[]".to_string());
        assert!(parse_label_names(&json).is_empty());
    }
}
|