//! Semantic similarity discovery: find related entities via vector search. use std::collections::HashSet; use rusqlite::Connection; use serde::Serialize; use crate::cli::render::{Icons, Theme}; use crate::cli::robot::RobotMeta; use crate::core::config::Config; use crate::core::db::create_connection; use crate::core::error::{LoreError, Result}; use crate::core::paths::get_db_path; use crate::core::project::resolve_project; use crate::core::time::ms_to_iso; use crate::embedding::ollama::{OllamaClient, OllamaConfig}; use crate::search::search_vector; // --------------------------------------------------------------------------- // Response types // --------------------------------------------------------------------------- #[derive(Debug, Serialize)] pub struct RelatedResponse { pub mode: String, #[serde(skip_serializing_if = "Option::is_none")] pub source: Option, #[serde(skip_serializing_if = "Option::is_none")] pub query: Option, pub results: Vec, #[serde(skip_serializing_if = "Vec::is_empty")] pub warnings: Vec, } #[derive(Debug, Serialize)] pub struct RelatedSource { pub source_type: String, pub iid: i64, pub title: String, pub project_path: String, } #[derive(Debug, Serialize)] pub struct RelatedResult { pub source_type: String, pub iid: i64, pub title: String, pub url: String, pub similarity_score: f64, pub project_path: String, #[serde(skip_serializing_if = "Vec::is_empty")] pub shared_labels: Vec, pub author: Option, pub updated_at: String, } // --------------------------------------------------------------------------- // Internal row types // --------------------------------------------------------------------------- struct DocumentRow { id: i64, source_type: String, source_id: i64, #[allow(dead_code)] project_id: i64, #[allow(dead_code)] title: Option, url: Option, content_text: String, label_names: Option, author_username: Option, updated_at: Option, } struct EntityInfo { #[allow(dead_code)] iid: i64, title: String, project_path: String, } // --------------------------------------------------------------------------- // Main entry point // --------------------------------------------------------------------------- /// Run the related command. /// /// Modes: /// - Entity mode: `lore related issues 42` or `lore related mrs 99` /// - Query mode: `lore related 'search terms'` pub async fn run_related( config: &Config, query_or_type: &str, iid: Option, limit: usize, project: Option<&str>, ) -> Result { let db_path = get_db_path(config.storage.db_path.as_deref()); let conn = create_connection(&db_path)?; // Check if embeddings exist let embedding_count: i64 = conn .query_row("SELECT COUNT(*) FROM embedding_metadata", [], |row| { row.get(0) }) .unwrap_or(0); if embedding_count == 0 { return Err(LoreError::Other( "No embeddings found. Run 'lore embed' first to generate vector embeddings.".into(), )); } // Validate input if query_or_type.trim().is_empty() { return Err(LoreError::Other( "Query cannot be empty. Provide an entity type (issues/mrs) and IID, or a search query.".into(), )); } // Determine mode: entity vs query let entity_type = match query_or_type.to_lowercase().as_str() { "issues" | "issue" | "i" => Some("issue"), "mrs" | "mr" | "m" | "merge_request" => Some("merge_request"), _ => None, }; if let Some(etype) = entity_type { // Entity mode let iid = iid.ok_or_else(|| { LoreError::Other("Entity mode requires an IID (e.g., 'lore related issues 42')".into()) })?; run_related_entity(&conn, config, etype, iid, limit, project).await } else { // Query mode - treat query_or_type as free text run_related_query(&conn, config, query_or_type, limit, project).await } } async fn run_related_entity( conn: &Connection, config: &Config, entity_type: &str, iid: i64, limit: usize, project_filter: Option<&str>, ) -> Result { // Find the source document let source_doc = find_entity_document(conn, entity_type, iid, project_filter)?; let source_info = get_entity_info(conn, entity_type, source_doc.source_id)?; // Embed the source content let embedding = embed_text(config, &source_doc.content_text).await?; // Search for similar documents (limit + 1 to account for filtering self) let vector_results = search_vector(conn, &embedding, limit.saturating_add(1))?; // Filter out self and hydrate results let source_labels = parse_label_names(&source_doc.label_names); let mut results = Vec::new(); let mut warnings = Vec::new(); for vr in vector_results { // Skip self if vr.document_id == source_doc.id { continue; } if let Some(result) = hydrate_result(conn, vr.document_id, vr.distance, &source_labels)? { results.push(result); } if results.len() >= limit { break; } } // Check for low similarity if !results.is_empty() && results.iter().all(|r| r.similarity_score < 0.3) { warnings.push("No strongly related entities found (all scores < 0.3)".to_string()); } Ok(RelatedResponse { mode: "entity".to_string(), source: Some(RelatedSource { source_type: entity_type.to_string(), iid, title: source_info.title, project_path: source_info.project_path, }), query: None, results, warnings, }) } async fn run_related_query( conn: &Connection, config: &Config, query: &str, limit: usize, project_filter: Option<&str>, ) -> Result { let mut warnings = Vec::new(); // Warn if query is very short if query.split_whitespace().count() <= 2 { warnings.push("Short queries may produce noisy results".to_string()); } // Embed the query let embedding = embed_text(config, query).await?; // Search for similar documents (fetch extra to allow for project filtering) let vector_results = search_vector(conn, &embedding, limit.saturating_mul(2))?; // Filter by project if specified and hydrate let project_id = project_filter .map(|p| resolve_project(conn, p)) .transpose()?; let mut results = Vec::new(); let empty_labels: HashSet = HashSet::new(); for vr in vector_results { // Check project filter if let Some(pid) = project_id { let doc_project_id: Option = conn .query_row( "SELECT project_id FROM documents WHERE id = ?1", [vr.document_id], |row| row.get(0), ) .ok(); if doc_project_id != Some(pid) { continue; } } if let Some(result) = hydrate_result(conn, vr.document_id, vr.distance, &empty_labels)? { results.push(result); } if results.len() >= limit { break; } } // Check for low similarity if !results.is_empty() && results.iter().all(|r| r.similarity_score < 0.3) { warnings.push("No strongly related entities found (all scores < 0.3)".to_string()); } Ok(RelatedResponse { mode: "query".to_string(), source: None, query: Some(query.to_string()), results, warnings, }) } // --------------------------------------------------------------------------- // DB helpers // --------------------------------------------------------------------------- fn find_entity_document( conn: &Connection, entity_type: &str, iid: i64, project_filter: Option<&str>, ) -> Result { let table = match entity_type { "issue" => "issues", "merge_request" => "merge_requests", _ => { return Err(LoreError::Other(format!( "Unknown entity type: {entity_type}" ))); } }; let (sql, params): (String, Vec>) = match project_filter { Some(project) => { let project_id = resolve_project(conn, project)?; ( format!( "SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url, d.content_text, d.label_names, d.author_username, d.updated_at FROM documents d JOIN {table} e ON d.source_id = e.id WHERE d.source_type = ?1 AND e.iid = ?2 AND e.project_id = ?3" ), vec![ Box::new(entity_type.to_string()), Box::new(iid), Box::new(project_id), ], ) } None => ( format!( "SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url, d.content_text, d.label_names, d.author_username, d.updated_at FROM documents d JOIN {table} e ON d.source_id = e.id WHERE d.source_type = ?1 AND e.iid = ?2" ), vec![Box::new(entity_type.to_string()), Box::new(iid)], ), }; let param_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect(); let mut stmt = conn.prepare(&sql)?; let rows: Vec = stmt .query_map(param_refs.as_slice(), |row| { Ok(DocumentRow { id: row.get(0)?, source_type: row.get(1)?, source_id: row.get(2)?, project_id: row.get(3)?, title: row.get(4)?, url: row.get(5)?, content_text: row.get(6)?, label_names: row.get(7)?, author_username: row.get(8)?, updated_at: row.get(9)?, }) })? .collect::, _>>()?; match rows.len() { 0 => Err(LoreError::NotFound(format!( "{entity_type} #{iid} not found (run 'lore sync' first?)" ))), 1 => Ok(rows.into_iter().next().unwrap()), _ => Err(LoreError::Ambiguous(format!( "{entity_type} #{iid} exists in multiple projects. Use --project to specify." ))), } } fn get_entity_info(conn: &Connection, entity_type: &str, entity_id: i64) -> Result { let table = match entity_type { "issue" => "issues", "merge_request" => "merge_requests", _ => { return Err(LoreError::Other(format!( "Unknown entity type: {entity_type}" ))); } }; let sql = format!( "SELECT e.iid, e.title, p.path_with_namespace FROM {table} e JOIN projects p ON e.project_id = p.id WHERE e.id = ?1" ); conn.query_row(&sql, [entity_id], |row| { Ok(EntityInfo { iid: row.get(0)?, title: row.get(1)?, project_path: row.get(2)?, }) }) .map_err(|e| LoreError::NotFound(format!("Entity not found: {e}"))) } fn hydrate_result( conn: &Connection, document_id: i64, distance: f64, source_labels: &HashSet, ) -> Result> { let doc: Option = conn .query_row( "SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url, d.content_text, d.label_names, d.author_username, d.updated_at FROM documents d WHERE d.id = ?1", [document_id], |row| { Ok(DocumentRow { id: row.get(0)?, source_type: row.get(1)?, source_id: row.get(2)?, project_id: row.get(3)?, title: row.get(4)?, url: row.get(5)?, content_text: row.get(6)?, label_names: row.get(7)?, author_username: row.get(8)?, updated_at: row.get(9)?, }) }, ) .ok(); let Some(doc) = doc else { return Ok(None); }; // Skip discussion/note documents - we want entities only if doc.source_type == "discussion" || doc.source_type == "note" { return Ok(None); } // Get IID from the source entity let table = match doc.source_type.as_str() { "issue" => "issues", "merge_request" => "merge_requests", _ => return Ok(None), }; // Get IID and title from the source entity - skip gracefully if not found // (this handles orphaned documents where the entity was deleted) let entity_info: Option<(i64, String, String)> = conn .query_row( &format!( "SELECT e.iid, e.title, p.path_with_namespace FROM {table} e JOIN projects p ON e.project_id = p.id WHERE e.id = ?1" ), [doc.source_id], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), ) .ok(); let Some((iid, title, project_path)) = entity_info else { // Entity not found in database - skip this result return Ok(None); }; // Compute shared labels let result_labels = parse_label_names(&doc.label_names); let shared_labels: Vec = source_labels .intersection(&result_labels) .cloned() .collect(); Ok(Some(RelatedResult { source_type: doc.source_type, iid, title, url: doc.url.unwrap_or_default(), similarity_score: distance_to_similarity(distance), project_path, shared_labels, author: doc.author_username, updated_at: doc.updated_at.map(ms_to_iso).unwrap_or_default(), })) } // --------------------------------------------------------------------------- // Embedding helper // --------------------------------------------------------------------------- async fn embed_text(config: &Config, text: &str) -> Result> { let ollama = OllamaClient::new(OllamaConfig { base_url: config.embedding.base_url.clone(), model: config.embedding.model.clone(), timeout_secs: 60, }); let embeddings = ollama.embed_batch(&[text]).await?; embeddings .into_iter() .next() .ok_or_else(|| LoreError::EmbeddingFailed { document_id: 0, reason: "No embedding returned".to_string(), }) } // --------------------------------------------------------------------------- // Utilities // --------------------------------------------------------------------------- /// Convert L2 distance to a 0-1 similarity score. /// Uses inverse relationship: closer (lower distance) = higher similarity. fn distance_to_similarity(distance: f64) -> f64 { 1.0 / (1.0 + distance) } fn parse_label_names(label_names_json: &Option) -> HashSet { label_names_json .as_deref() .and_then(|s| serde_json::from_str::>(s).ok()) .unwrap_or_default() .into_iter() .collect() } // --------------------------------------------------------------------------- // Printers // --------------------------------------------------------------------------- pub fn print_related_human(response: &RelatedResponse) { // Header let header = match &response.source { Some(src) => format!("Related to {} #{}: {}", src.source_type, src.iid, src.title), None => format!( "Related to query: \"{}\"", response.query.as_deref().unwrap_or("") ), }; println!("{}", Theme::bold().render(&header)); println!("{}", "-".repeat(header.len().min(70))); println!(); if response.results.is_empty() { println!("No related entities found."); return; } for (i, result) in response.results.iter().enumerate() { let type_icon = match result.source_type.as_str() { "issue" => Icons::issue_opened(), "merge_request" => Icons::mr_opened(), _ => " ", }; let score_bar_len = (result.similarity_score * 10.0) as usize; let score_bar: String = "\u{2588}".repeat(score_bar_len); println!( "{:>2}. {} {} #{} ({:.0}%) {}", i + 1, type_icon, result.source_type, result.iid, result.similarity_score * 100.0, score_bar ); println!(" {}", result.title); println!( " {} | @{}", result.project_path, result.author.as_deref().unwrap_or("?") ); if !result.shared_labels.is_empty() { println!(" Labels shared: {}", result.shared_labels.join(", ")); } println!(); } // Warnings for warning in &response.warnings { println!("{} {}", Theme::warning().render(Icons::warning()), warning); } } pub fn print_related_json(response: &RelatedResponse, elapsed_ms: u64) { let meta = RobotMeta { elapsed_ms }; let output = serde_json::json!({ "ok": true, "data": response, "meta": meta, }); match serde_json::to_string(&output) { Ok(json) => println!("{json}"), Err(e) => eprintln!("Error serializing to JSON: {e}"), } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; #[test] fn test_distance_to_similarity_identical() { assert!((distance_to_similarity(0.0) - 1.0).abs() < f64::EPSILON); } #[test] fn test_distance_to_similarity_midpoint() { assert!((distance_to_similarity(1.0) - 0.5).abs() < f64::EPSILON); } #[test] fn test_distance_to_similarity_large() { let sim = distance_to_similarity(2.0); assert!(sim > 0.0 && sim < 0.5); assert!((sim - 0.333_333_333_333_333_3).abs() < 0.001); } #[test] fn test_distance_to_similarity_range() { for d in [0.0, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0] { let sim = distance_to_similarity(d); assert!( sim > 0.0 && sim <= 1.0, "score {sim} out of range for distance {d}" ); } } #[test] fn test_parse_label_names_valid() { let json = Some(r#"["bug", "priority::high"]"#.to_string()); let labels = parse_label_names(&json); assert!(labels.contains("bug")); assert!(labels.contains("priority::high")); assert_eq!(labels.len(), 2); } #[test] fn test_parse_label_names_empty() { let labels = parse_label_names(&None); assert!(labels.is_empty()); } #[test] fn test_parse_label_names_invalid_json() { let json = Some("not valid json".to_string()); let labels = parse_label_names(&json); assert!(labels.is_empty()); } #[test] fn test_parse_label_names_empty_array() { let json = Some("[]".to_string()); let labels = parse_label_names(&json); assert!(labels.is_empty()); } }