feat(related): add semantic similarity discovery command

Implement `lore related` command for discovering semantically similar entities
using vector embeddings. Supports two modes:

Entity mode:
  lore related issues 42     # Find entities similar to issue #42
  lore related mrs 99        # Find entities similar to MR !99

Query mode:
  lore related "auth bug"    # Find entities matching free text query

Key features:
- Uses existing embedding infrastructure (nomic-embed-text via Ollama)
- Computes shared labels between source and results
- Shows similarity scores as percentage (0-100%)
- Warns when all results have low similarity (<30%)
- Warns for short queries (<=2 words) that may produce noisy results
- Filters out discussion/note documents, returning only issues and MRs
- Handles orphaned documents gracefully (skips if entity deleted)
- Robot mode JSON output with {ok, data, meta} envelope

Implementation details:
- distance_to_similarity() converts L2 distance to 0-1 score: 1/(1+distance)
- Uses saturating_add/saturating_mul for overflow safety on limit parameter
- Proper error handling for missing embeddings ("run lore embed first")
- Project scoping via -p flag with fuzzy matching

CLI integration:
- Added to autocorrect.rs command registry
- Added Related variant to Commands enum in cli/mod.rs
- Wired into main.rs with handle_related()

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-02-26 11:06:01 -05:00
parent 7fdeafa330
commit 8657e10822
5 changed files with 711 additions and 10 deletions

View File

@@ -183,6 +183,7 @@ const COMMAND_FLAGS: &[(&str, &[&str])] = &[
"--max-evidence",
],
),
("related", &["--limit", "--project"]),
(
"who",
&[

View File

@@ -11,6 +11,7 @@ pub mod ingest;
pub mod init;
pub mod list;
pub mod me;
pub mod related;
pub mod search;
pub mod show;
pub mod stats;
@@ -48,6 +49,7 @@ pub use list::{
print_list_notes, print_list_notes_json, query_notes, run_list_issues, run_list_mrs,
};
pub use me::run_me;
pub use related::{RelatedResponse, print_related_human, print_related_json, run_related};
pub use search::{
SearchCliFilters, SearchResponse, print_search_results, print_search_results_json, run_search,
};

637
src/cli/commands/related.rs Normal file
View File

@@ -0,0 +1,637 @@
//! Semantic similarity discovery: find related entities via vector search.
use std::collections::HashSet;
use rusqlite::Connection;
use serde::Serialize;
use crate::cli::render::{Icons, Theme};
use crate::cli::robot::RobotMeta;
use crate::core::config::Config;
use crate::core::db::create_connection;
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::core::time::ms_to_iso;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::search::search_vector;
// ---------------------------------------------------------------------------
// Response types
// ---------------------------------------------------------------------------
#[derive(Debug, Serialize)]
pub struct RelatedResponse {
pub mode: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub source: Option<RelatedSource>,
#[serde(skip_serializing_if = "Option::is_none")]
pub query: Option<String>,
pub results: Vec<RelatedResult>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub warnings: Vec<String>,
}
#[derive(Debug, Serialize)]
pub struct RelatedSource {
pub source_type: String,
pub iid: i64,
pub title: String,
pub project_path: String,
}
#[derive(Debug, Serialize)]
pub struct RelatedResult {
pub source_type: String,
pub iid: i64,
pub title: String,
pub url: String,
pub similarity_score: f64,
pub project_path: String,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub shared_labels: Vec<String>,
pub author: Option<String>,
pub updated_at: String,
}
// ---------------------------------------------------------------------------
// Internal row types
// ---------------------------------------------------------------------------
struct DocumentRow {
id: i64,
source_type: String,
source_id: i64,
#[allow(dead_code)]
project_id: i64,
#[allow(dead_code)]
title: Option<String>,
url: Option<String>,
content_text: String,
label_names: Option<String>,
author_username: Option<String>,
updated_at: Option<i64>,
}
struct EntityInfo {
#[allow(dead_code)]
iid: i64,
title: String,
project_path: String,
}
// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------
/// Run the related command.
///
/// Modes:
/// - Entity mode: `lore related issues 42` or `lore related mrs 99`
/// - Query mode: `lore related 'search terms'`
pub async fn run_related(
config: &Config,
query_or_type: &str,
iid: Option<i64>,
limit: usize,
project: Option<&str>,
) -> Result<RelatedResponse> {
let db_path = get_db_path(config.storage.db_path.as_deref());
let conn = create_connection(&db_path)?;
// Check if embeddings exist
let embedding_count: i64 = conn
.query_row("SELECT COUNT(*) FROM embedding_metadata", [], |row| {
row.get(0)
})
.unwrap_or(0);
if embedding_count == 0 {
return Err(LoreError::Other(
"No embeddings found. Run 'lore embed' first to generate vector embeddings.".into(),
));
}
// Validate input
if query_or_type.trim().is_empty() {
return Err(LoreError::Other(
"Query cannot be empty. Provide an entity type (issues/mrs) and IID, or a search query.".into(),
));
}
// Determine mode: entity vs query
let entity_type = match query_or_type.to_lowercase().as_str() {
"issues" | "issue" | "i" => Some("issue"),
"mrs" | "mr" | "m" | "merge_request" => Some("merge_request"),
_ => None,
};
if let Some(etype) = entity_type {
// Entity mode
let iid = iid.ok_or_else(|| {
LoreError::Other("Entity mode requires an IID (e.g., 'lore related issues 42')".into())
})?;
run_related_entity(&conn, config, etype, iid, limit, project).await
} else {
// Query mode - treat query_or_type as free text
run_related_query(&conn, config, query_or_type, limit, project).await
}
}
async fn run_related_entity(
conn: &Connection,
config: &Config,
entity_type: &str,
iid: i64,
limit: usize,
project_filter: Option<&str>,
) -> Result<RelatedResponse> {
// Find the source document
let source_doc = find_entity_document(conn, entity_type, iid, project_filter)?;
let source_info = get_entity_info(conn, entity_type, source_doc.source_id)?;
// Embed the source content
let embedding = embed_text(config, &source_doc.content_text).await?;
// Search for similar documents (limit + 1 to account for filtering self)
let vector_results = search_vector(conn, &embedding, limit.saturating_add(1))?;
// Filter out self and hydrate results
let source_labels = parse_label_names(&source_doc.label_names);
let mut results = Vec::new();
let mut warnings = Vec::new();
for vr in vector_results {
// Skip self
if vr.document_id == source_doc.id {
continue;
}
if let Some(result) = hydrate_result(conn, vr.document_id, vr.distance, &source_labels)? {
results.push(result);
}
if results.len() >= limit {
break;
}
}
// Check for low similarity
if !results.is_empty() && results.iter().all(|r| r.similarity_score < 0.3) {
warnings.push("No strongly related entities found (all scores < 0.3)".to_string());
}
Ok(RelatedResponse {
mode: "entity".to_string(),
source: Some(RelatedSource {
source_type: entity_type.to_string(),
iid,
title: source_info.title,
project_path: source_info.project_path,
}),
query: None,
results,
warnings,
})
}
async fn run_related_query(
conn: &Connection,
config: &Config,
query: &str,
limit: usize,
project_filter: Option<&str>,
) -> Result<RelatedResponse> {
let mut warnings = Vec::new();
// Warn if query is very short
if query.split_whitespace().count() <= 2 {
warnings.push("Short queries may produce noisy results".to_string());
}
// Embed the query
let embedding = embed_text(config, query).await?;
// Search for similar documents (fetch extra to allow for project filtering)
let vector_results = search_vector(conn, &embedding, limit.saturating_mul(2))?;
// Filter by project if specified and hydrate
let project_id = project_filter
.map(|p| resolve_project(conn, p))
.transpose()?;
let mut results = Vec::new();
let empty_labels: HashSet<String> = HashSet::new();
for vr in vector_results {
// Check project filter
if let Some(pid) = project_id {
let doc_project_id: Option<i64> = conn
.query_row(
"SELECT project_id FROM documents WHERE id = ?1",
[vr.document_id],
|row| row.get(0),
)
.ok();
if doc_project_id != Some(pid) {
continue;
}
}
if let Some(result) = hydrate_result(conn, vr.document_id, vr.distance, &empty_labels)? {
results.push(result);
}
if results.len() >= limit {
break;
}
}
// Check for low similarity
if !results.is_empty() && results.iter().all(|r| r.similarity_score < 0.3) {
warnings.push("No strongly related entities found (all scores < 0.3)".to_string());
}
Ok(RelatedResponse {
mode: "query".to_string(),
source: None,
query: Some(query.to_string()),
results,
warnings,
})
}
// ---------------------------------------------------------------------------
// DB helpers
// ---------------------------------------------------------------------------
fn find_entity_document(
conn: &Connection,
entity_type: &str,
iid: i64,
project_filter: Option<&str>,
) -> Result<DocumentRow> {
let table = match entity_type {
"issue" => "issues",
"merge_request" => "merge_requests",
_ => {
return Err(LoreError::Other(format!(
"Unknown entity type: {entity_type}"
)));
}
};
let (sql, params): (String, Vec<Box<dyn rusqlite::ToSql>>) = match project_filter {
Some(project) => {
let project_id = resolve_project(conn, project)?;
(
format!(
"SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url,
d.content_text, d.label_names, d.author_username, d.updated_at
FROM documents d
JOIN {table} e ON d.source_id = e.id
WHERE d.source_type = ?1 AND e.iid = ?2 AND e.project_id = ?3"
),
vec![
Box::new(entity_type.to_string()),
Box::new(iid),
Box::new(project_id),
],
)
}
None => (
format!(
"SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url,
d.content_text, d.label_names, d.author_username, d.updated_at
FROM documents d
JOIN {table} e ON d.source_id = e.id
WHERE d.source_type = ?1 AND e.iid = ?2"
),
vec![Box::new(entity_type.to_string()), Box::new(iid)],
),
};
let param_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect();
let mut stmt = conn.prepare(&sql)?;
let rows: Vec<DocumentRow> = stmt
.query_map(param_refs.as_slice(), |row| {
Ok(DocumentRow {
id: row.get(0)?,
source_type: row.get(1)?,
source_id: row.get(2)?,
project_id: row.get(3)?,
title: row.get(4)?,
url: row.get(5)?,
content_text: row.get(6)?,
label_names: row.get(7)?,
author_username: row.get(8)?,
updated_at: row.get(9)?,
})
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
match rows.len() {
0 => Err(LoreError::NotFound(format!(
"{entity_type} #{iid} not found (run 'lore sync' first?)"
))),
1 => Ok(rows.into_iter().next().unwrap()),
_ => Err(LoreError::Ambiguous(format!(
"{entity_type} #{iid} exists in multiple projects. Use --project to specify."
))),
}
}
fn get_entity_info(conn: &Connection, entity_type: &str, entity_id: i64) -> Result<EntityInfo> {
let table = match entity_type {
"issue" => "issues",
"merge_request" => "merge_requests",
_ => {
return Err(LoreError::Other(format!(
"Unknown entity type: {entity_type}"
)));
}
};
let sql = format!(
"SELECT e.iid, e.title, p.path_with_namespace
FROM {table} e
JOIN projects p ON e.project_id = p.id
WHERE e.id = ?1"
);
conn.query_row(&sql, [entity_id], |row| {
Ok(EntityInfo {
iid: row.get(0)?,
title: row.get(1)?,
project_path: row.get(2)?,
})
})
.map_err(|e| LoreError::NotFound(format!("Entity not found: {e}")))
}
fn hydrate_result(
conn: &Connection,
document_id: i64,
distance: f64,
source_labels: &HashSet<String>,
) -> Result<Option<RelatedResult>> {
let doc: Option<DocumentRow> = conn
.query_row(
"SELECT d.id, d.source_type, d.source_id, d.project_id, d.title, d.url,
d.content_text, d.label_names, d.author_username, d.updated_at
FROM documents d
WHERE d.id = ?1",
[document_id],
|row| {
Ok(DocumentRow {
id: row.get(0)?,
source_type: row.get(1)?,
source_id: row.get(2)?,
project_id: row.get(3)?,
title: row.get(4)?,
url: row.get(5)?,
content_text: row.get(6)?,
label_names: row.get(7)?,
author_username: row.get(8)?,
updated_at: row.get(9)?,
})
},
)
.ok();
let Some(doc) = doc else {
return Ok(None);
};
// Skip discussion/note documents - we want entities only
if doc.source_type == "discussion" || doc.source_type == "note" {
return Ok(None);
}
// Get IID from the source entity
let table = match doc.source_type.as_str() {
"issue" => "issues",
"merge_request" => "merge_requests",
_ => return Ok(None),
};
// Get IID and title from the source entity - skip gracefully if not found
// (this handles orphaned documents where the entity was deleted)
let entity_info: Option<(i64, String, String)> = conn
.query_row(
&format!(
"SELECT e.iid, e.title, p.path_with_namespace
FROM {table} e
JOIN projects p ON e.project_id = p.id
WHERE e.id = ?1"
),
[doc.source_id],
|row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
)
.ok();
let Some((iid, title, project_path)) = entity_info else {
// Entity not found in database - skip this result
return Ok(None);
};
// Compute shared labels
let result_labels = parse_label_names(&doc.label_names);
let shared_labels: Vec<String> = source_labels
.intersection(&result_labels)
.cloned()
.collect();
Ok(Some(RelatedResult {
source_type: doc.source_type,
iid,
title,
url: doc.url.unwrap_or_default(),
similarity_score: distance_to_similarity(distance),
project_path,
shared_labels,
author: doc.author_username,
updated_at: doc.updated_at.map(ms_to_iso).unwrap_or_default(),
}))
}
// ---------------------------------------------------------------------------
// Embedding helper
// ---------------------------------------------------------------------------
async fn embed_text(config: &Config, text: &str) -> Result<Vec<f32>> {
let ollama = OllamaClient::new(OllamaConfig {
base_url: config.embedding.base_url.clone(),
model: config.embedding.model.clone(),
timeout_secs: 60,
});
let embeddings = ollama.embed_batch(&[text]).await?;
embeddings
.into_iter()
.next()
.ok_or_else(|| LoreError::EmbeddingFailed {
document_id: 0,
reason: "No embedding returned".to_string(),
})
}
// ---------------------------------------------------------------------------
// Utilities
// ---------------------------------------------------------------------------
/// Convert L2 distance to a 0-1 similarity score.
/// Uses inverse relationship: closer (lower distance) = higher similarity.
fn distance_to_similarity(distance: f64) -> f64 {
1.0 / (1.0 + distance)
}
fn parse_label_names(label_names_json: &Option<String>) -> HashSet<String> {
label_names_json
.as_deref()
.and_then(|s| serde_json::from_str::<Vec<String>>(s).ok())
.unwrap_or_default()
.into_iter()
.collect()
}
// ---------------------------------------------------------------------------
// Printers
// ---------------------------------------------------------------------------
pub fn print_related_human(response: &RelatedResponse) {
// Header
let header = match &response.source {
Some(src) => format!("Related to {} #{}: {}", src.source_type, src.iid, src.title),
None => format!(
"Related to query: \"{}\"",
response.query.as_deref().unwrap_or("")
),
};
println!("{}", Theme::bold().render(&header));
println!("{}", "-".repeat(header.len().min(70)));
println!();
if response.results.is_empty() {
println!("No related entities found.");
return;
}
for (i, result) in response.results.iter().enumerate() {
let type_icon = match result.source_type.as_str() {
"issue" => Icons::issue_opened(),
"merge_request" => Icons::mr_opened(),
_ => " ",
};
let score_bar_len = (result.similarity_score * 10.0) as usize;
let score_bar: String = "\u{2588}".repeat(score_bar_len);
println!(
"{:>2}. {} {} #{} ({:.0}%) {}",
i + 1,
type_icon,
result.source_type,
result.iid,
result.similarity_score * 100.0,
score_bar
);
println!(" {}", result.title);
println!(
" {} | @{}",
result.project_path,
result.author.as_deref().unwrap_or("?")
);
if !result.shared_labels.is_empty() {
println!(" Labels shared: {}", result.shared_labels.join(", "));
}
println!();
}
// Warnings
for warning in &response.warnings {
println!("{} {}", Theme::warning().render(Icons::warning()), warning);
}
}
pub fn print_related_json(response: &RelatedResponse, elapsed_ms: u64) {
let meta = RobotMeta { elapsed_ms };
let output = serde_json::json!({
"ok": true,
"data": response,
"meta": meta,
});
match serde_json::to_string(&output) {
Ok(json) => println!("{json}"),
Err(e) => eprintln!("Error serializing to JSON: {e}"),
}
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_distance_to_similarity_identical() {
assert!((distance_to_similarity(0.0) - 1.0).abs() < f64::EPSILON);
}
#[test]
fn test_distance_to_similarity_midpoint() {
assert!((distance_to_similarity(1.0) - 0.5).abs() < f64::EPSILON);
}
#[test]
fn test_distance_to_similarity_large() {
let sim = distance_to_similarity(2.0);
assert!(sim > 0.0 && sim < 0.5);
assert!((sim - 0.333_333_333_333_333_3).abs() < 0.001);
}
#[test]
fn test_distance_to_similarity_range() {
for d in [0.0, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0] {
let sim = distance_to_similarity(d);
assert!(
sim > 0.0 && sim <= 1.0,
"score {sim} out of range for distance {d}"
);
}
}
#[test]
fn test_parse_label_names_valid() {
let json = Some(r#"["bug", "priority::high"]"#.to_string());
let labels = parse_label_names(&json);
assert!(labels.contains("bug"));
assert!(labels.contains("priority::high"));
assert_eq!(labels.len(), 2);
}
#[test]
fn test_parse_label_names_empty() {
let labels = parse_label_names(&None);
assert!(labels.is_empty());
}
#[test]
fn test_parse_label_names_invalid_json() {
let json = Some("not valid json".to_string());
let labels = parse_label_names(&json);
assert!(labels.is_empty());
}
#[test]
fn test_parse_label_names_empty_array() {
let json = Some("[]".to_string());
let labels = parse_label_names(&json);
assert!(labels.is_empty());
}
}

View File

@@ -293,6 +293,28 @@ pub enum Commands {
project: Option<String>,
},
/// Find semantically related entities via vector search
#[command(after_help = "\x1b[1mExamples:\x1b[0m
lore related issues 42 # Find entities related to issue #42
lore related mrs 99 -p group/repo # Related to MR #99 in specific project
lore related 'authentication flow' # Find entities matching free text query
lore --robot related issues 42 -n 5 # JSON output, limit 5 results")]
Related {
/// Entity type (issues, mrs) or free text query
query_or_type: String,
/// Entity IID (required when first arg is entity type)
iid: Option<i64>,
/// Maximum results
#[arg(short = 'n', long, default_value = "10")]
limit: usize,
/// Scope to project (fuzzy match)
#[arg(short, long)]
project: Option<String>,
},
/// Manage cron-based automatic syncing
#[command(after_help = "\x1b[1mExamples:\x1b[0m
lore cron install # Install cron job (every 8 minutes)

View File

@@ -18,15 +18,16 @@ use lore::cli::commands::{
print_event_count, print_event_count_json, print_file_history, print_file_history_json,
print_generate_docs, print_generate_docs_json, print_ingest_summary, print_ingest_summary_json,
print_list_issues, print_list_issues_json, print_list_mrs, print_list_mrs_json,
print_list_notes, print_list_notes_json, print_search_results, print_search_results_json,
print_show_issue, print_show_issue_json, print_show_mr, print_show_mr_json, print_stats,
print_stats_json, print_sync, print_sync_json, print_sync_status, print_sync_status_json,
print_timeline, print_timeline_json_with_meta, print_trace, print_trace_json, print_who_human,
print_who_json, query_notes, run_auth_test, run_count, run_count_events, run_cron_install,
run_cron_status, run_cron_uninstall, run_doctor, run_drift, run_embed, run_file_history,
run_generate_docs, run_ingest, run_ingest_dry_run, run_init, run_list_issues, run_list_mrs,
run_me, run_search, run_show_issue, run_show_mr, run_stats, run_sync, run_sync_status,
run_timeline, run_token_set, run_token_show, run_who,
print_list_notes, print_list_notes_json, print_related_human, print_related_json,
print_search_results, print_search_results_json, print_show_issue, print_show_issue_json,
print_show_mr, print_show_mr_json, print_stats, print_stats_json, print_sync, print_sync_json,
print_sync_status, print_sync_status_json, print_timeline, print_timeline_json_with_meta,
print_trace, print_trace_json, print_who_human, print_who_json, query_notes, run_auth_test,
run_count, run_count_events, run_cron_install, run_cron_status, run_cron_uninstall, run_doctor,
run_drift, run_embed, run_file_history, run_generate_docs, run_ingest, run_ingest_dry_run,
run_init, run_list_issues, run_list_mrs, run_me, run_related, run_search, run_show_issue,
run_show_mr, run_stats, run_sync, run_sync_status, run_timeline, run_token_set, run_token_show,
run_who,
};
use lore::cli::render::{ColorMode, GlyphMode, Icons, LoreRenderer, Theme};
use lore::cli::robot::{RobotMeta, strip_schemas};
@@ -225,6 +226,22 @@ async fn main() {
)
.await
}
Some(Commands::Related {
query_or_type,
iid,
limit,
project,
}) => {
handle_related(
cli.config.as_deref(),
&query_or_type,
iid,
limit,
project.as_deref(),
robot_mode,
)
.await
}
Some(Commands::Stats(args)) => handle_stats(cli.config.as_deref(), args, robot_mode).await,
Some(Commands::Embed(args)) => handle_embed(cli.config.as_deref(), args, robot_mode).await,
Some(Commands::Sync(args)) => {
@@ -1996,7 +2013,7 @@ async fn handle_timeline(
if robot_mode {
print_timeline_json_with_meta(
&result,
result.total_events_before_limit,
result.total_filtered_events,
params.depth,
!params.no_mentions,
args.fields.as_deref(),
@@ -3256,6 +3273,28 @@ async fn handle_drift(
Ok(())
}
async fn handle_related(
config_override: Option<&str>,
query_or_type: &str,
iid: Option<i64>,
limit: usize,
project: Option<&str>,
robot_mode: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let start = std::time::Instant::now();
let config = Config::load(config_override)?;
let effective_project = config.effective_project(project);
let response = run_related(&config, query_or_type, iid, limit, effective_project).await?;
let elapsed_ms = start.elapsed().as_millis() as u64;
if robot_mode {
print_related_json(&response, elapsed_ms);
} else {
print_related_human(&response);
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
async fn handle_list_compat(
config_override: Option<&str>,