feat(bd-1cjx): add lore drift command for discussion divergence detection
Implement drift detection using cosine similarity between the issue-description embedding and chronological note embeddings. A sliding window (size 3) identifies topic-drift points. Includes human and robot output formatters. New files: drift.rs, similarity.rs. Closes: bd-1cjx
This commit is contained in:
642
src/cli/commands/drift.rs
Normal file
642
src/cli/commands/drift.rs
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::LazyLock;
|
||||||
|
|
||||||
|
use console::style;
|
||||||
|
use regex::Regex;
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
use crate::cli::robot::RobotMeta;
|
||||||
|
use crate::core::config::Config;
|
||||||
|
use crate::core::db::create_connection;
|
||||||
|
use crate::core::error::{LoreError, Result};
|
||||||
|
use crate::core::paths::get_db_path;
|
||||||
|
use crate::core::project::resolve_project;
|
||||||
|
use crate::core::time::ms_to_iso;
|
||||||
|
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
|
||||||
|
use crate::embedding::similarity::cosine_similarity;
|
||||||
|
|
||||||
|
/// Number of texts sent to the embedding backend per request.
const BATCH_SIZE: usize = 32;
/// Width (in notes) of the sliding window used for drift detection.
const WINDOW_SIZE: usize = 3;
/// Minimum description length (bytes) considered meaningful to embed.
const MIN_DESCRIPTION_LEN: usize = 20;
/// Upper bound on notes fetched per issue.
const MAX_NOTES: i64 = 200;
/// Number of drift topics reported to the user.
const TOP_TOPICS: usize = 3;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Response types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Full result of a drift analysis for one entity.
#[derive(Debug, Serialize)]
pub struct DriftResponse {
    pub entity: DriftEntity,
    /// True when some sliding window's mean similarity fell below `threshold`.
    pub drift_detected: bool,
    pub threshold: f32,
    /// First note of the window that triggered detection; `None` when no drift.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub drift_point: Option<DriftPoint>,
    /// Most frequent post-drift terms that are absent from the description.
    pub drift_topics: Vec<String>,
    /// Per-note similarity to the description, in chronological note order.
    pub similarity_curve: Vec<SimilarityPoint>,
    /// Human-readable summary of the verdict.
    pub recommendation: String,
}

/// The entity the analysis ran against.
#[derive(Debug, Serialize)]
pub struct DriftEntity {
    pub entity_type: String,
    pub iid: i64,
    pub title: String,
}

/// Location where the discussion first diverged.
#[derive(Debug, Serialize)]
pub struct DriftPoint {
    /// Zero-based index into the filtered, chronological note list.
    pub note_index: usize,
    pub note_id: i64,
    pub author: String,
    pub created_at: String,
    /// Similarity of this individual note to the description. Detection uses
    /// the window mean, so this value can itself be above the threshold.
    pub similarity: f32,
}

/// One point on the similarity curve.
#[derive(Debug, Serialize)]
pub struct SimilarityPoint {
    pub note_index: usize,
    pub similarity: f32,
    pub author: String,
    pub created_at: String,
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Internal row types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Minimal issue projection needed for drift analysis (see `find_issue`).
struct IssueInfo {
    id: i64,
    iid: i64,
    title: String,
    description: Option<String>,
}

/// One discussion note as read from the DB.
/// `created_at` is epoch milliseconds (converted for display via `ms_to_iso`).
struct NoteRow {
    id: i64,
    body: String,
    author_username: String,
    created_at: i64,
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Main entry point
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
pub async fn run_drift(
|
||||||
|
config: &Config,
|
||||||
|
entity_type: &str,
|
||||||
|
iid: i64,
|
||||||
|
threshold: f32,
|
||||||
|
project: Option<&str>,
|
||||||
|
) -> Result<DriftResponse> {
|
||||||
|
if entity_type != "issues" {
|
||||||
|
return Err(LoreError::Other(
|
||||||
|
"drift currently supports 'issues' only".to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let db_path = get_db_path(config.storage.db_path.as_deref());
|
||||||
|
let conn = create_connection(&db_path)?;
|
||||||
|
|
||||||
|
let issue = find_issue(&conn, iid, project)?;
|
||||||
|
|
||||||
|
let description = match &issue.description {
|
||||||
|
Some(d) if d.len() >= MIN_DESCRIPTION_LEN => d.clone(),
|
||||||
|
_ => {
|
||||||
|
return Ok(DriftResponse {
|
||||||
|
entity: DriftEntity {
|
||||||
|
entity_type: entity_type.to_string(),
|
||||||
|
iid: issue.iid,
|
||||||
|
title: issue.title,
|
||||||
|
},
|
||||||
|
drift_detected: false,
|
||||||
|
threshold,
|
||||||
|
drift_point: None,
|
||||||
|
drift_topics: vec![],
|
||||||
|
similarity_curve: vec![],
|
||||||
|
recommendation: "Description too short for drift analysis.".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let notes = fetch_notes(&conn, issue.id)?;
|
||||||
|
|
||||||
|
if notes.len() < WINDOW_SIZE {
|
||||||
|
return Ok(DriftResponse {
|
||||||
|
entity: DriftEntity {
|
||||||
|
entity_type: entity_type.to_string(),
|
||||||
|
iid: issue.iid,
|
||||||
|
title: issue.title,
|
||||||
|
},
|
||||||
|
drift_detected: false,
|
||||||
|
threshold,
|
||||||
|
drift_point: None,
|
||||||
|
drift_topics: vec![],
|
||||||
|
similarity_curve: vec![],
|
||||||
|
recommendation: format!(
|
||||||
|
"Only {} note(s) found; need at least {} for drift detection.",
|
||||||
|
notes.len(),
|
||||||
|
WINDOW_SIZE
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build texts to embed: description first, then each note body.
|
||||||
|
let mut texts: Vec<String> = Vec::with_capacity(1 + notes.len());
|
||||||
|
texts.push(description.clone());
|
||||||
|
for note in ¬es {
|
||||||
|
texts.push(note.body.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let embeddings = embed_texts(config, &texts).await?;
|
||||||
|
|
||||||
|
let desc_embedding = &embeddings[0];
|
||||||
|
let note_embeddings = &embeddings[1..];
|
||||||
|
|
||||||
|
// Build similarity curve.
|
||||||
|
let similarity_curve: Vec<SimilarityPoint> = note_embeddings
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, emb)| SimilarityPoint {
|
||||||
|
note_index: i,
|
||||||
|
similarity: cosine_similarity(desc_embedding, emb),
|
||||||
|
author: notes[i].author_username.clone(),
|
||||||
|
created_at: ms_to_iso(notes[i].created_at),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Detect drift via sliding window.
|
||||||
|
let (drift_detected, drift_point) = detect_drift(&similarity_curve, ¬es, threshold);
|
||||||
|
|
||||||
|
// Extract drift topics.
|
||||||
|
let drift_topics = if drift_detected {
|
||||||
|
let drift_idx = drift_point.as_ref().map_or(0, |dp| dp.note_index);
|
||||||
|
extract_drift_topics(&description, ¬es, drift_idx)
|
||||||
|
} else {
|
||||||
|
vec![]
|
||||||
|
};
|
||||||
|
|
||||||
|
let recommendation = if drift_detected {
|
||||||
|
let dp = drift_point.as_ref().unwrap();
|
||||||
|
format!(
|
||||||
|
"Discussion drifted at note {} by @{} (similarity {:.2}). Consider splitting into a new issue.",
|
||||||
|
dp.note_index, dp.author, dp.similarity
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
"Discussion remains on topic.".to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(DriftResponse {
|
||||||
|
entity: DriftEntity {
|
||||||
|
entity_type: entity_type.to_string(),
|
||||||
|
iid: issue.iid,
|
||||||
|
title: issue.title,
|
||||||
|
},
|
||||||
|
drift_detected,
|
||||||
|
threshold,
|
||||||
|
drift_point,
|
||||||
|
drift_topics,
|
||||||
|
similarity_curve,
|
||||||
|
recommendation,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// DB helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Looks up a single issue by IID, optionally scoped to one project.
///
/// Returns `LoreError::NotFound` when no issue matches and
/// `LoreError::Ambiguous` when the same IID exists in multiple projects and
/// no project filter was supplied.
fn find_issue(
    conn: &rusqlite::Connection,
    iid: i64,
    project_filter: Option<&str>,
) -> Result<IssueInfo> {
    // Build the SQL and its parameter list together; parameters are boxed so
    // the two arms can carry different arities behind a single type.
    let (sql, params): (&str, Vec<Box<dyn rusqlite::ToSql>>) = match project_filter {
        Some(project) => {
            let project_id = resolve_project(conn, project)?;
            (
                "SELECT i.id, i.iid, i.title, i.description
                 FROM issues i
                 WHERE i.iid = ? AND i.project_id = ?",
                vec![Box::new(iid), Box::new(project_id)],
            )
        }
        None => (
            "SELECT i.id, i.iid, i.title, i.description
             FROM issues i
             WHERE i.iid = ?",
            vec![Box::new(iid)],
        ),
    };

    let param_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect();

    let mut stmt = conn.prepare(sql)?;
    let rows: Vec<IssueInfo> = stmt
        .query_map(param_refs.as_slice(), |row| {
            Ok(IssueInfo {
                id: row.get(0)?,
                iid: row.get(1)?,
                title: row.get(2)?,
                description: row.get(3)?,
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    // 0 rows -> not found; 1 -> unique hit; >1 -> the IID exists in several
    // projects and the caller must narrow with --project.
    match rows.len() {
        0 => Err(LoreError::NotFound(format!("Issue #{iid} not found"))),
        1 => Ok(rows.into_iter().next().unwrap()),
        _ => Err(LoreError::Ambiguous(format!(
            "Issue #{iid} exists in multiple projects. Use --project to specify."
        ))),
    }
}
|
||||||
|
|
||||||
|
/// Loads the human-authored discussion notes for an issue in chronological
/// order. System notes and very short bodies (< 20 bytes) are filtered out as
/// noise, and the result is capped at `MAX_NOTES` rows.
fn fetch_notes(conn: &rusqlite::Connection, issue_id: i64) -> Result<Vec<NoteRow>> {
    let mut stmt = conn.prepare(
        "SELECT n.id, n.body, n.author_username, n.created_at
         FROM notes n
         JOIN discussions d ON n.discussion_id = d.id
         WHERE d.issue_id = ?
           AND n.is_system = 0
           AND LENGTH(n.body) >= 20
         ORDER BY n.created_at ASC
         LIMIT ?",
    )?;

    let notes: Vec<NoteRow> = stmt
        .query_map(rusqlite::params![issue_id, MAX_NOTES], |row| {
            Ok(NoteRow {
                id: row.get(0)?,
                body: row.get(1)?,
                author_username: row.get(2)?,
                created_at: row.get(3)?,
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(notes)
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Embedding helper
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async fn embed_texts(config: &Config, texts: &[String]) -> Result<Vec<Vec<f32>>> {
|
||||||
|
let ollama = OllamaClient::new(OllamaConfig {
|
||||||
|
base_url: config.embedding.base_url.clone(),
|
||||||
|
model: config.embedding.model.clone(),
|
||||||
|
timeout_secs: 60,
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut all_embeddings: Vec<Vec<f32>> = Vec::with_capacity(texts.len());
|
||||||
|
|
||||||
|
for chunk in texts.chunks(BATCH_SIZE) {
|
||||||
|
let refs: Vec<&str> = chunk.iter().map(|s| s.as_str()).collect();
|
||||||
|
let batch_result = ollama.embed_batch(&refs).await?;
|
||||||
|
all_embeddings.extend(batch_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(all_embeddings)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Drift detection
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn detect_drift(
|
||||||
|
curve: &[SimilarityPoint],
|
||||||
|
notes: &[NoteRow],
|
||||||
|
threshold: f32,
|
||||||
|
) -> (bool, Option<DriftPoint>) {
|
||||||
|
if curve.len() < WINDOW_SIZE {
|
||||||
|
return (false, None);
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..=curve.len() - WINDOW_SIZE {
|
||||||
|
let window_avg: f32 = curve[i..i + WINDOW_SIZE]
|
||||||
|
.iter()
|
||||||
|
.map(|p| p.similarity)
|
||||||
|
.sum::<f32>()
|
||||||
|
/ WINDOW_SIZE as f32;
|
||||||
|
|
||||||
|
if window_avg < threshold {
|
||||||
|
return (
|
||||||
|
true,
|
||||||
|
Some(DriftPoint {
|
||||||
|
note_index: i,
|
||||||
|
note_id: notes[i].id,
|
||||||
|
author: notes[i].author_username.clone(),
|
||||||
|
created_at: ms_to_iso(notes[i].created_at),
|
||||||
|
similarity: curve[i].similarity,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(false, None)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Topic extraction
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// English stopwords excluded from topic extraction, extended with a few
/// generic discussion verbs/nouns ("get", "think", "new", ...) that carry no
/// topical signal in issue threads.
static STOPWORDS: LazyLock<std::collections::HashSet<&'static str>> = LazyLock::new(|| {
    [
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
        "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can",
        "need", "dare", "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
        "as", "into", "through", "during", "before", "after", "above", "below", "between", "out",
        "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
        "where", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other",
        "some", "such", "no", "not", "only", "own", "same", "so", "than", "too", "very", "just",
        "because", "but", "and", "or", "if", "while", "about", "up", "it", "its", "this", "that",
        "these", "those", "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she",
        "her", "they", "them", "their", "what", "which", "who", "whom", "also", "like", "get",
        "got", "think", "know", "see", "make", "go", "one", "two", "new", "way",
    ]
    .into_iter()
    .collect()
});
|
||||||
|
|
||||||
|
fn tokenize(text: &str) -> Vec<String> {
|
||||||
|
let cleaned = strip_markdown(text);
|
||||||
|
cleaned
|
||||||
|
.split(|c: char| !c.is_alphanumeric() && c != '_')
|
||||||
|
.filter(|w| w.len() >= 3)
|
||||||
|
.map(|w| w.to_lowercase())
|
||||||
|
.filter(|w| !STOPWORDS.contains(w.as_str()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_drift_topics(description: &str, notes: &[NoteRow], drift_idx: usize) -> Vec<String> {
|
||||||
|
let desc_terms: std::collections::HashSet<String> = tokenize(description).into_iter().collect();
|
||||||
|
|
||||||
|
let mut freq: HashMap<String, usize> = HashMap::new();
|
||||||
|
for note in notes.iter().skip(drift_idx) {
|
||||||
|
for term in tokenize(¬e.body) {
|
||||||
|
if !desc_terms.contains(&term) {
|
||||||
|
*freq.entry(term).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut sorted: Vec<(String, usize)> = freq.into_iter().collect();
|
||||||
|
sorted.sort_by(|a, b| b.1.cmp(&a.1));
|
||||||
|
|
||||||
|
sorted
|
||||||
|
.into_iter()
|
||||||
|
.take(TOP_TOPICS)
|
||||||
|
.map(|(t, _)| t)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Markdown stripping
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Fenced code blocks (``` ... ```), including the info-string line.
static RE_FENCED_CODE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap());
// `inline code` spans.
static RE_INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap());
// [text](url) links; group 1 captures the link text to keep.
static RE_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap());
// Leading "> " blockquote markers, per line.
static RE_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^>\s?").unwrap());
// Raw HTML tags.
static RE_HTML_TAG: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
|
||||||
|
|
||||||
|
fn strip_markdown(text: &str) -> String {
|
||||||
|
let text = RE_FENCED_CODE.replace_all(text, "");
|
||||||
|
let text = RE_INLINE_CODE.replace_all(&text, "");
|
||||||
|
let text = RE_LINK.replace_all(&text, "$1");
|
||||||
|
let text = RE_BLOCKQUOTE.replace_all(&text, "");
|
||||||
|
let text = RE_HTML_TAG.replace_all(&text, "");
|
||||||
|
text.into_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Printers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Renders a drift analysis as human-readable terminal output: a header with
/// metadata, a red/green verdict line, the recommendation, and an ASCII bar
/// chart of per-note similarity to the description.
pub fn print_drift_human(response: &DriftResponse) {
    let header = format!(
        "Drift Analysis: {} #{}",
        response.entity.entity_type, response.entity.iid
    );
    println!("{}", style(&header).bold());
    // Underline the header, capped at 60 columns.
    println!("{}", "-".repeat(header.len().min(60)));
    println!("Title: {}", response.entity.title);
    println!("Threshold: {:.2}", response.threshold);
    println!("Notes: {}", response.similarity_curve.len());
    println!();

    if response.drift_detected {
        println!("{}", style("DRIFT DETECTED").red().bold());
        if let Some(dp) = &response.drift_point {
            println!(
                " At note #{} by @{} ({}) - similarity {:.2}",
                dp.note_index, dp.author, dp.created_at, dp.similarity
            );
        }
        if !response.drift_topics.is_empty() {
            println!(" Topics: {}", response.drift_topics.join(", "));
        }
    } else {
        println!("{}", style("No drift detected").green());
    }

    println!();
    println!("{}", response.recommendation);

    if !response.similarity_curve.is_empty() {
        println!();
        println!("{}", style("Similarity Curve:").bold());
        for pt in &response.similarity_curve {
            // Bar length maps similarity (clamped below at 0) onto 30 columns.
            let bar_len = ((pt.similarity.max(0.0)) * 30.0) as usize;
            let bar: String = "#".repeat(bar_len);
            println!(
                " {:>3} {:.2} {} @{}",
                pt.note_index, pt.similarity, bar, pt.author
            );
        }
    }
}
|
||||||
|
|
||||||
|
pub fn print_drift_json(response: &DriftResponse, elapsed_ms: u64) {
|
||||||
|
let meta = RobotMeta { elapsed_ms };
|
||||||
|
let output = serde_json::json!({
|
||||||
|
"ok": true,
|
||||||
|
"data": response,
|
||||||
|
"meta": meta,
|
||||||
|
});
|
||||||
|
match serde_json::to_string(&output) {
|
||||||
|
Ok(json) => println!("{json}"),
|
||||||
|
Err(e) => eprintln!("Error serializing to JSON: {e}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // A sharp similarity drop after index 2 must be flagged as drift.
    #[test]
    fn test_detect_drift_when_divergent() {
        let notes: Vec<NoteRow> = (0..6)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();

        let curve: Vec<SimilarityPoint> = [0.9, 0.85, 0.8, 0.25, 0.2, 0.15]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();

        let (detected, point) = detect_drift(&curve, &notes, 0.4);
        assert!(detected);
        assert!(point.is_some());
    }

    // A gently declining curve whose window means stay >= threshold is clean.
    #[test]
    fn test_no_drift_consistent() {
        let notes: Vec<NoteRow> = (0..5)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();

        let curve: Vec<SimilarityPoint> = [0.85, 0.8, 0.75, 0.7, 0.65]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();

        let (detected, _) = detect_drift(&curve, &notes, 0.4);
        assert!(!detected);
    }

    // The reported drift point is the leading note of the FIRST failing window.
    #[test]
    fn test_drift_point_is_first_divergent() {
        let notes: Vec<NoteRow> = (0..5)
            .map(|i| NoteRow {
                id: (i * 10) as i64,
                body: format!("note {i}"),
                author_username: format!("user{i}"),
                created_at: 1000 + i as i64,
            })
            .collect();

        // Window of 3: indices [0,1,2] avg=0.83, [1,2,3] avg=0.55, [2,3,4] avg=0.30
        let curve: Vec<SimilarityPoint> = [0.9, 0.8, 0.8, 0.05, 0.05]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: format!("user{i}"),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();

        let (detected, point) = detect_drift(&curve, &notes, 0.4);
        assert!(detected);
        let dp = point.unwrap();
        // Window [2,3,4] avg = (0.8+0.05+0.05)/3 = 0.3 < 0.4
        // But [1,2,3] avg = (0.8+0.8+0.05)/3 = 0.55 >= 0.4, so first failing is index 2
        assert_eq!(dp.note_index, 2);
        assert_eq!(dp.note_id, 20);
    }

    // Topics must come from the notes only, never from the description itself.
    #[test]
    fn test_extract_drift_topics_excludes_description_terms() {
        let description = "We need to fix the authentication flow for login users";
        let notes = vec![
            NoteRow {
                id: 1,
                body: "The database migration script is broken and needs postgres update"
                    .to_string(),
                author_username: "dev".to_string(),
                created_at: 1000,
            },
            NoteRow {
                id: 2,
                body: "The database connection pool also has migration issues with postgres"
                    .to_string(),
                author_username: "dev".to_string(),
                created_at: 2000,
            },
        ];

        let topics = extract_drift_topics(description, &notes, 0);
        // "database", "migration", "postgres" should appear; "fix" should not (it's in description)
        assert!(!topics.is_empty());
        for t in &topics {
            assert_ne!(t, "fix");
            assert_ne!(t, "authentication");
            assert_ne!(t, "login");
        }
    }

    // Fenced code contents are removed; surrounding prose survives.
    #[test]
    fn test_strip_markdown_code_blocks() {
        let input = "Before\n```rust\nfn main() {}\n```\nAfter";
        let result = strip_markdown(input);
        assert!(!result.contains("fn main"));
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
    }

    // Link text is kept, URLs and inline code are dropped.
    #[test]
    fn test_strip_markdown_preserves_text() {
        let input = "Check [this link](https://example.com) and `inline code` for details";
        let result = strip_markdown(input);
        assert!(result.contains("this link"));
        assert!(!result.contains("https://example.com"));
        assert!(!result.contains("inline code"));
        assert!(result.contains("details"));
    }

    // Fewer points than one window can never report drift, however low.
    #[test]
    fn test_too_few_notes() {
        let notes: Vec<NoteRow> = (0..2)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();

        let curve: Vec<SimilarityPoint> = [0.1, 0.1]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();

        let (detected, _) = detect_drift(&curve, &notes, 0.4);
        assert!(!detected);
    }
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
pub mod auth_test;
|
pub mod auth_test;
|
||||||
pub mod count;
|
pub mod count;
|
||||||
pub mod doctor;
|
pub mod doctor;
|
||||||
|
pub mod drift;
|
||||||
pub mod embed;
|
pub mod embed;
|
||||||
pub mod generate_docs;
|
pub mod generate_docs;
|
||||||
pub mod ingest;
|
pub mod ingest;
|
||||||
@@ -20,6 +21,7 @@ pub use count::{
|
|||||||
run_count_events,
|
run_count_events,
|
||||||
};
|
};
|
||||||
pub use doctor::{DoctorChecks, print_doctor_results, run_doctor};
|
pub use doctor::{DoctorChecks, print_doctor_results, run_doctor};
|
||||||
|
pub use drift::{DriftResponse, print_drift_human, print_drift_json, run_drift};
|
||||||
pub use embed::{print_embed, print_embed_json, run_embed};
|
pub use embed::{print_embed, print_embed_json, run_embed};
|
||||||
pub use generate_docs::{print_generate_docs, print_generate_docs_json, run_generate_docs};
|
pub use generate_docs::{print_generate_docs, print_generate_docs_json, run_generate_docs};
|
||||||
pub use ingest::{
|
pub use ingest::{
|
||||||
|
|||||||
@@ -215,6 +215,24 @@ pub enum Commands {
|
|||||||
/// People intelligence: experts, workload, active discussions, overlap
|
/// People intelligence: experts, workload, active discussions, overlap
|
||||||
Who(WhoArgs),
|
Who(WhoArgs),
|
||||||
|
|
||||||
|
/// Detect discussion divergence from original intent
|
||||||
|
Drift {
|
||||||
|
/// Entity type (currently only "issues" supported)
|
||||||
|
#[arg(value_parser = ["issues"])]
|
||||||
|
entity_type: String,
|
||||||
|
|
||||||
|
/// Entity IID
|
||||||
|
iid: i64,
|
||||||
|
|
||||||
|
/// Similarity threshold for drift detection (0.0-1.0)
|
||||||
|
#[arg(long, default_value = "0.4")]
|
||||||
|
threshold: f32,
|
||||||
|
|
||||||
|
/// Scope to project (fuzzy match)
|
||||||
|
#[arg(short, long)]
|
||||||
|
project: Option<String>,
|
||||||
|
},
|
||||||
|
|
||||||
#[command(hide = true)]
|
#[command(hide = true)]
|
||||||
List {
|
List {
|
||||||
#[arg(value_parser = ["issues", "mrs"])]
|
#[arg(value_parser = ["issues", "mrs"])]
|
||||||
|
|||||||
@@ -3,7 +3,9 @@ pub mod chunk_ids;
|
|||||||
pub mod chunking;
|
pub mod chunking;
|
||||||
pub mod ollama;
|
pub mod ollama;
|
||||||
pub mod pipeline;
|
pub mod pipeline;
|
||||||
|
pub mod similarity;
|
||||||
|
|
||||||
pub use change_detector::{PendingDocument, count_pending_documents, find_pending_documents};
|
pub use change_detector::{PendingDocument, count_pending_documents, find_pending_documents};
|
||||||
pub use chunking::{CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS, split_into_chunks};
|
pub use chunking::{CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS, split_into_chunks};
|
||||||
pub use pipeline::{EmbedResult, embed_documents};
|
pub use pipeline::{EmbedResult, embed_documents};
|
||||||
|
pub use similarity::cosine_similarity;
|
||||||
|
|||||||
48
src/embedding/similarity.rs
Normal file
48
src/embedding/similarity.rs
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/// Cosine similarity between two embedding vectors.
/// Returns value in [-1, 1] range; higher = more similar.
///
/// Either vector being all-zero yields 0.0 instead of NaN.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "embedding dimensions must match");
    // Single pass accumulating the dot product and both squared norms.
    let mut dot = 0.0_f32;
    let mut sq_a = 0.0_f32;
    let mut sq_b = 0.0_f32;
    for (x, y) in a.iter().zip(b) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }
    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Identical vectors: similarity is 1 (within float tolerance).
    #[test]
    fn test_cosine_similarity_identical() {
        let v = [1.0, 2.0, 3.0];
        let sim = cosine_similarity(&v, &v);
        assert!((sim - 1.0).abs() < 1e-6);
    }

    // Orthogonal vectors: similarity is 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = [1.0, 0.0, 0.0];
        let b = [0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!(sim.abs() < 1e-6);
    }

    // All-zero vector: the guard returns 0.0 instead of NaN.
    #[test]
    fn test_cosine_similarity_zero_vector() {
        let a = [1.0, 2.0, 3.0];
        let b = [0.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 0.0).abs() < 1e-6);
    }

    // Exactly opposite vectors: similarity is -1.
    #[test]
    fn test_cosine_similarity_opposite() {
        let a = [1.0, 2.0, 3.0];
        let b = [-1.0, -2.0, -3.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - (-1.0)).abs() < 1e-6);
    }
}
|
||||||
60
src/main.rs
60
src/main.rs
@@ -12,17 +12,17 @@ use lore::cli::autocorrect::{self, CorrectionResult};
|
|||||||
use lore::cli::commands::{
|
use lore::cli::commands::{
|
||||||
IngestDisplay, InitInputs, InitOptions, InitResult, ListFilters, MrListFilters,
|
IngestDisplay, InitInputs, InitOptions, InitResult, ListFilters, MrListFilters,
|
||||||
SearchCliFilters, SyncOptions, TimelineParams, open_issue_in_browser, open_mr_in_browser,
|
SearchCliFilters, SyncOptions, TimelineParams, open_issue_in_browser, open_mr_in_browser,
|
||||||
print_count, print_count_json, print_doctor_results, print_dry_run_preview,
|
print_count, print_count_json, print_doctor_results, print_drift_human, print_drift_json,
|
||||||
print_dry_run_preview_json, print_embed, print_embed_json, print_event_count,
|
print_dry_run_preview, print_dry_run_preview_json, print_embed, print_embed_json,
|
||||||
print_event_count_json, print_generate_docs, print_generate_docs_json, print_ingest_summary,
|
print_event_count, print_event_count_json, print_generate_docs, print_generate_docs_json,
|
||||||
print_ingest_summary_json, print_list_issues, print_list_issues_json, print_list_mrs,
|
print_ingest_summary, print_ingest_summary_json, print_list_issues, print_list_issues_json,
|
||||||
print_list_mrs_json, print_search_results, print_search_results_json, print_show_issue,
|
print_list_mrs, print_list_mrs_json, print_search_results, print_search_results_json,
|
||||||
print_show_issue_json, print_show_mr, print_show_mr_json, print_stats, print_stats_json,
|
print_show_issue, print_show_issue_json, print_show_mr, print_show_mr_json, print_stats,
|
||||||
print_sync, print_sync_json, print_sync_status, print_sync_status_json, print_timeline,
|
print_stats_json, print_sync, print_sync_json, print_sync_status, print_sync_status_json,
|
||||||
print_timeline_json_with_meta, print_who_human, print_who_json, run_auth_test, run_count,
|
print_timeline, print_timeline_json_with_meta, print_who_human, print_who_json, run_auth_test,
|
||||||
run_count_events, run_doctor, run_embed, run_generate_docs, run_ingest, run_ingest_dry_run,
|
run_count, run_count_events, run_doctor, run_drift, run_embed, run_generate_docs, run_ingest,
|
||||||
run_init, run_list_issues, run_list_mrs, run_search, run_show_issue, run_show_mr, run_stats,
|
run_ingest_dry_run, run_init, run_list_issues, run_list_mrs, run_search, run_show_issue,
|
||||||
run_sync, run_sync_status, run_timeline, run_who,
|
run_show_mr, run_stats, run_sync, run_sync_status, run_timeline, run_who,
|
||||||
};
|
};
|
||||||
use lore::cli::robot::{RobotMeta, strip_schemas};
|
use lore::cli::robot::{RobotMeta, strip_schemas};
|
||||||
use lore::cli::{
|
use lore::cli::{
|
||||||
@@ -178,6 +178,22 @@ async fn main() {
|
|||||||
}
|
}
|
||||||
Some(Commands::Timeline(args)) => handle_timeline(cli.config.as_deref(), args, robot_mode),
|
Some(Commands::Timeline(args)) => handle_timeline(cli.config.as_deref(), args, robot_mode),
|
||||||
Some(Commands::Who(args)) => handle_who(cli.config.as_deref(), args, robot_mode),
|
Some(Commands::Who(args)) => handle_who(cli.config.as_deref(), args, robot_mode),
|
||||||
|
Some(Commands::Drift {
|
||||||
|
entity_type,
|
||||||
|
iid,
|
||||||
|
threshold,
|
||||||
|
project,
|
||||||
|
}) => {
|
||||||
|
handle_drift(
|
||||||
|
cli.config.as_deref(),
|
||||||
|
&entity_type,
|
||||||
|
iid,
|
||||||
|
threshold,
|
||||||
|
project.as_deref(),
|
||||||
|
robot_mode,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
Some(Commands::Stats(args)) => handle_stats(cli.config.as_deref(), args, robot_mode).await,
|
Some(Commands::Stats(args)) => handle_stats(cli.config.as_deref(), args, robot_mode).await,
|
||||||
Some(Commands::Embed(args)) => handle_embed(cli.config.as_deref(), args, robot_mode).await,
|
Some(Commands::Embed(args)) => handle_embed(cli.config.as_deref(), args, robot_mode).await,
|
||||||
Some(Commands::Sync(args)) => {
|
Some(Commands::Sync(args)) => {
|
||||||
@@ -2445,6 +2461,28 @@ fn handle_who(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// CLI handler for the `drift` command: loads config, resolves the effective
/// project, runs the analysis, and prints the result in robot (JSON) or
/// human form. Timing covers config load through analysis completion.
async fn handle_drift(
    config_override: Option<&str>,
    entity_type: &str,
    iid: i64,
    threshold: f32,
    project: Option<&str>,
    robot_mode: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let start = std::time::Instant::now();
    let config = Config::load(config_override)?;
    // NOTE(review): assumes effective_project prefers the CLI flag over any
    // configured default project — confirm against Config::effective_project.
    let effective_project = config.effective_project(project);
    let response = run_drift(&config, entity_type, iid, threshold, effective_project).await?;
    let elapsed_ms = start.elapsed().as_millis() as u64;

    if robot_mode {
        print_drift_json(&response, elapsed_ms);
    } else {
        print_drift_human(&response);
    }
    Ok(())
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
async fn handle_list_compat(
|
async fn handle_list_compat(
|
||||||
config_override: Option<&str>,
|
config_override: Option<&str>,
|
||||||
|
|||||||
Reference in New Issue
Block a user