use std::collections::HashMap; use std::sync::LazyLock; use console::style; use regex::Regex; use serde::Serialize; use crate::cli::robot::RobotMeta; use crate::core::config::Config; use crate::core::db::create_connection; use crate::core::error::{LoreError, Result}; use crate::core::paths::get_db_path; use crate::core::project::resolve_project; use crate::core::time::ms_to_iso; use crate::embedding::ollama::{OllamaClient, OllamaConfig}; use crate::embedding::similarity::cosine_similarity; const BATCH_SIZE: usize = 32; const WINDOW_SIZE: usize = 3; const MIN_DESCRIPTION_LEN: usize = 20; const MAX_NOTES: i64 = 200; const TOP_TOPICS: usize = 3; // --------------------------------------------------------------------------- // Response types // --------------------------------------------------------------------------- #[derive(Debug, Serialize)] pub struct DriftResponse { pub entity: DriftEntity, pub drift_detected: bool, pub threshold: f32, #[serde(skip_serializing_if = "Option::is_none")] pub drift_point: Option, pub drift_topics: Vec, pub similarity_curve: Vec, pub recommendation: String, } #[derive(Debug, Serialize)] pub struct DriftEntity { pub entity_type: String, pub iid: i64, pub title: String, } #[derive(Debug, Serialize)] pub struct DriftPoint { pub note_index: usize, pub note_id: i64, pub author: String, pub created_at: String, pub similarity: f32, } #[derive(Debug, Serialize)] pub struct SimilarityPoint { pub note_index: usize, pub similarity: f32, pub author: String, pub created_at: String, } // --------------------------------------------------------------------------- // Internal row types // --------------------------------------------------------------------------- struct IssueInfo { id: i64, iid: i64, title: String, description: Option, } struct NoteRow { id: i64, body: String, author_username: String, created_at: i64, } // --------------------------------------------------------------------------- // Main entry point // --------------------------------------------------------------------------- pub async fn run_drift( config: &Config, entity_type: &str, iid: i64, threshold: f32, project: Option<&str>, ) -> Result { if entity_type != "issues" { return Err(LoreError::Other( "drift currently supports 'issues' only".to_string(), )); } let db_path = get_db_path(config.storage.db_path.as_deref()); let conn = create_connection(&db_path)?; let issue = find_issue(&conn, iid, project)?; let description = match &issue.description { Some(d) if d.len() >= MIN_DESCRIPTION_LEN => d.clone(), _ => { return Ok(DriftResponse { entity: DriftEntity { entity_type: entity_type.to_string(), iid: issue.iid, title: issue.title, }, drift_detected: false, threshold, drift_point: None, drift_topics: vec![], similarity_curve: vec![], recommendation: "Description too short for drift analysis.".to_string(), }); } }; let notes = fetch_notes(&conn, issue.id)?; if notes.len() < WINDOW_SIZE { return Ok(DriftResponse { entity: DriftEntity { entity_type: entity_type.to_string(), iid: issue.iid, title: issue.title, }, drift_detected: false, threshold, drift_point: None, drift_topics: vec![], similarity_curve: vec![], recommendation: format!( "Only {} note(s) found; need at least {} for drift detection.", notes.len(), WINDOW_SIZE ), }); } // Build texts to embed: description first, then each note body. let mut texts: Vec = Vec::with_capacity(1 + notes.len()); texts.push(description.clone()); for note in ¬es { texts.push(note.body.clone()); } let embeddings = embed_texts(config, &texts).await?; let desc_embedding = &embeddings[0]; let note_embeddings = &embeddings[1..]; // Build similarity curve. let similarity_curve: Vec = note_embeddings .iter() .enumerate() .map(|(i, emb)| SimilarityPoint { note_index: i, similarity: cosine_similarity(desc_embedding, emb), author: notes[i].author_username.clone(), created_at: ms_to_iso(notes[i].created_at), }) .collect(); // Detect drift via sliding window. let (drift_detected, drift_point) = detect_drift(&similarity_curve, ¬es, threshold); // Extract drift topics. let drift_topics = if drift_detected { let drift_idx = drift_point.as_ref().map_or(0, |dp| dp.note_index); extract_drift_topics(&description, ¬es, drift_idx) } else { vec![] }; let recommendation = if drift_detected { let dp = drift_point.as_ref().unwrap(); format!( "Discussion drifted at note {} by @{} (similarity {:.2}). Consider splitting into a new issue.", dp.note_index, dp.author, dp.similarity ) } else { "Discussion remains on topic.".to_string() }; Ok(DriftResponse { entity: DriftEntity { entity_type: entity_type.to_string(), iid: issue.iid, title: issue.title, }, drift_detected, threshold, drift_point, drift_topics, similarity_curve, recommendation, }) } // --------------------------------------------------------------------------- // DB helpers // --------------------------------------------------------------------------- fn find_issue( conn: &rusqlite::Connection, iid: i64, project_filter: Option<&str>, ) -> Result { let (sql, params): (&str, Vec>) = match project_filter { Some(project) => { let project_id = resolve_project(conn, project)?; ( "SELECT i.id, i.iid, i.title, i.description FROM issues i WHERE i.iid = ? AND i.project_id = ?", vec![Box::new(iid), Box::new(project_id)], ) } None => ( "SELECT i.id, i.iid, i.title, i.description FROM issues i WHERE i.iid = ?", vec![Box::new(iid)], ), }; let param_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect(); let mut stmt = conn.prepare(sql)?; let rows: Vec = stmt .query_map(param_refs.as_slice(), |row| { Ok(IssueInfo { id: row.get(0)?, iid: row.get(1)?, title: row.get(2)?, description: row.get(3)?, }) })? .collect::, _>>()?; match rows.len() { 0 => Err(LoreError::NotFound(format!("Issue #{iid} not found"))), 1 => Ok(rows.into_iter().next().unwrap()), _ => Err(LoreError::Ambiguous(format!( "Issue #{iid} exists in multiple projects. Use --project to specify." ))), } } fn fetch_notes(conn: &rusqlite::Connection, issue_id: i64) -> Result> { let mut stmt = conn.prepare( "SELECT n.id, n.body, n.author_username, n.created_at FROM notes n JOIN discussions d ON n.discussion_id = d.id WHERE d.issue_id = ? AND n.is_system = 0 AND LENGTH(n.body) >= 20 ORDER BY n.created_at ASC LIMIT ?", )?; let notes: Vec = stmt .query_map(rusqlite::params![issue_id, MAX_NOTES], |row| { Ok(NoteRow { id: row.get(0)?, body: row.get(1)?, author_username: row.get(2)?, created_at: row.get(3)?, }) })? .collect::, _>>()?; Ok(notes) } // --------------------------------------------------------------------------- // Embedding helper // --------------------------------------------------------------------------- async fn embed_texts(config: &Config, texts: &[String]) -> Result>> { let ollama = OllamaClient::new(OllamaConfig { base_url: config.embedding.base_url.clone(), model: config.embedding.model.clone(), timeout_secs: 60, }); let mut all_embeddings: Vec> = Vec::with_capacity(texts.len()); for chunk in texts.chunks(BATCH_SIZE) { let refs: Vec<&str> = chunk.iter().map(|s| s.as_str()).collect(); let batch_result = ollama.embed_batch(&refs).await?; all_embeddings.extend(batch_result); } Ok(all_embeddings) } // --------------------------------------------------------------------------- // Drift detection // --------------------------------------------------------------------------- fn detect_drift( curve: &[SimilarityPoint], notes: &[NoteRow], threshold: f32, ) -> (bool, Option) { if curve.len() < WINDOW_SIZE { return (false, None); } for i in 0..=curve.len() - WINDOW_SIZE { let window_avg: f32 = curve[i..i + WINDOW_SIZE] .iter() .map(|p| p.similarity) .sum::() / WINDOW_SIZE as f32; if window_avg < threshold { return ( true, Some(DriftPoint { note_index: i, note_id: notes[i].id, author: notes[i].author_username.clone(), created_at: ms_to_iso(notes[i].created_at), similarity: curve[i].similarity, }), ); } } (false, None) } // --------------------------------------------------------------------------- // Topic extraction // --------------------------------------------------------------------------- static STOPWORDS: LazyLock> = LazyLock::new(|| { [ "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can", "need", "dare", "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as", "into", "through", "during", "before", "after", "above", "below", "between", "out", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other", "some", "such", "no", "not", "only", "own", "same", "so", "than", "too", "very", "just", "because", "but", "and", "or", "if", "while", "about", "up", "it", "its", "this", "that", "these", "those", "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she", "her", "they", "them", "their", "what", "which", "who", "whom", "also", "like", "get", "got", "think", "know", "see", "make", "go", "one", "two", "new", "way", ] .into_iter() .collect() }); fn tokenize(text: &str) -> Vec { let cleaned = strip_markdown(text); cleaned .split(|c: char| !c.is_alphanumeric() && c != '_') .filter(|w| w.len() >= 3) .map(|w| w.to_lowercase()) .filter(|w| !STOPWORDS.contains(w.as_str())) .collect() } fn extract_drift_topics(description: &str, notes: &[NoteRow], drift_idx: usize) -> Vec { let desc_terms: std::collections::HashSet = tokenize(description).into_iter().collect(); let mut freq: HashMap = HashMap::new(); for note in notes.iter().skip(drift_idx) { for term in tokenize(¬e.body) { if !desc_terms.contains(&term) { *freq.entry(term).or_insert(0) += 1; } } } let mut sorted: Vec<(String, usize)> = freq.into_iter().collect(); sorted.sort_by(|a, b| b.1.cmp(&a.1)); sorted .into_iter() .take(TOP_TOPICS) .map(|(t, _)| t) .collect() } // --------------------------------------------------------------------------- // Markdown stripping // --------------------------------------------------------------------------- static RE_FENCED_CODE: LazyLock = LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap()); static RE_INLINE_CODE: LazyLock = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap()); static RE_LINK: LazyLock = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]$[^)]+$").unwrap()); static RE_BLOCKQUOTE: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^>\s?").unwrap()); static RE_HTML_TAG: LazyLock = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap()); fn strip_markdown(text: &str) -> String { let text = RE_FENCED_CODE.replace_all(text, ""); let text = RE_INLINE_CODE.replace_all(&text, ""); let text = RE_LINK.replace_all(&text, "$1"); let text = RE_BLOCKQUOTE.replace_all(&text, ""); let text = RE_HTML_TAG.replace_all(&text, ""); text.into_owned() } // --------------------------------------------------------------------------- // Printers // --------------------------------------------------------------------------- pub fn print_drift_human(response: &DriftResponse) { let header = format!( "Drift Analysis: {} #{}", response.entity.entity_type, response.entity.iid ); println!("{}", style(&header).bold()); println!("{}", "-".repeat(header.len().min(60))); println!("Title: {}", response.entity.title); println!("Threshold: {:.2}", response.threshold); println!("Notes: {}", response.similarity_curve.len()); println!(); if response.drift_detected { println!("{}", style("DRIFT DETECTED").red().bold()); if let Some(dp) = &response.drift_point { println!( " At note #{} by @{} ({}) - similarity {:.2}", dp.note_index, dp.author, dp.created_at, dp.similarity ); } if !response.drift_topics.is_empty() { println!(" Topics: {}", response.drift_topics.join(", ")); } } else { println!("{}", style("No drift detected").green()); } println!(); println!("{}", response.recommendation); if !response.similarity_curve.is_empty() { println!(); println!("{}", style("Similarity Curve:").bold()); for pt in &response.similarity_curve { let bar_len = ((pt.similarity.max(0.0)) * 30.0) as usize; let bar: String = "#".repeat(bar_len); println!( " {:>3} {:.2} {} @{}", pt.note_index, pt.similarity, bar, pt.author ); } } } pub fn print_drift_json(response: &DriftResponse, elapsed_ms: u64) { let meta = RobotMeta { elapsed_ms }; let output = serde_json::json!({ "ok": true, "data": response, "meta": meta, }); match serde_json::to_string(&output) { Ok(json) => println!("{json}"), Err(e) => eprintln!("Error serializing to JSON: {e}"), } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; #[test] fn test_detect_drift_when_divergent() { let notes: Vec = (0..6) .map(|i| NoteRow { id: i as i64, body: format!("note {i}"), author_username: "user".to_string(), created_at: 1000 + i as i64, }) .collect(); let curve: Vec = [0.9, 0.85, 0.8, 0.25, 0.2, 0.15] .iter() .enumerate() .map(|(i, &sim)| SimilarityPoint { note_index: i, similarity: sim, author: "user".to_string(), created_at: ms_to_iso(1000 + i as i64), }) .collect(); let (detected, point) = detect_drift(&curve, ¬es, 0.4); assert!(detected); assert!(point.is_some()); } #[test] fn test_no_drift_consistent() { let notes: Vec = (0..5) .map(|i| NoteRow { id: i as i64, body: format!("note {i}"), author_username: "user".to_string(), created_at: 1000 + i as i64, }) .collect(); let curve: Vec = [0.85, 0.8, 0.75, 0.7, 0.65] .iter() .enumerate() .map(|(i, &sim)| SimilarityPoint { note_index: i, similarity: sim, author: "user".to_string(), created_at: ms_to_iso(1000 + i as i64), }) .collect(); let (detected, _) = detect_drift(&curve, ¬es, 0.4); assert!(!detected); } #[test] fn test_drift_point_is_first_divergent() { let notes: Vec = (0..5) .map(|i| NoteRow { id: (i * 10) as i64, body: format!("note {i}"), author_username: format!("user{i}"), created_at: 1000 + i as i64, }) .collect(); // Window of 3: indices [0,1,2] avg=0.83, [1,2,3] avg=0.55, [2,3,4] avg=0.23 let curve: Vec = [0.9, 0.8, 0.8, 0.05, 0.05] .iter() .enumerate() .map(|(i, &sim)| SimilarityPoint { note_index: i, similarity: sim, author: format!("user{i}"), created_at: ms_to_iso(1000 + i as i64), }) .collect(); let (detected, point) = detect_drift(&curve, ¬es, 0.4); assert!(detected); let dp = point.unwrap(); // Window [2,3,4] avg = (0.8+0.05+0.05)/3 = 0.3 < 0.4 // But [1,2,3] avg = (0.8+0.8+0.05)/3 = 0.55 >= 0.4, so first failing is index 2 assert_eq!(dp.note_index, 2); assert_eq!(dp.note_id, 20); } #[test] fn test_extract_drift_topics_excludes_description_terms() { let description = "We need to fix the authentication flow for login users"; let notes = vec![ NoteRow { id: 1, body: "The database migration script is broken and needs postgres update" .to_string(), author_username: "dev".to_string(), created_at: 1000, }, NoteRow { id: 2, body: "The database connection pool also has migration issues with postgres" .to_string(), author_username: "dev".to_string(), created_at: 2000, }, ]; let topics = extract_drift_topics(description, ¬es, 0); // "database", "migration", "postgres" should appear; "fix" should not (it's in description) assert!(!topics.is_empty()); for t in &topics { assert_ne!(t, "fix"); assert_ne!(t, "authentication"); assert_ne!(t, "login"); } } #[test] fn test_strip_markdown_code_blocks() { let input = "Before\n```rust\nfn main() {}\n```\nAfter"; let result = strip_markdown(input); assert!(!result.contains("fn main")); assert!(result.contains("Before")); assert!(result.contains("After")); } #[test] fn test_strip_markdown_preserves_text() { let input = "Check [this link](https://example.com) and `inline code` for details"; let result = strip_markdown(input); assert!(result.contains("this link")); assert!(!result.contains("https://example.com")); assert!(!result.contains("inline code")); assert!(result.contains("details")); } #[test] fn test_too_few_notes() { let notes: Vec = (0..2) .map(|i| NoteRow { id: i as i64, body: format!("note {i}"), author_username: "user".to_string(), created_at: 1000 + i as i64, }) .collect(); let curve: Vec = [0.1, 0.1] .iter() .enumerate() .map(|(i, &sim)| SimilarityPoint { note_index: i, similarity: sim, author: "user".to_string(), created_at: ms_to_iso(1000 + i as i64), }) .collect(); let (detected, _) = detect_drift(&curve, ¬es, 0.4); assert!(!detected); } }