Files
gitlore/src/cli/commands/drift.rs
teernisse ea6e45e43f refactor(who): make --limit optional (unlimited default) and fix clippy sort lints
Change the `who` command's --limit flag from default=20 to optional,
so omitting it returns all results. This matches the behavior users
expect when they want a complete expert/workload/active/overlap listing
without an arbitrary cap.

Also applies clippy-recommended sort improvements:
- who/reviews: sort_by(|a,b| b.count.cmp(&a.count)) -> sort_by_key with Reverse
- drift: same pattern for frequency sorting

Adds Theme::color_icon() helper to DRY the stage-icon coloring pattern
used in sync output (was inline closure, now shared method).
2026-02-18 16:27:59 -05:00

651 lines
21 KiB
Rust

use std::collections::HashMap;
use std::sync::LazyLock;
use regex::Regex;
use serde::Serialize;
use crate::cli::render::{Icons, Theme};
use crate::cli::robot::RobotMeta;
use crate::core::config::Config;
use crate::core::db::create_connection;
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::core::time::ms_to_iso;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::embedding::similarity::cosine_similarity;
/// Number of texts sent to the embedding backend per request.
const BATCH_SIZE: usize = 32;
/// Sliding-window length (in notes) used for drift detection.
const WINDOW_SIZE: usize = 3;
/// Minimum issue-description length (bytes) required for meaningful analysis.
const MIN_DESCRIPTION_LEN: usize = 20;
/// Upper bound on notes fetched per issue (SQL LIMIT).
const MAX_NOTES: i64 = 200;
/// Maximum number of drift topics reported.
const TOP_TOPICS: usize = 3;
// ---------------------------------------------------------------------------
// Response types
// ---------------------------------------------------------------------------
/// JSON-serializable result of a drift analysis run.
#[derive(Debug, Serialize)]
pub struct DriftResponse {
    /// The entity (currently always an issue) that was analyzed.
    pub entity: DriftEntity,
    /// Whether any sliding window fell below the similarity threshold.
    pub drift_detected: bool,
    /// The similarity threshold the analysis was run with.
    pub threshold: f32,
    /// First note at which drift was detected; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub drift_point: Option<DriftPoint>,
    /// Most frequent post-drift terms not present in the description.
    pub drift_topics: Vec<String>,
    /// Per-note similarity against the issue description, oldest first.
    pub similarity_curve: Vec<SimilarityPoint>,
    /// Human-readable summary / suggested action.
    pub recommendation: String,
}
/// Identifies which entity a drift analysis refers to.
#[derive(Debug, Serialize)]
pub struct DriftEntity {
    /// Entity kind, e.g. "issues".
    pub entity_type: String,
    /// Project-scoped internal id (iid) of the entity.
    pub iid: i64,
    /// Entity title at analysis time.
    pub title: String,
}
/// The first note at which the discussion is judged to have drifted.
#[derive(Debug, Serialize)]
pub struct DriftPoint {
    /// Zero-based index of the note within the fetched (chronological) list.
    pub note_index: usize,
    /// Database id of the note.
    pub note_id: i64,
    /// Username of the note's author.
    pub author: String,
    /// Note creation time as an ISO-8601 string.
    pub created_at: String,
    /// Similarity of this note (not the window average) to the description.
    pub similarity: f32,
}
/// One point on the similarity curve: a note compared to the description.
#[derive(Debug, Serialize)]
pub struct SimilarityPoint {
    /// Zero-based index of the note within the fetched list.
    pub note_index: usize,
    /// Cosine similarity between the note body and the issue description.
    pub similarity: f32,
    /// Username of the note's author.
    pub author: String,
    /// Note creation time as an ISO-8601 string.
    pub created_at: String,
}
// ---------------------------------------------------------------------------
// Internal row types
// ---------------------------------------------------------------------------
/// Row shape returned by `find_issue`.
struct IssueInfo {
    // Primary key of the issue row.
    id: i64,
    // Project-scoped internal id.
    iid: i64,
    title: String,
    // May be NULL in the database, hence Option.
    description: Option<String>,
}
/// Row shape returned by `fetch_notes`.
struct NoteRow {
    // Primary key of the note row.
    id: i64,
    body: String,
    author_username: String,
    // Creation timestamp in milliseconds since the Unix epoch.
    created_at: i64,
}
// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------
/// Analyze topic drift in an issue's discussion thread.
///
/// Embeds the issue description and each substantive note, builds a
/// per-note similarity curve against the description, and flags drift
/// when a sliding window of `WINDOW_SIZE` consecutive notes falls below
/// `threshold` on average.
///
/// # Errors
/// Returns an error for unsupported entity types, a missing or ambiguous
/// issue lookup, database failures, or embedding-service failures.
pub async fn run_drift(
    config: &Config,
    entity_type: &str,
    iid: i64,
    threshold: f32,
    project: Option<&str>,
) -> Result<DriftResponse> {
    if entity_type != "issues" {
        return Err(LoreError::Other(
            "drift currently supports 'issues' only".to_string(),
        ));
    }
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;
    let issue = find_issue(&conn, iid, project)?;

    // Bail out early when the description is too short to embed meaningfully.
    let description = match &issue.description {
        Some(d) if d.len() >= MIN_DESCRIPTION_LEN => d.clone(),
        _ => {
            return Ok(no_drift_response(
                entity_type,
                issue.iid,
                issue.title,
                threshold,
                "Description too short for drift analysis.".to_string(),
            ));
        }
    };

    let notes = fetch_notes(&conn, issue.id)?;
    if notes.len() < WINDOW_SIZE {
        let recommendation = format!(
            "Only {} note(s) found; need at least {} for drift detection.",
            notes.len(),
            WINDOW_SIZE
        );
        return Ok(no_drift_response(
            entity_type,
            issue.iid,
            issue.title,
            threshold,
            recommendation,
        ));
    }

    // Build texts to embed: description first, then each note body.
    let mut texts: Vec<String> = Vec::with_capacity(1 + notes.len());
    texts.push(description.clone());
    texts.extend(notes.iter().map(|note| note.body.clone()));
    let embeddings = embed_texts(config, &texts).await?;
    let desc_embedding = &embeddings[0];
    let note_embeddings = &embeddings[1..];

    // Similarity of each note against the issue description.
    let similarity_curve: Vec<SimilarityPoint> = note_embeddings
        .iter()
        .enumerate()
        .map(|(i, emb)| SimilarityPoint {
            note_index: i,
            similarity: cosine_similarity(desc_embedding, emb),
            author: notes[i].author_username.clone(),
            created_at: ms_to_iso(notes[i].created_at),
        })
        .collect();

    // Detect drift via sliding window.
    let (drift_detected, drift_point) = detect_drift(&similarity_curve, &notes, threshold);

    // Topics are only meaningful once drift is established.
    let drift_topics = if drift_detected {
        let drift_idx = drift_point.as_ref().map_or(0, |dp| dp.note_index);
        extract_drift_topics(&description, &notes, drift_idx)
    } else {
        vec![]
    };

    // `drift_detected` is true exactly when `drift_point` is Some, so the
    // match expresses the same branch without an unwrap.
    let recommendation = match &drift_point {
        Some(dp) => format!(
            "Discussion drifted at note {} by @{} (similarity {:.2}). Consider splitting into a new issue.",
            dp.note_index, dp.author, dp.similarity
        ),
        None => "Discussion remains on topic.".to_string(),
    };

    Ok(DriftResponse {
        entity: DriftEntity {
            entity_type: entity_type.to_string(),
            iid: issue.iid,
            title: issue.title,
        },
        drift_detected,
        threshold,
        drift_point,
        drift_topics,
        similarity_curve,
        recommendation,
    })
}

/// Build the "no drift" response used by early-exit paths (description too
/// short, too few notes) so the struct literal is not duplicated.
fn no_drift_response(
    entity_type: &str,
    iid: i64,
    title: String,
    threshold: f32,
    recommendation: String,
) -> DriftResponse {
    DriftResponse {
        entity: DriftEntity {
            entity_type: entity_type.to_string(),
            iid,
            title,
        },
        drift_detected: false,
        threshold,
        drift_point: None,
        drift_topics: vec![],
        similarity_curve: vec![],
        recommendation,
    }
}
// ---------------------------------------------------------------------------
// DB helpers
// ---------------------------------------------------------------------------
/// Look up a single issue by `iid`, optionally scoped to one project.
///
/// Without a project filter the iid may match issues in several projects;
/// that case is reported as an ambiguity error rather than silently
/// picking one.
///
/// # Errors
/// `NotFound` when no issue matches, `Ambiguous` when more than one does,
/// plus any database or project-resolution error.
fn find_issue(
    conn: &rusqlite::Connection,
    iid: i64,
    project_filter: Option<&str>,
) -> Result<IssueInfo> {
    // Shared row mapper for both query shapes.
    fn row_to_issue(row: &rusqlite::Row<'_>) -> rusqlite::Result<IssueInfo> {
        Ok(IssueInfo {
            id: row.get(0)?,
            iid: row.get(1)?,
            title: row.get(2)?,
            description: row.get(3)?,
        })
    }
    // Branch on the filter directly with statically-typed params instead of
    // building a Vec<Box<dyn ToSql>> and re-borrowing it.
    let rows: Vec<IssueInfo> = match project_filter {
        Some(project) => {
            let project_id = resolve_project(conn, project)?;
            let mut stmt = conn.prepare(
                "SELECT i.id, i.iid, i.title, i.description
                 FROM issues i
                 WHERE i.iid = ? AND i.project_id = ?",
            )?;
            stmt.query_map(rusqlite::params![iid, project_id], row_to_issue)?
                .collect::<std::result::Result<Vec<_>, _>>()?
        }
        None => {
            let mut stmt = conn.prepare(
                "SELECT i.id, i.iid, i.title, i.description
                 FROM issues i
                 WHERE i.iid = ?",
            )?;
            stmt.query_map(rusqlite::params![iid], row_to_issue)?
                .collect::<std::result::Result<Vec<_>, _>>()?
        }
    };
    match rows.len() {
        0 => Err(LoreError::NotFound(format!("Issue #{iid} not found"))),
        1 => Ok(rows.into_iter().next().unwrap()),
        _ => Err(LoreError::Ambiguous(format!(
            "Issue #{iid} exists in multiple projects. Use --project to specify."
        ))),
    }
}
/// Load up to `MAX_NOTES` substantive notes for an issue, oldest first.
///
/// System notes and bodies shorter than 20 characters are filtered out in
/// SQL so the embedding pass only sees meaningful discussion text.
fn fetch_notes(conn: &rusqlite::Connection, issue_id: i64) -> Result<Vec<NoteRow>> {
    let sql = "SELECT n.id, n.body, n.author_username, n.created_at
         FROM notes n
         JOIN discussions d ON n.discussion_id = d.id
         WHERE d.issue_id = ?
           AND n.is_system = 0
           AND LENGTH(n.body) >= 20
         ORDER BY n.created_at ASC
         LIMIT ?";
    let mut stmt = conn.prepare(sql)?;
    let mapped = stmt.query_map(rusqlite::params![issue_id, MAX_NOTES], |row| {
        Ok(NoteRow {
            id: row.get(0)?,
            body: row.get(1)?,
            author_username: row.get(2)?,
            created_at: row.get(3)?,
        })
    })?;
    Ok(mapped.collect::<std::result::Result<Vec<NoteRow>, _>>()?)
}
// ---------------------------------------------------------------------------
// Embedding helper
// ---------------------------------------------------------------------------
/// Embed every string in `texts` via the configured Ollama endpoint.
///
/// Requests are issued in chunks of `BATCH_SIZE` to bound per-call payload
/// size; results are returned in input order.
async fn embed_texts(config: &Config, texts: &[String]) -> Result<Vec<Vec<f32>>> {
    let client = OllamaClient::new(OllamaConfig {
        base_url: config.embedding.base_url.clone(),
        model: config.embedding.model.clone(),
        timeout_secs: 60,
    });
    let mut embeddings: Vec<Vec<f32>> = Vec::with_capacity(texts.len());
    for batch in texts.chunks(BATCH_SIZE) {
        let batch_refs: Vec<&str> = batch.iter().map(String::as_str).collect();
        embeddings.extend(client.embed_batch(&batch_refs).await?);
    }
    Ok(embeddings)
}
// ---------------------------------------------------------------------------
// Drift detection
// ---------------------------------------------------------------------------
/// Scan the similarity curve with a sliding window of `WINDOW_SIZE` notes.
///
/// Drift is declared at the first window whose average similarity falls
/// below `threshold`. The drift point reports the first note of that
/// window, carrying that note's own similarity (not the window average).
fn detect_drift(
    curve: &[SimilarityPoint],
    notes: &[NoteRow],
    threshold: f32,
) -> (bool, Option<DriftPoint>) {
    if curve.len() < WINDOW_SIZE {
        return (false, None);
    }
    // Index of the first window whose mean similarity drops below threshold.
    let first_low = curve.windows(WINDOW_SIZE).position(|window| {
        let avg = window.iter().map(|p| p.similarity).sum::<f32>() / WINDOW_SIZE as f32;
        avg < threshold
    });
    match first_low {
        Some(i) => (
            true,
            Some(DriftPoint {
                note_index: i,
                note_id: notes[i].id,
                author: notes[i].author_username.clone(),
                created_at: ms_to_iso(notes[i].created_at),
                similarity: curve[i].similarity,
            }),
        ),
        None => (false, None),
    }
}
// ---------------------------------------------------------------------------
// Topic extraction
// ---------------------------------------------------------------------------
// English stopwords — plus a few discussion-generic verbs ("think", "know",
// "make", ...) — excluded from topic extraction. Built once on first use.
static STOPWORDS: LazyLock<std::collections::HashSet<&'static str>> = LazyLock::new(|| {
    [
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
        "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can",
        "need", "dare", "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
        "as", "into", "through", "during", "before", "after", "above", "below", "between", "out",
        "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
        "where", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other",
        "some", "such", "no", "not", "only", "own", "same", "so", "than", "too", "very", "just",
        "because", "but", "and", "or", "if", "while", "about", "up", "it", "its", "this", "that",
        "these", "those", "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she",
        "her", "they", "them", "their", "what", "which", "who", "whom", "also", "like", "get",
        "got", "think", "know", "see", "make", "go", "one", "two", "new", "way",
    ]
    .into_iter()
    .collect()
});
/// Split text into lowercase terms of at least 3 bytes, after stripping
/// markdown noise and dropping stopwords. Underscores count as word
/// characters so identifiers like `user_id` survive intact.
fn tokenize(text: &str) -> Vec<String> {
    strip_markdown(text)
        .split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|word| word.len() >= 3)
        .map(str::to_lowercase)
        .filter(|word| !STOPWORDS.contains(word.as_str()))
        .collect()
}
/// Find the most frequent terms in notes from `drift_idx` onward that do
/// NOT appear in the issue description — these characterize what the
/// discussion drifted toward. Returns at most `TOP_TOPICS` terms.
fn extract_drift_topics(description: &str, notes: &[NoteRow], drift_idx: usize) -> Vec<String> {
    let desc_terms: std::collections::HashSet<String> =
        tokenize(description).into_iter().collect();
    let mut freq: HashMap<String, usize> = HashMap::new();
    for note in notes.iter().skip(drift_idx) {
        for term in tokenize(&note.body) {
            if !desc_terms.contains(&term) {
                *freq.entry(term).or_insert(0) += 1;
            }
        }
    }
    let mut sorted: Vec<(String, usize)> = freq.into_iter().collect();
    // Sort by descending count, breaking ties alphabetically so the selected
    // topics are deterministic (HashMap iteration order varies per run, so a
    // count-only sort could report different topics for equal frequencies).
    sorted.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    sorted
        .into_iter()
        .take(TOP_TOPICS)
        .map(|(term, _)| term)
        .collect()
}
// ---------------------------------------------------------------------------
// Markdown stripping
// ---------------------------------------------------------------------------
// Fenced code blocks: ``` … ``` including the info string; (?s) lets `.`
// span newlines so multi-line bodies are matched.
static RE_FENCED_CODE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap());
// Inline code spans delimited by single backticks.
static RE_INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap());
// Markdown links [text](url); group 1 captures the visible text.
static RE_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap());
// Leading blockquote markers ("> ") at the start of each line.
static RE_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^>\s?").unwrap());
// Raw HTML tags such as <br> or <details>.
static RE_HTML_TAG: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
/// Remove markdown constructs that would pollute term extraction: code
/// (fenced and inline), link URLs (keeping the link text), blockquote
/// markers, and raw HTML tags. Fenced blocks are stripped before inline
/// backticks so fence delimiters are not misread as inline code.
fn strip_markdown(text: &str) -> String {
    let no_fences = RE_FENCED_CODE.replace_all(text, "");
    let no_inline = RE_INLINE_CODE.replace_all(&no_fences, "");
    let links_as_text = RE_LINK.replace_all(&no_inline, "$1");
    let no_quotes = RE_BLOCKQUOTE.replace_all(&links_as_text, "");
    RE_HTML_TAG.replace_all(&no_quotes, "").into_owned()
}
// ---------------------------------------------------------------------------
// Printers
// ---------------------------------------------------------------------------
/// Render a drift analysis for a human terminal: header, verdict line,
/// recommendation, and an ASCII bar chart of the similarity curve.
pub fn print_drift_human(response: &DriftResponse) {
    let header = format!(
        "Drift Analysis: {} #{}",
        response.entity.entity_type, response.entity.iid
    );
    println!("{}", Theme::bold().render(&header));
    // Underline no wider than 60 columns even for very long titles.
    println!("{}", "-".repeat(header.len().min(60)));
    println!("Title: {}", response.entity.title);
    println!("Threshold: {:.2}", response.threshold);
    println!("Notes: {}", response.similarity_curve.len());
    println!();
    if response.drift_detected {
        println!(
            "{} {}",
            Theme::error().render(Icons::error()),
            Theme::error().bold().render("DRIFT DETECTED")
        );
        if let Some(dp) = &response.drift_point {
            println!(
                "  At note #{} by @{} ({}) - similarity {:.2}",
                dp.note_index, dp.author, dp.created_at, dp.similarity
            );
        }
        if !response.drift_topics.is_empty() {
            println!("  Topics: {}", response.drift_topics.join(", "));
        }
    } else {
        println!(
            "{} {}",
            Theme::success().render(Icons::success()),
            Theme::success().render("No drift detected")
        );
    }
    println!();
    println!("{}", response.recommendation);
    if !response.similarity_curve.is_empty() {
        println!();
        println!("{}", Theme::bold().render("Similarity Curve:"));
        for pt in &response.similarity_curve {
            // Scale similarity (clamped at 0) to a bar of up to 30 blocks.
            let bar_len = ((pt.similarity.max(0.0)) * 30.0) as usize;
            let bar: String = "\u{2588}".repeat(bar_len);
            println!(
                "  {:>3}  {:.2}  {} @{}",
                pt.note_index, pt.similarity, bar, pt.author
            );
        }
    }
}
/// Emit the drift analysis as a single-line JSON envelope
/// (`{"ok": true, "data": …, "meta": …}`) for machine consumers.
pub fn print_drift_json(response: &DriftResponse, elapsed_ms: u64) {
    let payload = serde_json::json!({
        "ok": true,
        "data": response,
        "meta": RobotMeta { elapsed_ms },
    });
    match serde_json::to_string(&payload) {
        Ok(line) => println!("{line}"),
        Err(e) => eprintln!("Error serializing to JSON: {e}"),
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    // A steep similarity drop should trip detection and yield a drift point.
    #[test]
    fn test_detect_drift_when_divergent() {
        let notes: Vec<NoteRow> = (0..6)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();
        let curve: Vec<SimilarityPoint> = [0.9, 0.85, 0.8, 0.25, 0.2, 0.15]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, point) = detect_drift(&curve, &notes, 0.4);
        assert!(detected);
        assert!(point.is_some());
    }

    // A gently declining but above-threshold curve must not trigger drift.
    #[test]
    fn test_no_drift_consistent() {
        let notes: Vec<NoteRow> = (0..5)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();
        let curve: Vec<SimilarityPoint> = [0.85, 0.8, 0.75, 0.7, 0.65]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, _) = detect_drift(&curve, &notes, 0.4);
        assert!(!detected);
    }

    // The reported drift point must be the start of the FIRST failing window.
    #[test]
    fn test_drift_point_is_first_divergent() {
        let notes: Vec<NoteRow> = (0..5)
            .map(|i| NoteRow {
                // Distinct ids (i * 10) so the assertion below can verify
                // the correct note was selected, not just the index.
                id: (i * 10) as i64,
                body: format!("note {i}"),
                author_username: format!("user{i}"),
                created_at: 1000 + i as i64,
            })
            .collect();
        // Window of 3: indices [0,1,2] avg=0.83, [1,2,3] avg=0.55, [2,3,4] avg=0.23
        let curve: Vec<SimilarityPoint> = [0.9, 0.8, 0.8, 0.05, 0.05]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: format!("user{i}"),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, point) = detect_drift(&curve, &notes, 0.4);
        assert!(detected);
        let dp = point.unwrap();
        // Window [2,3,4] avg = (0.8+0.05+0.05)/3 = 0.3 < 0.4
        // But [1,2,3] avg = (0.8+0.8+0.05)/3 = 0.55 >= 0.4, so first failing is index 2
        assert_eq!(dp.note_index, 2);
        assert_eq!(dp.note_id, 20);
    }

    // Topics must come only from note vocabulary absent from the description.
    #[test]
    fn test_extract_drift_topics_excludes_description_terms() {
        let description = "We need to fix the authentication flow for login users";
        let notes = vec![
            NoteRow {
                id: 1,
                body: "The database migration script is broken and needs postgres update"
                    .to_string(),
                author_username: "dev".to_string(),
                created_at: 1000,
            },
            NoteRow {
                id: 2,
                body: "The database connection pool also has migration issues with postgres"
                    .to_string(),
                author_username: "dev".to_string(),
                created_at: 2000,
            },
        ];
        let topics = extract_drift_topics(description, &notes, 0);
        // "database", "migration", "postgres" should appear; "fix" should not (it's in description)
        assert!(!topics.is_empty());
        for t in &topics {
            assert_ne!(t, "fix");
            assert_ne!(t, "authentication");
            assert_ne!(t, "login");
        }
    }

    // Fenced code bodies must disappear; surrounding prose must survive.
    #[test]
    fn test_strip_markdown_code_blocks() {
        let input = "Before\n```rust\nfn main() {}\n```\nAfter";
        let result = strip_markdown(input);
        assert!(!result.contains("fn main"));
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
    }

    // Link text is kept, link URLs and inline code are dropped.
    #[test]
    fn test_strip_markdown_preserves_text() {
        let input = "Check [this link](https://example.com) and `inline code` for details";
        let result = strip_markdown(input);
        assert!(result.contains("this link"));
        assert!(!result.contains("https://example.com"));
        assert!(!result.contains("inline code"));
        assert!(result.contains("details"));
    }

    // Fewer notes than WINDOW_SIZE can never register drift, regardless of
    // how low the similarities are.
    #[test]
    fn test_too_few_notes() {
        let notes: Vec<NoteRow> = (0..2)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();
        let curve: Vec<SimilarityPoint> = [0.1, 0.1]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, _) = detect_drift(&curve, &notes, 0.4);
        assert!(!detected);
    }
}