Files
gitlore/src/cli/commands/drift.rs
teernisse ea6e45e43f refactor(who): make --limit optional (unlimited default) and fix clippy sort lints
Change the `who` command's --limit flag from default=20 to optional,
so omitting it returns all results. This matches the behavior users
expect when they want a complete expert/workload/active/overlap listing
without an arbitrary cap.

Also applies clippy-recommended sort improvements:
- who/reviews: sort_by(|a,b| b.count.cmp(&a.count)) -> sort_by_key with Reverse
- drift: same pattern for frequency sorting

Adds Theme::color_icon() helper to DRY the stage-icon coloring pattern
used in sync output (was inline closure, now shared method).
2026-02-18 16:27:59 -05:00

651 lines
21 KiB
Rust

use std::collections::HashMap;
use std::sync::LazyLock;
use regex::Regex;
use serde::Serialize;
use crate::cli::render::{Icons, Theme};
use crate::cli::robot::RobotMeta;
use crate::core::config::Config;
use crate::core::db::create_connection;
use crate::core::error::{LoreError, Result};
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::core::time::ms_to_iso;
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
use crate::embedding::similarity::cosine_similarity;
/// Number of texts sent to the embedding backend per request.
const BATCH_SIZE: usize = 32;
/// Sliding-window length (in notes) used for drift detection.
const WINDOW_SIZE: usize = 3;
/// Minimum issue-description length (bytes) required for meaningful analysis.
const MIN_DESCRIPTION_LEN: usize = 20;
/// Upper bound on notes fetched per issue (SQL LIMIT).
const MAX_NOTES: i64 = 200;
/// Maximum number of drift topics reported.
const TOP_TOPICS: usize = 3;
// ---------------------------------------------------------------------------
// Response types
// ---------------------------------------------------------------------------
/// JSON-serializable result of a drift analysis run.
#[derive(Debug, Serialize)]
pub struct DriftResponse {
    /// The entity (currently always an issue) that was analyzed.
    pub entity: DriftEntity,
    /// Whether any sliding window fell below the similarity threshold.
    pub drift_detected: bool,
    /// The similarity threshold the analysis was run with.
    pub threshold: f32,
    /// First note at which drift was detected; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub drift_point: Option<DriftPoint>,
    /// Most frequent post-drift terms not present in the description.
    pub drift_topics: Vec<String>,
    /// Per-note similarity against the issue description, oldest first.
    pub similarity_curve: Vec<SimilarityPoint>,
    /// Human-readable summary / suggested action.
    pub recommendation: String,
}
/// Identifies which entity a drift analysis refers to.
#[derive(Debug, Serialize)]
pub struct DriftEntity {
    /// Entity kind, e.g. "issues".
    pub entity_type: String,
    /// Project-scoped internal id (iid) of the entity.
    pub iid: i64,
    /// Entity title at analysis time.
    pub title: String,
}
/// The first note at which the discussion is judged to have drifted.
#[derive(Debug, Serialize)]
pub struct DriftPoint {
    /// Zero-based index of the note within the fetched (chronological) list.
    pub note_index: usize,
    /// Database id of the note.
    pub note_id: i64,
    /// Username of the note's author.
    pub author: String,
    /// Note creation time as an ISO-8601 string.
    pub created_at: String,
    /// Similarity of this note (not the window average) to the description.
    pub similarity: f32,
}
/// One point on the similarity curve: a note compared to the description.
#[derive(Debug, Serialize)]
pub struct SimilarityPoint {
    /// Zero-based index of the note within the fetched list.
    pub note_index: usize,
    /// Cosine similarity between the note body and the issue description.
    pub similarity: f32,
    /// Username of the note's author.
    pub author: String,
    /// Note creation time as an ISO-8601 string.
    pub created_at: String,
}
// ---------------------------------------------------------------------------
// Internal row types
// ---------------------------------------------------------------------------
/// Row shape returned by `find_issue`.
struct IssueInfo {
    // Primary key of the issue row.
    id: i64,
    // Project-scoped internal id.
    iid: i64,
    title: String,
    // May be NULL in the database, hence Option.
    description: Option<String>,
}
/// Row shape returned by `fetch_notes`.
struct NoteRow {
    // Primary key of the note row.
    id: i64,
    body: String,
    author_username: String,
    // Creation timestamp in milliseconds since the Unix epoch.
    created_at: i64,
}
// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------
/// Analyze topic drift in an issue's discussion thread.
///
/// Embeds the issue description and each substantive note, builds a
/// per-note similarity curve against the description, and flags drift
/// when a sliding window of `WINDOW_SIZE` consecutive notes falls below
/// `threshold` on average.
///
/// # Errors
/// Returns an error for unsupported entity types, a missing or ambiguous
/// issue lookup, database failures, or embedding-service failures.
pub async fn run_drift(
    config: &Config,
    entity_type: &str,
    iid: i64,
    threshold: f32,
    project: Option<&str>,
) -> Result<DriftResponse> {
    if entity_type != "issues" {
        return Err(LoreError::Other(
            "drift currently supports 'issues' only".to_string(),
        ));
    }
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;
    let issue = find_issue(&conn, iid, project)?;

    // Bail out early when the description is too short to embed meaningfully.
    let description = match &issue.description {
        Some(d) if d.len() >= MIN_DESCRIPTION_LEN => d.clone(),
        _ => {
            return Ok(no_drift_response(
                entity_type,
                issue.iid,
                issue.title,
                threshold,
                "Description too short for drift analysis.".to_string(),
            ));
        }
    };

    let notes = fetch_notes(&conn, issue.id)?;
    if notes.len() < WINDOW_SIZE {
        let recommendation = format!(
            "Only {} note(s) found; need at least {} for drift detection.",
            notes.len(),
            WINDOW_SIZE
        );
        return Ok(no_drift_response(
            entity_type,
            issue.iid,
            issue.title,
            threshold,
            recommendation,
        ));
    }

    // Build texts to embed: description first, then each note body.
    let mut texts: Vec<String> = Vec::with_capacity(1 + notes.len());
    texts.push(description.clone());
    texts.extend(notes.iter().map(|note| note.body.clone()));
    let embeddings = embed_texts(config, &texts).await?;
    let desc_embedding = &embeddings[0];
    let note_embeddings = &embeddings[1..];

    // Similarity of each note against the issue description.
    let similarity_curve: Vec<SimilarityPoint> = note_embeddings
        .iter()
        .enumerate()
        .map(|(i, emb)| SimilarityPoint {
            note_index: i,
            similarity: cosine_similarity(desc_embedding, emb),
            author: notes[i].author_username.clone(),
            created_at: ms_to_iso(notes[i].created_at),
        })
        .collect();

    // Detect drift via sliding window.
    let (drift_detected, drift_point) = detect_drift(&similarity_curve, &notes, threshold);

    // Topics are only meaningful once drift is established.
    let drift_topics = if drift_detected {
        let drift_idx = drift_point.as_ref().map_or(0, |dp| dp.note_index);
        extract_drift_topics(&description, &notes, drift_idx)
    } else {
        vec![]
    };

    // `drift_detected` is true exactly when `drift_point` is Some, so the
    // match expresses the same branch without an unwrap.
    let recommendation = match &drift_point {
        Some(dp) => format!(
            "Discussion drifted at note {} by @{} (similarity {:.2}). Consider splitting into a new issue.",
            dp.note_index, dp.author, dp.similarity
        ),
        None => "Discussion remains on topic.".to_string(),
    };

    Ok(DriftResponse {
        entity: DriftEntity {
            entity_type: entity_type.to_string(),
            iid: issue.iid,
            title: issue.title,
        },
        drift_detected,
        threshold,
        drift_point,
        drift_topics,
        similarity_curve,
        recommendation,
    })
}

/// Build the "no drift" response used by early-exit paths (description too
/// short, too few notes) so the struct literal is not duplicated.
fn no_drift_response(
    entity_type: &str,
    iid: i64,
    title: String,
    threshold: f32,
    recommendation: String,
) -> DriftResponse {
    DriftResponse {
        entity: DriftEntity {
            entity_type: entity_type.to_string(),
            iid,
            title,
        },
        drift_detected: false,
        threshold,
        drift_point: None,
        drift_topics: vec![],
        similarity_curve: vec![],
        recommendation,
    }
}
// ---------------------------------------------------------------------------
// DB helpers
// ---------------------------------------------------------------------------
/// Look up a single issue by `iid`, optionally scoped to one project.
///
/// Without a project filter the iid may match issues in several projects;
/// that case is reported as an ambiguity error rather than silently
/// picking one.
///
/// # Errors
/// `NotFound` when no issue matches, `Ambiguous` when more than one does,
/// plus any database or project-resolution error.
fn find_issue(
    conn: &rusqlite::Connection,
    iid: i64,
    project_filter: Option<&str>,
) -> Result<IssueInfo> {
    // Shared row mapper for both query shapes.
    fn row_to_issue(row: &rusqlite::Row<'_>) -> rusqlite::Result<IssueInfo> {
        Ok(IssueInfo {
            id: row.get(0)?,
            iid: row.get(1)?,
            title: row.get(2)?,
            description: row.get(3)?,
        })
    }
    // Branch on the filter directly with statically-typed params instead of
    // building a Vec<Box<dyn ToSql>> and re-borrowing it.
    let rows: Vec<IssueInfo> = match project_filter {
        Some(project) => {
            let project_id = resolve_project(conn, project)?;
            let mut stmt = conn.prepare(
                "SELECT i.id, i.iid, i.title, i.description
                 FROM issues i
                 WHERE i.iid = ? AND i.project_id = ?",
            )?;
            stmt.query_map(rusqlite::params![iid, project_id], row_to_issue)?
                .collect::<std::result::Result<Vec<_>, _>>()?
        }
        None => {
            let mut stmt = conn.prepare(
                "SELECT i.id, i.iid, i.title, i.description
                 FROM issues i
                 WHERE i.iid = ?",
            )?;
            stmt.query_map(rusqlite::params![iid], row_to_issue)?
                .collect::<std::result::Result<Vec<_>, _>>()?
        }
    };
    match rows.len() {
        0 => Err(LoreError::NotFound(format!("Issue #{iid} not found"))),
        1 => Ok(rows.into_iter().next().unwrap()),
        _ => Err(LoreError::Ambiguous(format!(
            "Issue #{iid} exists in multiple projects. Use --project to specify."
        ))),
    }
}
/// Load up to `MAX_NOTES` substantive notes for an issue, oldest first.
///
/// System notes and bodies shorter than 20 characters are filtered out in
/// SQL so the embedding pass only sees meaningful discussion text.
fn fetch_notes(conn: &rusqlite::Connection, issue_id: i64) -> Result<Vec<NoteRow>> {
    let sql = "SELECT n.id, n.body, n.author_username, n.created_at
         FROM notes n
         JOIN discussions d ON n.discussion_id = d.id
         WHERE d.issue_id = ?
           AND n.is_system = 0
           AND LENGTH(n.body) >= 20
         ORDER BY n.created_at ASC
         LIMIT ?";
    let mut stmt = conn.prepare(sql)?;
    let mapped = stmt.query_map(rusqlite::params![issue_id, MAX_NOTES], |row| {
        Ok(NoteRow {
            id: row.get(0)?,
            body: row.get(1)?,
            author_username: row.get(2)?,
            created_at: row.get(3)?,
        })
    })?;
    Ok(mapped.collect::<std::result::Result<Vec<NoteRow>, _>>()?)
}
// ---------------------------------------------------------------------------
// Embedding helper
// ---------------------------------------------------------------------------
/// Embed every string in `texts` via the configured Ollama endpoint.
///
/// Requests are issued in chunks of `BATCH_SIZE` to bound per-call payload
/// size; results are returned in input order.
async fn embed_texts(config: &Config, texts: &[String]) -> Result<Vec<Vec<f32>>> {
    let client = OllamaClient::new(OllamaConfig {
        base_url: config.embedding.base_url.clone(),
        model: config.embedding.model.clone(),
        timeout_secs: 60,
    });
    let mut embeddings: Vec<Vec<f32>> = Vec::with_capacity(texts.len());
    for batch in texts.chunks(BATCH_SIZE) {
        let batch_refs: Vec<&str> = batch.iter().map(String::as_str).collect();
        embeddings.extend(client.embed_batch(&batch_refs).await?);
    }
    Ok(embeddings)
}
// ---------------------------------------------------------------------------
// Drift detection
// ---------------------------------------------------------------------------
/// Scan the similarity curve with a sliding window of `WINDOW_SIZE` notes.
///
/// Drift is declared at the first window whose average similarity falls
/// below `threshold`. The drift point reports the first note of that
/// window, carrying that note's own similarity (not the window average).
fn detect_drift(
    curve: &[SimilarityPoint],
    notes: &[NoteRow],
    threshold: f32,
) -> (bool, Option<DriftPoint>) {
    if curve.len() < WINDOW_SIZE {
        return (false, None);
    }
    // Index of the first window whose mean similarity drops below threshold.
    let first_low = curve.windows(WINDOW_SIZE).position(|window| {
        let avg = window.iter().map(|p| p.similarity).sum::<f32>() / WINDOW_SIZE as f32;
        avg < threshold
    });
    match first_low {
        Some(i) => (
            true,
            Some(DriftPoint {
                note_index: i,
                note_id: notes[i].id,
                author: notes[i].author_username.clone(),
                created_at: ms_to_iso(notes[i].created_at),
                similarity: curve[i].similarity,
            }),
        ),
        None => (false, None),
    }
}
// ---------------------------------------------------------------------------
// Topic extraction
// ---------------------------------------------------------------------------
// English stopwords — plus a few discussion-generic verbs ("think", "know",
// "make", ...) — excluded from topic extraction. Built once on first use.
static STOPWORDS: LazyLock<std::collections::HashSet<&'static str>> = LazyLock::new(|| {
    [
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
        "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can",
        "need", "dare", "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
        "as", "into", "through", "during", "before", "after", "above", "below", "between", "out",
        "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
        "where", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other",
        "some", "such", "no", "not", "only", "own", "same", "so", "than", "too", "very", "just",
        "because", "but", "and", "or", "if", "while", "about", "up", "it", "its", "this", "that",
        "these", "those", "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she",
        "her", "they", "them", "their", "what", "which", "who", "whom", "also", "like", "get",
        "got", "think", "know", "see", "make", "go", "one", "two", "new", "way",
    ]
    .into_iter()
    .collect()
});
/// Split text into lowercase terms of at least 3 bytes, after stripping
/// markdown noise and dropping stopwords. Underscores count as word
/// characters so identifiers like `user_id` survive intact.
fn tokenize(text: &str) -> Vec<String> {
    strip_markdown(text)
        .split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|word| word.len() >= 3)
        .map(str::to_lowercase)
        .filter(|word| !STOPWORDS.contains(word.as_str()))
        .collect()
}
/// Find the most frequent terms in notes from `drift_idx` onward that do
/// NOT appear in the issue description — these characterize what the
/// discussion drifted toward. Returns at most `TOP_TOPICS` terms.
fn extract_drift_topics(description: &str, notes: &[NoteRow], drift_idx: usize) -> Vec<String> {
    let desc_terms: std::collections::HashSet<String> =
        tokenize(description).into_iter().collect();
    let mut freq: HashMap<String, usize> = HashMap::new();
    for note in notes.iter().skip(drift_idx) {
        for term in tokenize(&note.body) {
            if !desc_terms.contains(&term) {
                *freq.entry(term).or_insert(0) += 1;
            }
        }
    }
    let mut sorted: Vec<(String, usize)> = freq.into_iter().collect();
    // Sort by descending count, breaking ties alphabetically so the selected
    // topics are deterministic (HashMap iteration order varies per run, so a
    // count-only sort could report different topics for equal frequencies).
    sorted.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    sorted
        .into_iter()
        .take(TOP_TOPICS)
        .map(|(term, _)| term)
        .collect()
}
// ---------------------------------------------------------------------------
// Markdown stripping
// ---------------------------------------------------------------------------
// Fenced code blocks: ``` … ``` including the info string; (?s) lets `.`
// span newlines so multi-line bodies are matched.
static RE_FENCED_CODE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap());
// Inline code spans delimited by single backticks.
static RE_INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap());
// Markdown links [text](url); group 1 captures the visible text.
static RE_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]+\)").unwrap());
// Leading blockquote markers ("> ") at the start of each line.
static RE_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^>\s?").unwrap());
// Raw HTML tags such as <br> or <details>.
static RE_HTML_TAG: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
/// Remove markdown constructs that would pollute term extraction: code
/// (fenced and inline), link URLs (keeping the link text), blockquote
/// markers, and raw HTML tags. Fenced blocks are stripped before inline
/// backticks so fence delimiters are not misread as inline code.
fn strip_markdown(text: &str) -> String {
    let no_fences = RE_FENCED_CODE.replace_all(text, "");
    let no_inline = RE_INLINE_CODE.replace_all(&no_fences, "");
    let links_as_text = RE_LINK.replace_all(&no_inline, "$1");
    let no_quotes = RE_BLOCKQUOTE.replace_all(&links_as_text, "");
    RE_HTML_TAG.replace_all(&no_quotes, "").into_owned()
}
// ---------------------------------------------------------------------------
// Printers
// ---------------------------------------------------------------------------
/// Render a drift analysis for a human terminal: header, verdict line,
/// recommendation, and an ASCII bar chart of the similarity curve.
pub fn print_drift_human(response: &DriftResponse) {
    let header = format!(
        "Drift Analysis: {} #{}",
        response.entity.entity_type, response.entity.iid
    );
    println!("{}", Theme::bold().render(&header));
    // Underline no wider than 60 columns even for very long titles.
    println!("{}", "-".repeat(header.len().min(60)));
    println!("Title: {}", response.entity.title);
    println!("Threshold: {:.2}", response.threshold);
    println!("Notes: {}", response.similarity_curve.len());
    println!();
    if response.drift_detected {
        println!(
            "{} {}",
            Theme::error().render(Icons::error()),
            Theme::error().bold().render("DRIFT DETECTED")
        );
        if let Some(dp) = &response.drift_point {
            println!(
                "  At note #{} by @{} ({}) - similarity {:.2}",
                dp.note_index, dp.author, dp.created_at, dp.similarity
            );
        }
        if !response.drift_topics.is_empty() {
            println!("  Topics: {}", response.drift_topics.join(", "));
        }
    } else {
        println!(
            "{} {}",
            Theme::success().render(Icons::success()),
            Theme::success().render("No drift detected")
        );
    }
    println!();
    println!("{}", response.recommendation);
    if !response.similarity_curve.is_empty() {
        println!();
        println!("{}", Theme::bold().render("Similarity Curve:"));
        for pt in &response.similarity_curve {
            // Scale similarity (clamped at 0) to a bar of up to 30 blocks.
            let bar_len = ((pt.similarity.max(0.0)) * 30.0) as usize;
            let bar: String = "\u{2588}".repeat(bar_len);
            println!(
                "  {:>3}  {:.2}  {} @{}",
                pt.note_index, pt.similarity, bar, pt.author
            );
        }
    }
}
/// Emit the drift analysis as a single-line JSON envelope
/// (`{"ok": true, "data": …, "meta": …}`) for machine consumers.
pub fn print_drift_json(response: &DriftResponse, elapsed_ms: u64) {
    let payload = serde_json::json!({
        "ok": true,
        "data": response,
        "meta": RobotMeta { elapsed_ms },
    });
    match serde_json::to_string(&payload) {
        Ok(line) => println!("{line}"),
        Err(e) => eprintln!("Error serializing to JSON: {e}"),
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    // A steep similarity drop should trip detection and yield a drift point.
    #[test]
    fn test_detect_drift_when_divergent() {
        let notes: Vec<NoteRow> = (0..6)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();
        let curve: Vec<SimilarityPoint> = [0.9, 0.85, 0.8, 0.25, 0.2, 0.15]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, point) = detect_drift(&curve, &notes, 0.4);
        assert!(detected);
        assert!(point.is_some());
    }

    // A gently declining but above-threshold curve must not trigger drift.
    #[test]
    fn test_no_drift_consistent() {
        let notes: Vec<NoteRow> = (0..5)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();
        let curve: Vec<SimilarityPoint> = [0.85, 0.8, 0.75, 0.7, 0.65]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, _) = detect_drift(&curve, &notes, 0.4);
        assert!(!detected);
    }

    // The reported drift point must be the start of the FIRST failing window.
    #[test]
    fn test_drift_point_is_first_divergent() {
        let notes: Vec<NoteRow> = (0..5)
            .map(|i| NoteRow {
                // Distinct ids (i * 10) so the assertion below can verify
                // the correct note was selected, not just the index.
                id: (i * 10) as i64,
                body: format!("note {i}"),
                author_username: format!("user{i}"),
                created_at: 1000 + i as i64,
            })
            .collect();
        // Window of 3: indices [0,1,2] avg=0.83, [1,2,3] avg=0.55, [2,3,4] avg=0.23
        let curve: Vec<SimilarityPoint> = [0.9, 0.8, 0.8, 0.05, 0.05]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: format!("user{i}"),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, point) = detect_drift(&curve, &notes, 0.4);
        assert!(detected);
        let dp = point.unwrap();
        // Window [2,3,4] avg = (0.8+0.05+0.05)/3 = 0.3 < 0.4
        // But [1,2,3] avg = (0.8+0.8+0.05)/3 = 0.55 >= 0.4, so first failing is index 2
        assert_eq!(dp.note_index, 2);
        assert_eq!(dp.note_id, 20);
    }

    // Topics must come only from note vocabulary absent from the description.
    #[test]
    fn test_extract_drift_topics_excludes_description_terms() {
        let description = "We need to fix the authentication flow for login users";
        let notes = vec![
            NoteRow {
                id: 1,
                body: "The database migration script is broken and needs postgres update"
                    .to_string(),
                author_username: "dev".to_string(),
                created_at: 1000,
            },
            NoteRow {
                id: 2,
                body: "The database connection pool also has migration issues with postgres"
                    .to_string(),
                author_username: "dev".to_string(),
                created_at: 2000,
            },
        ];
        let topics = extract_drift_topics(description, &notes, 0);
        // "database", "migration", "postgres" should appear; "fix" should not (it's in description)
        assert!(!topics.is_empty());
        for t in &topics {
            assert_ne!(t, "fix");
            assert_ne!(t, "authentication");
            assert_ne!(t, "login");
        }
    }

    // Fenced code bodies must disappear; surrounding prose must survive.
    #[test]
    fn test_strip_markdown_code_blocks() {
        let input = "Before\n```rust\nfn main() {}\n```\nAfter";
        let result = strip_markdown(input);
        assert!(!result.contains("fn main"));
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
    }

    // Link text is kept, link URLs and inline code are dropped.
    #[test]
    fn test_strip_markdown_preserves_text() {
        let input = "Check [this link](https://example.com) and `inline code` for details";
        let result = strip_markdown(input);
        assert!(result.contains("this link"));
        assert!(!result.contains("https://example.com"));
        assert!(!result.contains("inline code"));
        assert!(result.contains("details"));
    }

    // Fewer notes than WINDOW_SIZE can never register drift, regardless of
    // how low the similarities are.
    #[test]
    fn test_too_few_notes() {
        let notes: Vec<NoteRow> = (0..2)
            .map(|i| NoteRow {
                id: i as i64,
                body: format!("note {i}"),
                author_username: "user".to_string(),
                created_at: 1000 + i as i64,
            })
            .collect();
        let curve: Vec<SimilarityPoint> = [0.1, 0.1]
            .iter()
            .enumerate()
            .map(|(i, &sim)| SimilarityPoint {
                note_index: i,
                similarity: sim,
                author: "user".to_string(),
                created_at: ms_to_iso(1000 + i as i64),
            })
            .collect();
        let (detected, _) = detect_drift(&curve, &notes, 0.4);
        assert!(!detected);
    }
}