// gitlore/src/documents/extractor.rs

use chrono::DateTime;
use rusqlite::Connection;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::BTreeSet;

use super::truncation::{
    MAX_DISCUSSION_BYTES, NoteContent, truncate_discussion, truncate_hard_cap,
};
use crate::core::error::Result;

/// Source type for documents.
///
/// Identifies which kind of entity a [`DocumentData`] was generated from.
/// With `rename_all = "snake_case"` serde reads and writes the variants as
/// `issue`, `merge_request`, and `discussion`, the same spellings that
/// `SourceType::as_str` returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SourceType {
    /// Document built from a row of the `issues` table.
    Issue,
    /// Document built from a row of the `merge_requests` table.
    MergeRequest,
    /// Document built from a discussion thread attached to an issue or merge request.
    Discussion,
}

impl SourceType {
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Issue => "issue",
            Self::MergeRequest => "merge_request",
            Self::Discussion => "discussion",
        }
    }

    /// Parse from CLI input, accepting common aliases.
    ///
    /// Accepts: "issue", "issues", "mr", "mrs", "merge_request", "merge_requests",
    /// "discussion", "discussions"
    pub fn parse(s: &str) -> Option<Self> {
        match s.to_lowercase().as_str() {
            "issue" | "issues" => Some(Self::Issue),
            "mr" | "mrs" | "merge_request" | "merge_requests" => Some(Self::MergeRequest),
            "discussion" | "discussions" => Some(Self::Discussion),
            _ => None,
        }
    }
}

/// Displays the canonical name returned by `SourceType::as_str`.
impl std::fmt::Display for SourceType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The name is written verbatim; width/fill flags on the caller's
        // format spec are not applied.
        f.write_str(self.as_str())
    }
}

/// Generated document ready for storage.
///
/// Produced by `extract_issue_document`, `extract_mr_document`, and
/// `extract_discussion_document`. Holds the rendered text plus the metadata
/// and hashes derived from it.
#[derive(Debug, Clone)]
pub struct DocumentData {
    /// Kind of entity this document was generated from.
    pub source_type: SourceType,
    /// Row id (`id` column) of the source issue, merge request, or discussion.
    pub source_id: i64,
    /// Row id of the project the source entity belongs to.
    pub project_id: i64,
    /// Author of the issue / merge request; for discussions, the author of the
    /// first non-system note.
    pub author_username: Option<String>,
    /// Label names ordered by name; discussions carry the labels of their parent.
    pub labels: Vec<String>,
    /// Sorted, de-duplicated non-empty old/new position paths of a discussion's
    /// notes. Always empty for issues and merge requests.
    pub paths: Vec<String>,
    /// `compute_list_hash` of `labels`.
    pub labels_hash: String,
    /// `compute_list_hash` of `paths`.
    pub paths_hash: String,
    /// `created_at` of the source row (0 when a merge request has NULL there);
    /// for discussions, `created_at` of the first note.
    pub created_at: i64,
    /// `updated_at` of the source row (0 when a merge request has NULL there);
    /// for discussions, `created_at` of the last note.
    pub updated_at: i64,
    /// Web URL of the source entity; for discussions, the parent's URL with a
    /// `#note_<gitlab_id>` fragment pointing at the first note.
    pub url: Option<String>,
    /// Title of the issue / merge request (`"(untitled)"` when the row has none);
    /// `None` for discussions.
    pub title: Option<String>,
    /// Rendered document text (after truncation, if any was applied).
    pub content_text: String,
    /// `compute_content_hash` of `content_text`.
    pub content_hash: String,
    /// Whether the truncation step reported that content was cut.
    pub is_truncated: bool,
    /// String form of the truncation reason, when one was reported.
    pub truncated_reason: Option<String>,
}

/// Hash `content` with SHA-256 and return the digest as lowercase hex.
pub fn compute_content_hash(content: &str) -> String {
    // One-shot digest over the UTF-8 bytes of the text.
    let digest = Sha256::digest(content.as_bytes());
    format!("{:x}", digest)
}

/// Order-independent SHA-256 over a list of strings, as lowercase hex.
/// Used for labels_hash and paths_hash to detect changes efficiently.
/// The items are sorted as borrowed `&str` (no cloning) and fed to the hasher
/// one by one with a single `\n` between consecutive items (no joined buffer).
pub fn compute_list_hash(items: &[String]) -> String {
    let mut ordered: Vec<&str> = items.iter().map(String::as_str).collect();
    ordered.sort_unstable();

    let mut hasher = Sha256::new();
    let mut remaining = ordered.into_iter();
    if let Some(first) = remaining.next() {
        hasher.update(first.as_bytes());
        // Every later item is preceded by the separator.
        for item in remaining {
            hasher.update(b"\n");
            hasher.update(item.as_bytes());
        }
    }
    format!("{:x}", hasher.finalize())
}

/// Build the searchable document for a single issue.
///
/// Loads the issue joined with its project, collects the issue's label names
/// ordered by name, and renders a header (title, project, URL, labels, state,
/// author) followed by an optional description section. The rendered text is
/// passed through `truncate_hard_cap`, and the content hash is computed over
/// the text that is actually stored.
///
/// Returns `Ok(None)` when the query finds no row for `issue_id`.
pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option<DocumentData>> {
    /// Columns of the issue/project join, in SELECT order.
    struct IssueRow {
        id: i64,
        iid: i64,
        title: Option<String>,
        description: Option<String>,
        state: String,
        author_username: Option<String>,
        created_at: i64,
        updated_at: i64,
        web_url: Option<String>,
        path_with_namespace: String,
        project_id: i64,
    }

    // Main entity plus project info; a missing row means "nothing to index".
    let issue = match conn.query_row(
        "SELECT i.id, i.iid, i.title, i.description, i.state, i.author_username,
                i.created_at, i.updated_at, i.web_url,
                p.path_with_namespace, p.id AS project_id
         FROM issues i
         JOIN projects p ON p.id = i.project_id
         WHERE i.id = ?1",
        rusqlite::params![issue_id],
        |row| {
            Ok(IssueRow {
                id: row.get(0)?,
                iid: row.get(1)?,
                title: row.get(2)?,
                description: row.get(3)?,
                state: row.get(4)?,
                author_username: row.get(5)?,
                created_at: row.get(6)?,
                updated_at: row.get(7)?,
                web_url: row.get(8)?,
                path_with_namespace: row.get(9)?,
                project_id: row.get(10)?,
            })
        },
    ) {
        Ok(found) => found,
        Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
        Err(other) => return Err(other.into()),
    };

    // Label names come from the junction table, already sorted by the query.
    let mut names_stmt = conn.prepare_cached(
        "SELECT l.name FROM issue_labels il
         JOIN labels l ON l.id = il.label_id
         WHERE il.issue_id = ?1
         ORDER BY l.name",
    )?;
    let mut labels: Vec<String> = Vec::new();
    for name in names_stmt.query_map(rusqlite::params![issue.id], |row| row.get(0))? {
        labels.push(name?);
    }

    // Labels are rendered as a JSON array in the header.
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let display_title = issue.title.as_deref().unwrap_or("(untitled)");

    // Header per the PRD template; optional lines are emitted only when present.
    let mut content = format!(
        "[[Issue]] #{}: {}\nProject: {}\n",
        issue.iid, display_title, issue.path_with_namespace
    );
    if let Some(url) = issue.web_url.as_deref() {
        content += &format!("URL: {}\n", url);
    }
    content += &format!("Labels: {}\n", labels_json);
    content += &format!("State: {}\n", issue.state);
    if let Some(author) = issue.author_username.as_deref() {
        content += &format!("Author: @{}\n", author);
    }

    // The description section appears whenever the column is non-NULL,
    // even if the text itself is empty.
    if let Some(body) = issue.description.as_deref() {
        content += "\n--- Description ---\n\n";
        content += body;
    }

    // Hard cap first, then hash what will actually be stored.
    let capped = truncate_hard_cap(&content);
    let content_hash = compute_content_hash(&capped.content);

    let labels_hash = compute_list_hash(&labels);
    let paths_hash = compute_list_hash(&[]); // Issues have no paths

    Ok(Some(DocumentData {
        source_type: SourceType::Issue,
        source_id: issue.id,
        project_id: issue.project_id,
        author_username: issue.author_username,
        labels,
        paths: Vec::new(),
        labels_hash,
        paths_hash,
        created_at: issue.created_at,
        updated_at: issue.updated_at,
        url: issue.web_url,
        title: Some(display_title.to_string()),
        content_text: capped.content,
        content_hash,
        is_truncated: capped.is_truncated,
        truncated_reason: capped.reason.map(|why| why.as_str().to_string()),
    }))
}

/// Build the searchable document for a single merge request.
///
/// Loads the merge request joined with its project, collects its label names
/// ordered by name, and renders a header (title, project, URL, labels, state,
/// author, source/target branches) followed by an optional description
/// section. The rendered text is passed through `truncate_hard_cap`, and the
/// content hash is computed over the text that is actually stored.
///
/// Returns `Ok(None)` when the query finds no row for `mr_id`.
pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<DocumentData>> {
    /// Columns of the merge-request/project join, in SELECT order.
    struct MrRow {
        id: i64,
        iid: i64,
        title: Option<String>,
        description: Option<String>,
        state: Option<String>,
        author_username: Option<String>,
        source_branch: Option<String>,
        target_branch: Option<String>,
        created_at: Option<i64>, // nullable in schema
        updated_at: Option<i64>, // nullable in schema
        web_url: Option<String>,
        path_with_namespace: String,
        project_id: i64,
    }

    // A missing row means "nothing to index"; any other error propagates.
    let mr = match conn.query_row(
        "SELECT m.id, m.iid, m.title, m.description, m.state, m.author_username,
                m.source_branch, m.target_branch,
                m.created_at, m.updated_at, m.web_url,
                p.path_with_namespace, p.id AS project_id
         FROM merge_requests m
         JOIN projects p ON p.id = m.project_id
         WHERE m.id = ?1",
        rusqlite::params![mr_id],
        |row| {
            Ok(MrRow {
                id: row.get(0)?,
                iid: row.get(1)?,
                title: row.get(2)?,
                description: row.get(3)?,
                state: row.get(4)?,
                author_username: row.get(5)?,
                source_branch: row.get(6)?,
                target_branch: row.get(7)?,
                created_at: row.get(8)?,
                updated_at: row.get(9)?,
                web_url: row.get(10)?,
                path_with_namespace: row.get(11)?,
                project_id: row.get(12)?,
            })
        },
    ) {
        Ok(found) => found,
        Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
        Err(other) => return Err(other.into()),
    };

    // Label names come from the junction table, already sorted by the query.
    let mut names_stmt = conn.prepare_cached(
        "SELECT l.name FROM mr_labels ml
         JOIN labels l ON l.id = ml.label_id
         WHERE ml.merge_request_id = ?1
         ORDER BY l.name",
    )?;
    let mut labels: Vec<String> = Vec::new();
    for name in names_stmt.query_map(rusqlite::params![mr.id], |row| row.get(0))? {
        labels.push(name?);
    }

    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());

    // Placeholders for the nullable title and state columns.
    let display_title = mr.title.as_deref().unwrap_or("(untitled)");
    let display_state = mr.state.as_deref().unwrap_or("unknown");

    let mut content = format!(
        "[[MergeRequest]] !{}: {}\nProject: {}\n",
        mr.iid, display_title, mr.path_with_namespace
    );
    if let Some(url) = mr.web_url.as_deref() {
        content += &format!("URL: {}\n", url);
    }
    content += &format!("Labels: {}\n", labels_json);
    content += &format!("State: {}\n", display_state);
    if let Some(author) = mr.author_username.as_deref() {
        content += &format!("Author: @{}\n", author);
    }
    // The branch line is emitted only when both branch names are known.
    if let (Some(src), Some(tgt)) = (mr.source_branch.as_deref(), mr.target_branch.as_deref()) {
        content += &format!("Source: {} -> {}\n", src, tgt);
    }

    // The description section appears whenever the column is non-NULL.
    if let Some(body) = mr.description.as_deref() {
        content += "\n--- Description ---\n\n";
        content += body;
    }

    // Hard cap first, then hash what will actually be stored.
    let capped = truncate_hard_cap(&content);
    let content_hash = compute_content_hash(&capped.content);

    let labels_hash = compute_list_hash(&labels);
    let paths_hash = compute_list_hash(&[]);

    Ok(Some(DocumentData {
        source_type: SourceType::MergeRequest,
        source_id: mr.id,
        project_id: mr.project_id,
        author_username: mr.author_username,
        labels,
        paths: Vec::new(),
        labels_hash,
        paths_hash,
        // NULL timestamps are stored as 0.
        created_at: mr.created_at.unwrap_or(0),
        updated_at: mr.updated_at.unwrap_or(0),
        url: mr.web_url,
        title: Some(display_title.to_string()),
        content_text: capped.content,
        content_hash,
        is_truncated: capped.is_truncated,
        truncated_reason: capped.reason.map(|why| why.as_str().to_string()),
    }))
}

/// Render a millisecond Unix timestamp as a `YYYY-MM-DD` date string.
///
/// Falls back to `"unknown"` when chrono cannot represent the timestamp.
fn format_date(ms: i64) -> String {
    match DateTime::from_timestamp_millis(ms) {
        Some(moment) => moment.format("%Y-%m-%d").to_string(),
        None => "unknown".to_string(),
    }
}

/// Extract a searchable document from a discussion thread.
/// Returns None if the discussion or its parent has been deleted.
pub fn extract_discussion_document(
    conn: &Connection,
    discussion_id: i64,
) -> Result<Option<DocumentData>> {
    // Query discussion metadata
    let disc_row = conn.query_row(
        "SELECT d.id, d.noteable_type, d.issue_id, d.merge_request_id,
                p.path_with_namespace, p.id AS project_id
         FROM discussions d
         JOIN projects p ON p.id = d.project_id
         WHERE d.id = ?1",
        rusqlite::params![discussion_id],
        |row| {
            Ok((
                row.get::<_, i64>(0)?,         // id
                row.get::<_, String>(1)?,      // noteable_type
                row.get::<_, Option<i64>>(2)?, // issue_id
                row.get::<_, Option<i64>>(3)?, // merge_request_id
                row.get::<_, String>(4)?,      // path_with_namespace
                row.get::<_, i64>(5)?,         // project_id
            ))
        },
    );

    let (id, noteable_type, issue_id, merge_request_id, path_with_namespace, project_id) =
        match disc_row {
            Ok(r) => r,
            Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
            Err(e) => return Err(e.into()),
        };

    // Query parent entity
    let (_parent_iid, parent_title, parent_web_url, parent_type_prefix, labels) =
        match noteable_type.as_str() {
            "Issue" => {
                let parent_id = match issue_id {
                    Some(pid) => pid,
                    None => return Ok(None),
                };
                let parent = conn.query_row(
                    "SELECT i.iid, i.title, i.web_url FROM issues i WHERE i.id = ?1",
                    rusqlite::params![parent_id],
                    |row| {
                        Ok((
                            row.get::<_, i64>(0)?,
                            row.get::<_, Option<String>>(1)?,
                            row.get::<_, Option<String>>(2)?,
                        ))
                    },
                );
                let (iid, title, web_url) = match parent {
                    Ok(r) => r,
                    Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
                    Err(e) => return Err(e.into()),
                };
                // Query parent labels
                let mut label_stmt = conn.prepare_cached(
                    "SELECT l.name FROM issue_labels il
                     JOIN labels l ON l.id = il.label_id
                     WHERE il.issue_id = ?1
                     ORDER BY l.name",
                )?;
                let labels: Vec<String> = label_stmt
                    .query_map(rusqlite::params![parent_id], |row| row.get(0))?
                    .collect::<std::result::Result<Vec<_>, _>>()?;

                (iid, title, web_url, format!("Issue #{}", iid), labels)
            }
            "MergeRequest" => {
                let parent_id = match merge_request_id {
                    Some(pid) => pid,
                    None => return Ok(None),
                };
                let parent = conn.query_row(
                    "SELECT m.iid, m.title, m.web_url FROM merge_requests m WHERE m.id = ?1",
                    rusqlite::params![parent_id],
                    |row| {
                        Ok((
                            row.get::<_, i64>(0)?,
                            row.get::<_, Option<String>>(1)?,
                            row.get::<_, Option<String>>(2)?,
                        ))
                    },
                );
                let (iid, title, web_url) = match parent {
                    Ok(r) => r,
                    Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
                    Err(e) => return Err(e.into()),
                };
                // Query parent labels
                let mut label_stmt = conn.prepare_cached(
                    "SELECT l.name FROM mr_labels ml
                     JOIN labels l ON l.id = ml.label_id
                     WHERE ml.merge_request_id = ?1
                     ORDER BY l.name",
                )?;
                let labels: Vec<String> = label_stmt
                    .query_map(rusqlite::params![parent_id], |row| row.get(0))?
                    .collect::<std::result::Result<Vec<_>, _>>()?;

                (iid, title, web_url, format!("MR !{}", iid), labels)
            }
            _ => return Ok(None),
        };

    // Query non-system notes in thread order
    let mut note_stmt = conn.prepare_cached(
        "SELECT n.author_username, n.body, n.created_at, n.gitlab_id,
                n.note_type, n.position_old_path, n.position_new_path
         FROM notes n
         WHERE n.discussion_id = ?1 AND n.is_system = 0
         ORDER BY n.created_at ASC, n.id ASC",
    )?;

    struct NoteRow {
        author: Option<String>,
        body: Option<String>,
        created_at: i64,
        gitlab_id: i64,
        old_path: Option<String>,
        new_path: Option<String>,
    }

    let notes: Vec<NoteRow> = note_stmt
        .query_map(rusqlite::params![id], |row| {
            Ok(NoteRow {
                author: row.get(0)?,
                body: row.get(1)?,
                created_at: row.get(2)?,
                gitlab_id: row.get(3)?,
                // index 4 is note_type (unused here)
                old_path: row.get(5)?,
                new_path: row.get(6)?,
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    if notes.is_empty() {
        return Ok(None);
    }

    // Extract DiffNote paths (deduplicated, sorted)
    let mut path_set = BTreeSet::new();
    for note in &notes {
        if let Some(ref p) = note.old_path
            && !p.is_empty()
        {
            path_set.insert(p.clone());
        }
        if let Some(ref p) = note.new_path
            && !p.is_empty()
        {
            path_set.insert(p.clone());
        }
    }
    let paths: Vec<String> = path_set.into_iter().collect();

    // Construct URL: parent_web_url#note_{first_note_gitlab_id}
    let first_note_gitlab_id = notes[0].gitlab_id;
    let url = parent_web_url
        .as_ref()
        .map(|wu| format!("{}#note_{}", wu, first_note_gitlab_id));

    // First non-system note author
    let author_username = notes[0].author.clone();

    // Build content
    let display_title = parent_title.as_deref().unwrap_or("(untitled)");
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());

    let mut content = format!(
        "[[Discussion]] {}: {}\nProject: {}\n",
        parent_type_prefix, display_title, path_with_namespace
    );
    if let Some(ref u) = url {
        content.push_str(&format!("URL: {}\n", u));
    }
    content.push_str(&format!("Labels: {}\n", labels_json));
    if !paths.is_empty() {
        content.push_str(&format!("Files: {}\n", paths_json));
    }

    // Build NoteContent list for truncation-aware thread rendering
    let note_contents: Vec<NoteContent> = notes
        .iter()
        .map(|note| NoteContent {
            author: note.author.as_deref().unwrap_or("unknown").to_string(),
            date: format_date(note.created_at),
            body: note.body.as_deref().unwrap_or("").to_string(),
        })
        .collect();

    // Estimate header size to reserve budget for thread content
    let header_len = content.len() + "\n--- Thread ---\n\n".len();
    let thread_budget = MAX_DISCUSSION_BYTES.saturating_sub(header_len);

    let thread_result = truncate_discussion(&note_contents, thread_budget);
    content.push_str("\n--- Thread ---\n\n");
    content.push_str(&thread_result.content);

    // Use first note's created_at and last note's created_at for timestamps
    let created_at = notes[0].created_at;
    let updated_at = notes.last().map(|n| n.created_at).unwrap_or(created_at);

    let content_hash = compute_content_hash(&content);
    let labels_hash = compute_list_hash(&labels);
    let paths_hash = compute_list_hash(&paths);

    Ok(Some(DocumentData {
        source_type: SourceType::Discussion,
        source_id: id,
        project_id,
        author_username,
        labels,
        paths,
        labels_hash,
        paths_hash,
        created_at,
        updated_at,
        url,
        title: None, // Discussions don't have their own title
        content_text: content,
        content_hash,
        is_truncated: thread_result.is_truncated,
        truncated_reason: thread_result.reason.map(|r| r.as_str().to_string()),
    }))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_source_type_parse_aliases() {
        assert_eq!(SourceType::parse("issue"), Some(SourceType::Issue));
        assert_eq!(SourceType::parse("issues"), Some(SourceType::Issue));
        assert_eq!(SourceType::parse("mr"), Some(SourceType::MergeRequest));
        assert_eq!(SourceType::parse("mrs"), Some(SourceType::MergeRequest));
        assert_eq!(
            SourceType::parse("merge_request"),
            Some(SourceType::MergeRequest)
        );
        assert_eq!(
            SourceType::parse("merge_requests"),
            Some(SourceType::MergeRequest)
        );
        assert_eq!(
            SourceType::parse("discussion"),
            Some(SourceType::Discussion)
        );
        assert_eq!(
            SourceType::parse("discussions"),
            Some(SourceType::Discussion)
        );
        assert_eq!(SourceType::parse("invalid"), None);
        assert_eq!(SourceType::parse("ISSUE"), Some(SourceType::Issue)); // case insensitive
    }

    #[test]
    fn test_source_type_as_str() {
        assert_eq!(SourceType::Issue.as_str(), "issue");
        assert_eq!(SourceType::MergeRequest.as_str(), "merge_request");
        assert_eq!(SourceType::Discussion.as_str(), "discussion");
    }

    #[test]
    fn test_source_type_display() {
        assert_eq!(format!("{}", SourceType::Issue), "issue");
        assert_eq!(format!("{}", SourceType::MergeRequest), "merge_request");
        assert_eq!(format!("{}", SourceType::Discussion), "discussion");
    }

    #[test]
    fn test_content_hash_deterministic() {
        let hash1 = compute_content_hash("hello");
        let hash2 = compute_content_hash("hello");
        assert_eq!(hash1, hash2);
        assert!(!hash1.is_empty());
        // SHA-256 of "hello" is known
        assert_eq!(hash1.len(), 64); // 256 bits = 64 hex chars
    }

    #[test]
    fn test_content_hash_different_inputs() {
        let hash1 = compute_content_hash("hello");
        let hash2 = compute_content_hash("world");
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_content_hash_empty() {
        let hash = compute_content_hash("");
        assert_eq!(hash.len(), 64);
    }

    #[test]
    fn test_list_hash_order_independent() {
        let hash1 = compute_list_hash(&["b".to_string(), "a".to_string()]);
        let hash2 = compute_list_hash(&["a".to_string(), "b".to_string()]);
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_list_hash_empty() {
        let hash = compute_list_hash(&[]);
        assert_eq!(hash.len(), 64);
        // Empty list hashes consistently
        let hash2 = compute_list_hash(&[]);
        assert_eq!(hash, hash2);
    }

    // Helper to create an in-memory DB with the required tables for extraction tests
    fn setup_test_db() -> Connection {
        let conn = Connection::open_in_memory().unwrap();
        conn.execute_batch(
            "
            CREATE TABLE projects (
                id INTEGER PRIMARY KEY,
                gitlab_project_id INTEGER UNIQUE NOT NULL,
                path_with_namespace TEXT NOT NULL,
                default_branch TEXT,
                web_url TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                raw_payload_id INTEGER
            );
            CREATE TABLE issues (
                id INTEGER PRIMARY KEY,
                gitlab_id INTEGER UNIQUE NOT NULL,
                project_id INTEGER NOT NULL REFERENCES projects(id),
                iid INTEGER NOT NULL,
                title TEXT,
                description TEXT,
                state TEXT NOT NULL,
                author_username TEXT,
                created_at INTEGER NOT NULL,
                updated_at INTEGER NOT NULL,
                last_seen_at INTEGER NOT NULL,
                discussions_synced_for_updated_at INTEGER,
                resource_events_synced_for_updated_at INTEGER,
                web_url TEXT,
                raw_payload_id INTEGER
            );
            CREATE TABLE labels (
                id INTEGER PRIMARY KEY,
                gitlab_id INTEGER,
                project_id INTEGER NOT NULL REFERENCES projects(id),
                name TEXT NOT NULL,
                color TEXT,
                description TEXT
            );
            CREATE TABLE issue_labels (
                issue_id INTEGER NOT NULL REFERENCES issues(id),
                label_id INTEGER NOT NULL REFERENCES labels(id),
                PRIMARY KEY(issue_id, label_id)
            );
        ",
        )
        .unwrap();

        // Insert a test project
        conn.execute(
            "INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url) VALUES (1, 100, 'group/project-one', 'https://gitlab.example.com/group/project-one')",
            [],
        ).unwrap();

        conn
    }

    #[allow(clippy::too_many_arguments)]
    fn insert_issue(
        conn: &Connection,
        id: i64,
        iid: i64,
        title: Option<&str>,
        description: Option<&str>,
        state: &str,
        author: Option<&str>,
        web_url: Option<&str>,
    ) {
        conn.execute(
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (?1, ?2, 1, ?3, ?4, ?5, ?6, ?7, 1000, 2000, 3000, ?8)",
            rusqlite::params![id, id * 10, iid, title, description, state, author, web_url],
        ).unwrap();
    }

    fn insert_label(conn: &Connection, id: i64, name: &str) {
        conn.execute(
            "INSERT INTO labels (id, project_id, name) VALUES (?1, 1, ?2)",
            rusqlite::params![id, name],
        )
        .unwrap();
    }

    fn link_issue_label(conn: &Connection, issue_id: i64, label_id: i64) {
        conn.execute(
            "INSERT INTO issue_labels (issue_id, label_id) VALUES (?1, ?2)",
            rusqlite::params![issue_id, label_id],
        )
        .unwrap();
    }

    #[test]
    fn test_issue_document_format() {
        let conn = setup_test_db();
        insert_issue(
            &conn,
            1,
            234,
            Some("Authentication redesign"),
            Some("We need to modernize our authentication system..."),
            "opened",
            Some("johndoe"),
            Some("https://gitlab.example.com/group/project-one/-/issues/234"),
        );
        insert_label(&conn, 1, "auth");
        insert_label(&conn, 2, "bug");
        link_issue_label(&conn, 1, 1);
        link_issue_label(&conn, 1, 2);

        let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
        assert_eq!(doc.source_type, SourceType::Issue);
        assert_eq!(doc.source_id, 1);
        assert_eq!(doc.project_id, 1);
        assert_eq!(doc.author_username, Some("johndoe".to_string()));
        assert!(
            doc.content_text
                .starts_with("[[Issue]] #234: Authentication redesign\n")
        );
        assert!(doc.content_text.contains("Project: group/project-one\n"));
        assert!(
            doc.content_text
                .contains("URL: https://gitlab.example.com/group/project-one/-/issues/234\n")
        );
        assert!(doc.content_text.contains("Labels: [\"auth\",\"bug\"]\n"));
        assert!(doc.content_text.contains("State: opened\n"));
        assert!(doc.content_text.contains("Author: @johndoe\n"));
        assert!(
            doc.content_text.contains(
                "--- Description ---\n\nWe need to modernize our authentication system..."
            )
        );
        assert!(!doc.is_truncated);
        assert!(doc.paths.is_empty());
    }

    #[test]
    fn test_issue_not_found() {
        let conn = setup_test_db();
        let result = extract_issue_document(&conn, 999).unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_issue_no_description() {
        let conn = setup_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Quick fix"),
            None,
            "opened",
            Some("alice"),
            None,
        );

        let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
        assert!(!doc.content_text.contains("--- Description ---"));
        assert!(doc.content_text.contains("[[Issue]] #10: Quick fix\n"));
    }

    #[test]
    fn test_issue_labels_sorted() {
        let conn = setup_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("Body"),
            "opened",
            Some("bob"),
            None,
        );
        insert_label(&conn, 1, "zeta");
        insert_label(&conn, 2, "alpha");
        insert_label(&conn, 3, "middle");
        link_issue_label(&conn, 1, 1);
        link_issue_label(&conn, 1, 2);
        link_issue_label(&conn, 1, 3);

        let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
        assert_eq!(doc.labels, vec!["alpha", "middle", "zeta"]);
        assert!(
            doc.content_text
                .contains("Labels: [\"alpha\",\"middle\",\"zeta\"]")
        );
    }

    #[test]
    fn test_issue_no_labels() {
        let conn = setup_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("Body"),
            "opened",
            None,
            None,
        );

        let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
        assert!(doc.labels.is_empty());
        assert!(doc.content_text.contains("Labels: []\n"));
    }

    #[test]
    fn test_issue_hash_deterministic() {
        let conn = setup_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("Body"),
            "opened",
            Some("alice"),
            None,
        );

        let doc1 = extract_issue_document(&conn, 1).unwrap().unwrap();
        let doc2 = extract_issue_document(&conn, 1).unwrap().unwrap();
        assert_eq!(doc1.content_hash, doc2.content_hash);
        assert_eq!(doc1.labels_hash, doc2.labels_hash);
        assert_eq!(doc1.content_hash.len(), 64);
    }

    #[test]
    fn test_issue_empty_description() {
        let conn = setup_test_db();
        insert_issue(&conn, 1, 10, Some("Test"), Some(""), "opened", None, None);

        let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
        // Empty string description still includes the section header
        assert!(doc.content_text.contains("--- Description ---\n\n"));
    }

    // --- MR extraction tests ---

    /// Creates the base test database via `setup_test_db` and adds the
    /// `merge_requests` and `mr_labels` tables on top of it.
    fn setup_mr_test_db() -> Connection {
        // Test-only schema for merge requests and their label links.
        const MR_SCHEMA: &str = "
            CREATE TABLE merge_requests (
                id INTEGER PRIMARY KEY,
                gitlab_id INTEGER UNIQUE NOT NULL,
                project_id INTEGER NOT NULL REFERENCES projects(id),
                iid INTEGER NOT NULL,
                title TEXT,
                description TEXT,
                state TEXT,
                draft INTEGER NOT NULL DEFAULT 0,
                author_username TEXT,
                source_branch TEXT,
                target_branch TEXT,
                head_sha TEXT,
                references_short TEXT,
                references_full TEXT,
                detailed_merge_status TEXT,
                merge_user_username TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                merged_at INTEGER,
                closed_at INTEGER,
                last_seen_at INTEGER NOT NULL,
                discussions_synced_for_updated_at INTEGER,
                discussions_sync_last_attempt_at INTEGER,
                discussions_sync_attempts INTEGER DEFAULT 0,
                discussions_sync_last_error TEXT,
                resource_events_synced_for_updated_at INTEGER,
                web_url TEXT,
                raw_payload_id INTEGER
            );
            CREATE TABLE mr_labels (
                merge_request_id INTEGER REFERENCES merge_requests(id),
                label_id INTEGER REFERENCES labels(id),
                PRIMARY KEY(merge_request_id, label_id)
            );
        ";

        let db = setup_test_db();
        db.execute_batch(MR_SCHEMA).unwrap();
        db
    }

    /// Inserts one merge request row for tests.
    ///
    /// The row always belongs to project 1, gets `gitlab_id = id * 10`, and uses
    /// fixed timestamps (`created_at` 1000, `updated_at` 2000, `last_seen_at` 3000).
    /// Panics if the insert fails.
    // A flat argument list keeps the call sites readable, so the lint is silenced.
    #[allow(clippy::too_many_arguments)]
    fn insert_mr(
        conn: &Connection,
        id: i64,
        iid: i64,
        title: Option<&str>,
        description: Option<&str>,
        state: Option<&str>,
        author: Option<&str>,
        source_branch: Option<&str>,
        target_branch: Option<&str>,
        web_url: Option<&str>,
    ) {
        const INSERT_MR_SQL: &str = "INSERT INTO merge_requests (id, gitlab_id, project_id, iid, title, description, state, author_username, source_branch, target_branch, created_at, updated_at, last_seen_at, web_url) VALUES (?1, ?2, 1, ?3, ?4, ?5, ?6, ?7, ?8, ?9, 1000, 2000, 3000, ?10)";

        // Derive a distinct GitLab id from the local row id.
        let gitlab_id = id * 10;
        conn.execute(
            INSERT_MR_SQL,
            rusqlite::params![
                id,
                gitlab_id,
                iid,
                title,
                description,
                state,
                author,
                source_branch,
                target_branch,
                web_url
            ],
        )
        .unwrap();
    }

    /// Links label `label_id` to merge request `mr_id` through the `mr_labels`
    /// join table. Panics if the insert fails.
    fn link_mr_label(conn: &Connection, mr_id: i64, label_id: i64) {
        let sql = "INSERT INTO mr_labels (merge_request_id, label_id) VALUES (?1, ?2)";
        conn.execute(sql, rusqlite::params![mr_id, label_id])
            .unwrap();
    }

    /// Checks the overall layout of a merge request document: header line,
    /// metadata lines, and the description section.
    #[test]
    fn test_mr_document_format() {
        let conn = setup_mr_test_db();
        insert_mr(
            &conn,
            1,
            456,
            Some("Implement JWT authentication"),
            Some("This MR implements JWT-based authentication..."),
            Some("opened"),
            Some("johndoe"),
            Some("feature/jwt-auth"),
            Some("main"),
            Some("https://gitlab.example.com/group/project-one/-/merge_requests/456"),
        );
        // Create two labels and attach each of them to merge request 1.
        for (label_id, label_name) in [(1, "auth"), (2, "feature")] {
            insert_label(&conn, label_id, label_name);
            link_mr_label(&conn, 1, label_id);
        }

        let document = extract_mr_document(&conn, 1).unwrap().unwrap();
        assert_eq!(document.source_type, SourceType::MergeRequest);
        assert_eq!(document.source_id, 1);

        let text = document.content_text.as_str();
        assert!(text.starts_with("[[MergeRequest]] !456: Implement JWT authentication\n"));

        // Every one of these fragments must appear somewhere in the document.
        let expected_fragments = [
            "Project: group/project-one\n",
            "Labels: [\"auth\",\"feature\"]\n",
            "State: opened\n",
            "Author: @johndoe\n",
            "Source: feature/jwt-auth -> main\n",
            "--- Description ---\n\nThis MR implements JWT-based authentication...",
        ];
        for fragment in expected_fragments {
            assert!(text.contains(fragment), "missing fragment: {:?}", fragment);
        }
    }

    /// Extracting a merge request id that has no row yields `Ok(None)`.
    #[test]
    fn test_mr_not_found() {
        let conn = setup_mr_test_db();
        // Nothing was inserted, so id 999 cannot match any merge request.
        assert!(extract_mr_document(&conn, 999).unwrap().is_none());
    }

    /// A merge request with a NULL description has no description section,
    /// while the header line is still rendered.
    #[test]
    fn test_mr_no_description() {
        let conn = setup_mr_test_db();
        insert_mr(
            &conn,
            1,
            10,
            Some("Quick fix"),
            None,
            Some("merged"),
            Some("alice"),
            Some("fix/bug"),
            Some("main"),
            None,
        );

        let document = extract_mr_document(&conn, 1).unwrap().unwrap();
        let text = document.content_text.as_str();
        assert!(!text.contains("--- Description ---"));
        assert!(text.contains("[[MergeRequest]] !10: Quick fix\n"));
    }

    /// When both branches are set, the document contains a
    /// `Source: <source> -> <target>` line.
    #[test]
    fn test_mr_branch_info() {
        let conn = setup_mr_test_db();
        insert_mr(
            &conn,
            1,
            10,
            Some("Test"),
            Some("Body"),
            Some("opened"),
            None,
            Some("feature/foo"),
            Some("develop"),
            None,
        );

        let document = extract_mr_document(&conn, 1).unwrap().unwrap();
        let text = document.content_text.as_str();
        assert!(text.contains("Source: feature/foo -> develop\n"));
    }

    /// When both branch columns are NULL, no `Source:` line is rendered.
    #[test]
    fn test_mr_no_branches() {
        let conn = setup_mr_test_db();
        insert_mr(
            &conn,
            1,
            10,
            Some("Test"),
            None,
            Some("opened"),
            None,
            None,
            None,
            None,
        );

        let document = extract_mr_document(&conn, 1).unwrap().unwrap();
        let has_source_line = document.content_text.contains("Source:");
        assert!(!has_source_line);
    }

    // --- Discussion extraction tests ---

    /// Creates the merge request test database via `setup_mr_test_db` and adds
    /// the `discussions` and `notes` tables on top of it.
    fn setup_discussion_test_db() -> Connection {
        // Test-only schema for discussions and the notes that belong to them.
        const DISCUSSION_SCHEMA: &str = "
            CREATE TABLE discussions (
                id INTEGER PRIMARY KEY,
                gitlab_discussion_id TEXT NOT NULL,
                project_id INTEGER NOT NULL REFERENCES projects(id),
                issue_id INTEGER REFERENCES issues(id),
                merge_request_id INTEGER,
                noteable_type TEXT NOT NULL,
                individual_note INTEGER NOT NULL DEFAULT 0,
                first_note_at INTEGER,
                last_note_at INTEGER,
                last_seen_at INTEGER NOT NULL,
                resolvable INTEGER NOT NULL DEFAULT 0,
                resolved INTEGER NOT NULL DEFAULT 0
            );
            CREATE TABLE notes (
                id INTEGER PRIMARY KEY,
                gitlab_id INTEGER UNIQUE NOT NULL,
                discussion_id INTEGER NOT NULL REFERENCES discussions(id),
                project_id INTEGER NOT NULL REFERENCES projects(id),
                note_type TEXT,
                is_system INTEGER NOT NULL DEFAULT 0,
                author_username TEXT,
                body TEXT,
                created_at INTEGER NOT NULL,
                updated_at INTEGER NOT NULL,
                last_seen_at INTEGER NOT NULL,
                position INTEGER,
                resolvable INTEGER NOT NULL DEFAULT 0,
                resolved INTEGER NOT NULL DEFAULT 0,
                resolved_by TEXT,
                resolved_at INTEGER,
                position_old_path TEXT,
                position_new_path TEXT,
                position_old_line INTEGER,
                position_new_line INTEGER,
                raw_payload_id INTEGER
            );
        ";

        // The base database already carries the tables created by the
        // issue and merge request setup helpers.
        let db = setup_mr_test_db();
        db.execute_batch(DISCUSSION_SCHEMA).unwrap();
        db
    }

    /// Inserts one discussion row for tests.
    ///
    /// The row belongs to project 1, uses `disc_<id>` as its GitLab discussion
    /// id and 3000 as `last_seen_at`. `issue_id` / `mr_id` select the parent.
    /// Panics if the insert fails.
    fn insert_discussion(
        conn: &Connection,
        id: i64,
        noteable_type: &str,
        issue_id: Option<i64>,
        mr_id: Option<i64>,
    ) {
        const INSERT_DISCUSSION_SQL: &str = "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, merge_request_id, noteable_type, last_seen_at) VALUES (?1, ?2, 1, ?3, ?4, ?5, 3000)";

        let gitlab_discussion_id = format!("disc_{}", id);
        conn.execute(
            INSERT_DISCUSSION_SQL,
            rusqlite::params![id, gitlab_discussion_id, issue_id, mr_id, noteable_type],
        )
        .unwrap();
    }

    /// Inserts one note row for tests.
    ///
    /// The row belongs to project 1; `created_at` is also written to
    /// `updated_at` and `last_seen_at`. `old_path` / `new_path` fill the
    /// `position_old_path` / `position_new_path` columns. Panics if the insert fails.
    // A flat argument list keeps the call sites readable, so the lint is silenced.
    #[allow(clippy::too_many_arguments)]
    fn insert_note(
        conn: &Connection,
        id: i64,
        gitlab_id: i64,
        discussion_id: i64,
        author: Option<&str>,
        body: Option<&str>,
        created_at: i64,
        is_system: bool,
        old_path: Option<&str>,
        new_path: Option<&str>,
    ) {
        const INSERT_NOTE_SQL: &str = "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system, position_old_path, position_new_path) VALUES (?1, ?2, ?3, 1, ?4, ?5, ?6, ?6, ?6, ?7, ?8, ?9)";

        // The `is_system` column is an INTEGER, so the flag is bound as 0 / 1.
        let system_flag = is_system as i32;
        conn.execute(
            INSERT_NOTE_SQL,
            rusqlite::params![
                id,
                gitlab_id,
                discussion_id,
                author,
                body,
                created_at,
                system_flag,
                old_path,
                new_path
            ],
        )
        .unwrap();
    }

    /// Checks the overall layout of a discussion document on an issue: header
    /// line, project / URL / labels metadata, and the rendered thread.
    #[test]
    fn test_discussion_document_format() {
        let conn = setup_discussion_test_db();
        insert_issue(
            &conn,
            1,
            234,
            Some("Authentication redesign"),
            Some("desc"),
            "opened",
            Some("johndoe"),
            Some("https://gitlab.example.com/group/project-one/-/issues/234"),
        );
        // Create two labels and attach each of them to the parent issue.
        for (label_id, label_name) in [(1, "auth"), (2, "bug")] {
            insert_label(&conn, label_id, label_name);
            link_issue_label(&conn, 1, label_id);
        }
        insert_discussion(&conn, 1, "Issue", Some(1), None);

        // 1710460800000 = 2024-03-15T00:00:00Z
        let posted_at: i64 = 1710460800000;
        let thread = [
            (
                1,
                12345,
                "johndoe",
                "I think we should move to JWT-based auth...",
            ),
            (
                2,
                12346,
                "janedoe",
                "Agreed. What about refresh token strategy?",
            ),
        ];
        for (note_id, gitlab_id, author, body) in thread {
            insert_note(
                &conn,
                note_id,
                gitlab_id,
                1,
                Some(author),
                Some(body),
                posted_at,
                false,
                None,
                None,
            );
        }

        let document = extract_discussion_document(&conn, 1).unwrap().unwrap();
        assert_eq!(document.source_type, SourceType::Discussion);

        let text = document.content_text.as_str();
        assert!(text.starts_with("[[Discussion]] Issue #234: Authentication redesign\n"));

        // Every one of these fragments must appear somewhere in the document.
        let expected_fragments = [
            "Project: group/project-one\n",
            "URL: https://gitlab.example.com/group/project-one/-/issues/234#note_12345\n",
            "Labels: [\"auth\",\"bug\"]\n",
            "--- Thread ---",
            "@johndoe (2024-03-15):\nI think we should move to JWT-based auth...",
            "@janedoe (2024-03-15):\nAgreed. What about refresh token strategy?",
        ];
        for fragment in expected_fragments {
            assert!(text.contains(fragment), "missing fragment: {:?}", fragment);
        }

        assert_eq!(document.author_username.as_deref(), Some("johndoe"));
        // Discussions don't have their own title
        assert!(document.title.is_none());
    }

    /// Extracting a discussion id that has no row yields `Ok(None)`.
    #[test]
    fn test_discussion_not_found() {
        let conn = setup_discussion_test_db();
        // Nothing was inserted, so id 999 cannot match any discussion.
        assert!(extract_discussion_document(&conn, 999).unwrap().is_none());
    }

    /// A discussion whose parent issue row no longer exists yields `Ok(None)`.
    #[test]
    fn test_discussion_parent_deleted() {
        let conn = setup_discussion_test_db();
        // Build a complete issue -> discussion -> note chain first.
        insert_issue(
            &conn,
            99,
            10,
            Some("To be deleted"),
            None,
            "opened",
            None,
            None,
        );
        insert_discussion(&conn, 1, "Issue", Some(99), None);
        insert_note(
            &conn,
            1,
            100,
            1,
            Some("alice"),
            Some("Hello"),
            1000,
            false,
            None,
            None,
        );

        // Orphan the discussion: the test schema declares
        // `discussions.issue_id REFERENCES issues(id)` without ON DELETE CASCADE,
        // so foreign key enforcement is switched off around the delete and
        // restored afterwards, leaving the discussion row in place.
        let orphaning_statements = [
            "PRAGMA foreign_keys = OFF",
            "DELETE FROM issues WHERE id = 99",
            "PRAGMA foreign_keys = ON",
        ];
        for statement in orphaning_statements {
            conn.execute(statement, []).unwrap();
        }

        assert!(extract_discussion_document(&conn, 1).unwrap().is_none());
    }

    /// System notes are left out of a discussion document while the user notes
    /// before and after them are kept.
    #[test]
    fn test_discussion_system_notes_excluded() {
        let conn = setup_discussion_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("desc"),
            "opened",
            Some("alice"),
            None,
        );
        insert_discussion(&conn, 1, "Issue", Some(1), None);
        insert_note(
            &conn,
            1,
            100,
            1,
            Some("alice"),
            Some("Real comment"),
            1000,
            false,
            None,
            None,
        );
        // The only system note in the thread; its body mentions "@alice" too.
        insert_note(
            &conn,
            2,
            101,
            1,
            Some("bot"),
            Some("assigned to @alice"),
            2000,
            true,
            None,
            None,
        );
        insert_note(
            &conn,
            3,
            102,
            1,
            Some("bob"),
            Some("Follow-up"),
            3000,
            false,
            None,
            None,
        );

        let doc = extract_discussion_document(&conn, 1).unwrap().unwrap();
        assert!(doc.content_text.contains("@alice"));
        assert!(doc.content_text.contains("@bob"));
        // "@alice" also occurs inside the system note body, so the handle check
        // alone cannot prove the user note survived. Pin the user note bodies.
        assert!(doc.content_text.contains("Real comment"));
        assert!(doc.content_text.contains("Follow-up"));
        assert!(!doc.content_text.contains("assigned to"));
    }

    /// File paths attached to notes are collected into `paths` once each, in
    /// sorted order, and rendered on a `Files:` line.
    #[test]
    fn test_discussion_diffnote_paths() {
        let conn = setup_discussion_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("desc"),
            "opened",
            None,
            None,
        );
        insert_discussion(&conn, 1, "Issue", Some(1), None);

        // Both notes carry the same old/new path pair.
        let notes = [
            (1, 100, "alice", "Comment on code", 1000),
            (2, 101, "bob", "Reply", 2000),
        ];
        for (note_id, gitlab_id, author, body, created_at) in notes {
            insert_note(
                &conn,
                note_id,
                gitlab_id,
                1,
                Some(author),
                Some(body),
                created_at,
                false,
                Some("src/old.rs"),
                Some("src/new.rs"),
            );
        }

        let document = extract_discussion_document(&conn, 1).unwrap().unwrap();
        // Paths should be deduplicated and sorted
        assert_eq!(document.paths, vec!["src/new.rs", "src/old.rs"]);
        let text = document.content_text.as_str();
        assert!(text.contains("Files: [\"src/new.rs\",\"src/old.rs\"]"));
    }

    /// The discussion URL is the parent issue's web URL followed by
    /// `#note_<gitlab id of the note>`.
    #[test]
    fn test_discussion_url_construction() {
        let conn = setup_discussion_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("desc"),
            "opened",
            None,
            Some("https://gitlab.example.com/group/project-one/-/issues/10"),
        );
        insert_discussion(&conn, 1, "Issue", Some(1), None);
        // The note's GitLab id (54321) is what ends up in the URL fragment.
        insert_note(
            &conn,
            1,
            54321,
            1,
            Some("alice"),
            Some("Hello"),
            1000,
            false,
            None,
            None,
        );

        let document = extract_discussion_document(&conn, 1).unwrap().unwrap();
        assert_eq!(
            document.url.as_deref(),
            Some("https://gitlab.example.com/group/project-one/-/issues/10#note_54321")
        );
    }

    /// A discussion document reports the labels of its parent issue, sorted
    /// by name.
    #[test]
    fn test_discussion_uses_parent_labels() {
        let conn = setup_discussion_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("desc"),
            "opened",
            None,
            None,
        );
        // Labels are inserted out of alphabetical order on purpose.
        for (label_id, label_name) in [(1, "backend"), (2, "api")] {
            insert_label(&conn, label_id, label_name);
            link_issue_label(&conn, 1, label_id);
        }
        insert_discussion(&conn, 1, "Issue", Some(1), None);
        insert_note(
            &conn,
            1,
            100,
            1,
            Some("alice"),
            Some("Comment"),
            1000,
            false,
            None,
            None,
        );

        let document = extract_discussion_document(&conn, 1).unwrap().unwrap();
        assert_eq!(document.labels, vec!["api", "backend"]);
    }

    /// A discussion attached to a merge request renders an
    /// `MR !<iid>: <title>` header.
    #[test]
    fn test_discussion_on_mr() {
        let conn = setup_discussion_test_db();
        insert_mr(
            &conn,
            1,
            456,
            Some("JWT Auth"),
            Some("desc"),
            Some("opened"),
            Some("johndoe"),
            Some("feature/jwt"),
            Some("main"),
            Some("https://gitlab.example.com/group/project-one/-/merge_requests/456"),
        );
        // The parent is the merge request, so no issue id is supplied.
        insert_discussion(&conn, 1, "MergeRequest", None, Some(1));
        insert_note(
            &conn,
            1,
            100,
            1,
            Some("alice"),
            Some("LGTM"),
            1000,
            false,
            None,
            None,
        );

        let document = extract_discussion_document(&conn, 1).unwrap().unwrap();
        let text = document.content_text.as_str();
        assert!(text.contains("[[Discussion]] MR !456: JWT Auth\n"));
    }

    /// A discussion that consists only of system notes yields `Ok(None)`.
    #[test]
    fn test_discussion_all_system_notes() {
        let conn = setup_discussion_test_db();
        insert_issue(
            &conn,
            1,
            10,
            Some("Test"),
            Some("desc"),
            "opened",
            None,
            None,
        );
        insert_discussion(&conn, 1, "Issue", Some(1), None);
        // The single note in this thread is flagged as a system note.
        insert_note(
            &conn,
            1,
            100,
            1,
            Some("bot"),
            Some("assigned to @alice"),
            1000,
            true,
            None,
            None,
        );

        // All notes are system notes -> no content -> returns None
        assert!(extract_discussion_document(&conn, 1).unwrap().is_none());
    }
}