gitlore/src/documents/truncation.rs

/// Byte budget for one rendered discussion.
///
/// This module's tests pass it as the `max_bytes` argument of
/// [`truncate_discussion`]; presumably production callers do the same —
/// confirm against the call sites.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;

/// Upper bound, in bytes, on a whole document's content as enforced by
/// [`truncate_hard_cap`].
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;

/// One note of a discussion, in the form consumed by [`truncate_discussion`].
///
/// The fields are plain text; this module interpolates them verbatim when
/// rendering a note and performs no parsing or validation on them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteContent {
    /// Author handle; rendered with a leading `@`.
    pub author: String,
    /// Date text; rendered as-is inside parentheses.
    pub date: String,
    /// Body text of the note.
    pub body: String,
}

pub struct TruncationResult {
    pub content: String,
    pub is_truncated: bool,
    pub reason: Option<TruncationReason>,
}

/// Why a piece of content was shortened.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Set by `truncate_discussion` when notes between the leading notes and
    /// the final note were replaced by an "omitted" marker.
    TokenLimitMiddleDrop,
    /// Set by `truncate_discussion` when the discussion has exactly one note
    /// and that note alone exceeds the budget; its text is cut and suffixed
    /// with `[truncated]`.
    SingleNoteOversized,
    /// Set by `truncate_discussion` when there are several notes but even the
    /// first note, the marker and the last note together exceed the budget;
    /// only the first note (cut and suffixed with `[truncated]`) is kept.
    FirstLastOversized,
    /// Set by `truncate_hard_cap` when the content exceeds
    /// `MAX_DOCUMENT_BYTES_HARD`.
    HardCapOversized,
}

impl TruncationReason {
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
            Self::SingleNoteOversized => "single_note_oversized",
            Self::FirstLastOversized => "first_last_oversized",
            Self::HardCapOversized => "hard_cap_oversized",
        }
    }
}

fn format_note(note: &NoteContent) -> String {
    format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
}

/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// `s` is returned unchanged when it already fits. The result may be shorter
/// than `max_bytes` (down to empty) when the limit falls inside a multi-byte
/// character.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Walk back from the limit to the nearest boundary. Index 0 is always a
    // boundary, so the search cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}

pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
    if notes.is_empty() {
        return TruncationResult {
            content: String::new(),
            is_truncated: false,
            reason: None,
        };
    }

    let formatted: Vec<String> = notes.iter().map(format_note).collect();
    let total_len: usize = formatted.iter().map(|s| s.len()).sum();

    if total_len <= max_bytes {
        let mut total = String::with_capacity(total_len);
        for s in &formatted {
            total.push_str(s);
        }
        return TruncationResult {
            content: total,
            is_truncated: false,
            reason: None,
        };
    }

    if notes.len() == 1 {
        let truncated = truncate_utf8(&formatted[0], max_bytes.saturating_sub(11));
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::SingleNoteOversized),
        };
    }

    let last_note = &formatted[formatted.len() - 1];

    let mut best_n = 0;
    for n in 1..formatted.len() - 1 {
        let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
        let omitted = formatted.len() - n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let candidate_len = first_n + marker.len() + last_note.len();
        if candidate_len <= max_bytes {
            best_n = n;
        } else {
            break;
        }
    }

    if best_n > 0 {
        let first_part: String = formatted[..best_n].concat();
        let omitted = formatted.len() - best_n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let content = format!("{}{}{}", first_part, marker, last_note);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::TokenLimitMiddleDrop),
        };
    }

    let first_note = &formatted[0];
    let omitted = formatted.len() - 2;
    let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
    if first_note.len() + marker.len() + last_note.len() > max_bytes {
        let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::FirstLastOversized),
        };
    }

    let content = format!("{}{}{}", formatted[0], marker, last_note);
    TruncationResult {
        content,
        is_truncated: true,
        reason: Some(TruncationReason::TokenLimitMiddleDrop),
    }
}

/// Enforces the `MAX_DOCUMENT_BYTES_HARD` limit on `content`.
///
/// Content within the limit is returned as an unmodified copy. Longer content
/// is cut on a UTF-8 boundary so that, with the `[truncated]` suffix appended,
/// it is at most `MAX_DOCUMENT_BYTES_HARD` bytes, and is tagged with
/// `TruncationReason::HardCapOversized`.
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
    let over_cap = content.len() > MAX_DOCUMENT_BYTES_HARD;

    let (text, reason) = if over_cap {
        // Reserve the 11 bytes of "[truncated]" inside the cap.
        let budget = MAX_DOCUMENT_BYTES_HARD.saturating_sub(11);
        let mut capped = String::with_capacity(budget + 11);
        capped.push_str(truncate_utf8(content, budget));
        capped.push_str("[truncated]");
        (capped, Some(TruncationReason::HardCapOversized))
    } else {
        (content.to_string(), None)
    };

    TruncationResult {
        content: text,
        is_truncated: over_cap,
        reason,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a note with a fixed date so tests only vary author and body.
    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    #[test]
    fn test_no_truncation_under_limit() {
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    #[test]
    fn test_middle_notes_dropped() {
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@user0"));
        assert!(result.content.contains("@user9"));
        assert!(result.content.contains("notes omitted for length"));
        // Only one ~4 KB note fits in front of the marker and the last note.
        assert!(!result.content.contains("@user1"));
        assert!(result.content.len() <= 10_000);
    }

    #[test]
    fn test_keeps_longest_fitting_prefix() {
        // Each rendered note is 120 bytes and the marker is 40 bytes, so two
        // leading notes fit in 450 bytes (400) but three do not (520).
        let body = "x".repeat(100);
        let notes: Vec<NoteContent> = (0..5)
            .map(|i| make_note(&format!("u{}", i), &body))
            .collect();
        let result = truncate_discussion(&notes, 450);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@u0"));
        assert!(result.content.contains("@u1"));
        assert!(!result.content.contains("@u2"));
        assert!(!result.content.contains("@u3"));
        assert!(result.content.contains("@u4"));
        assert!(
            result
                .content
                .contains("[... 2 notes omitted for length ...]")
        );
        assert_eq!(result.content.len(), 400);
    }

    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        // The suffix is budgeted for, so the limit must hold exactly.
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES);
    }

    #[test]
    fn test_first_last_oversized() {
        let big_body = "x".repeat(20_000);
        let notes = vec![make_note("alice", &big_body), make_note("bob", &big_body)];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(!result.content.contains("@bob"));
        assert!(result.content.ends_with("[truncated]"));
        assert!(result.content.len() <= 10_000);
    }

    #[test]
    fn test_one_note_under_limit() {
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
    }

    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.is_empty());
    }

    #[test]
    fn test_utf8_boundary_safety() {
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    #[test]
    fn test_utf8_boundary_cjk() {
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        // The suffix is budgeted for, so the cap must hold exactly.
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD);
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert_eq!(result.content, content);
    }

    #[test]
    fn test_marker_count_correct() {
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(
            result
                .content
                .contains("[... 5 notes omitted for length ...]")
        );
        assert!(result.content.len() <= 12_000);
    }

    #[test]
    fn test_truncation_reason_as_str() {
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}