/// Byte budget for a rendered discussion.
///
/// Within this file it is only passed as `max_bytes` to [`truncate_discussion`]
/// by the unit tests. NOTE(review): presumably production callers pass it the
/// same way — confirm at the call sites.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;

/// Absolute upper bound, in bytes, enforced by [`truncate_hard_cap`].
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;

/// Suffix appended whenever a piece of text is cut off part-way through.
const TRUNCATED_SUFFIX: &str = "[truncated]";

/// A single discussion note as rendered by [`truncate_discussion`].
#[derive(Debug, Clone)]
pub struct NoteContent {
    /// Author handle; rendered with a leading `@`.
    pub author: String,
    /// Date string; rendered verbatim inside parentheses.
    pub date: String,
    /// Note text; rendered verbatim after the header line.
    pub body: String,
}

/// Outcome of a truncation pass.
#[derive(Debug, Clone)]
pub struct TruncationResult {
    /// The (possibly shortened) text.
    pub content: String,
    /// `true` when anything was dropped or cut.
    pub is_truncated: bool,
    /// Why truncation happened; `None` when `is_truncated` is `false`.
    pub reason: Option<TruncationReason>,
}

/// The strategy that was applied when content had to be shortened.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Notes between a leading run and the last note were replaced by a marker.
    TokenLimitMiddleDrop,
    /// The discussion had exactly one note and it did not fit.
    SingleNoteOversized,
    /// Even the first note, the omission marker and the last note together did
    /// not fit; only a cut-off first note is kept.
    FirstLastOversized,
    /// The document exceeded [`MAX_DOCUMENT_BYTES_HARD`].
    HardCapOversized,
}

impl TruncationReason {
    /// Returns the stable snake_case identifier for this reason.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
            Self::SingleNoteOversized => "single_note_oversized",
            Self::FirstLastOversized => "first_last_oversized",
            Self::HardCapOversized => "hard_cap_oversized",
        }
    }
}

/// Renders one note as `@author (date):\nbody\n\n`.
fn format_note(note: &NoteContent) -> String {
    format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
}

/// Returns the longest prefix of `s` that is at most `max_bytes` long and ends
/// on a UTF-8 character boundary.
///
/// The result may be shorter than `max_bytes` (down to empty) when the cut
/// would otherwise land inside a multi-byte character.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Walk back to the nearest boundary; index 0 is always a boundary.
    let mut end = max_bytes;
    while end > 0 && !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

/// Cuts `text` so that it fits in `max_bytes` together with
/// [`TRUNCATED_SUFFIX`], then appends the suffix.
///
/// When `max_bytes` is smaller than the suffix itself, the result is the bare
/// suffix and therefore longer than `max_bytes`.
fn truncate_with_suffix(text: &str, max_bytes: usize) -> String {
    let kept = truncate_utf8(text, max_bytes.saturating_sub(TRUNCATED_SUFFIX.len()));
    format!("{}{}", kept, TRUNCATED_SUFFIX)
}

/// Builds the placeholder that stands in for `omitted` dropped notes.
fn omitted_marker(omitted: usize) -> String {
    format!("\n\n[... {} notes omitted for length ...]\n\n", omitted)
}

/// Result of pre-truncating a description to avoid unbounded memory allocation.
#[derive(Debug, Clone)]
pub struct DescriptionPreTruncateResult {
    /// The description, shortened and suffixed with a marker if it was too long.
    pub content: String,
    /// `true` when the description exceeded the budget and was shortened.
    pub was_truncated: bool,
    /// Length in bytes of the description as it was passed in.
    pub original_bytes: usize,
}

/// Pre-truncate a description to avoid allocating huge amounts of memory.
///
/// This is called BEFORE appending to the document content, so we don't
/// allocate memory for pathologically large descriptions (e.g., 500MB base64 blob).
///
/// Returns the (potentially truncated) description and whether truncation occurred.
///
/// When truncation happens, the kept prefix plus the
/// `[... description truncated from X to Y ...]` marker is at most `max_bytes`
/// long, provided `max_bytes` is at least as large as the marker itself.
/// Otherwise the result is the bare marker.
pub fn pre_truncate_description(desc: &str, max_bytes: usize) -> DescriptionPreTruncateResult {
    let original_bytes = desc.len();
    if original_bytes <= max_bytes {
        return DescriptionPreTruncateResult {
            content: desc.to_string(),
            was_truncated: false,
            original_bytes,
        };
    }

    // Build the marker first so that exactly its length is reserved. A fixed
    // reservation is not enough: the marker's size depends on the formatted
    // byte counts and is usually a little over 50 bytes.
    let marker = format!(
        "\n\n[... description truncated from {} to {} ...]",
        format_bytes(original_bytes),
        format_bytes(max_bytes)
    );
    let kept = truncate_utf8(desc, max_bytes.saturating_sub(marker.len()));

    let mut content = String::with_capacity(kept.len() + marker.len());
    content.push_str(kept);
    content.push_str(&marker);

    DescriptionPreTruncateResult {
        content,
        was_truncated: true,
        original_bytes,
    }
}

/// Formats a byte count with decimal units (`B`, `KB`, `MB`), using one
/// fractional digit for `KB` and `MB`.
fn format_bytes(bytes: usize) -> String {
    if bytes >= 1_000_000 {
        format!("{:.1}MB", bytes as f64 / 1_000_000.0)
    } else if bytes >= 1_000 {
        format!("{:.1}KB", bytes as f64 / 1_000.0)
    } else {
        format!("{}B", bytes)
    }
}

/// Renders `notes` into a single string of at most roughly `max_bytes` bytes.
///
/// Strategy, in order:
/// 1. No notes: empty, untruncated result.
/// 2. Everything fits: all notes concatenated, untruncated.
/// 3. A single note that does not fit: the note is cut and suffixed with
///    `[truncated]` ([`TruncationReason::SingleNoteOversized`]).
/// 4. Otherwise keep the longest leading run of notes that fits together with
///    an "N notes omitted" marker and the last note
///    ([`TruncationReason::TokenLimitMiddleDrop`]).
/// 5. If not even the first note, the marker and the last note fit, only the
///    first note is kept, cut and suffixed with `[truncated]`
///    ([`TruncationReason::FirstLastOversized`]).
///
/// In cases 3 and 5 the result can exceed `max_bytes` when `max_bytes` is
/// smaller than the `[truncated]` suffix.
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
    if notes.is_empty() {
        return TruncationResult {
            content: String::new(),
            is_truncated: false,
            reason: None,
        };
    }

    let formatted: Vec<String> = notes.iter().map(format_note).collect();
    let total_len: usize = formatted.iter().map(String::len).sum();

    if total_len <= max_bytes {
        return TruncationResult {
            content: formatted.concat(),
            is_truncated: false,
            reason: None,
        };
    }

    if formatted.len() == 1 {
        return TruncationResult {
            content: truncate_with_suffix(&formatted[0], max_bytes),
            is_truncated: true,
            reason: Some(TruncationReason::SingleNoteOversized),
        };
    }

    let last_note = &formatted[formatted.len() - 1];

    // Grow the leading run one note at a time, always leaving at least one
    // note to omit. `prefix_len` is a running sum so the scan stays linear.
    let mut best: Option<(usize, String)> = None;
    let mut prefix_len = 0usize;
    for n in 1..formatted.len() - 1 {
        prefix_len += formatted[n - 1].len();
        let marker = omitted_marker(formatted.len() - n - 1);
        if prefix_len + marker.len() + last_note.len() > max_bytes {
            // Each extra note adds more bytes than the marker can ever shrink
            // by, so no longer prefix can fit either.
            break;
        }
        best = Some((n, marker));
    }

    if let Some((n, marker)) = best {
        let mut content = formatted[..n].concat();
        content.push_str(&marker);
        content.push_str(last_note);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::TokenLimitMiddleDrop),
        };
    }

    // Reaching this point means first note + marker + last note does not fit:
    // for three or more notes that is exactly the check that failed at n = 1,
    // and for two notes it follows from `total_len > max_bytes`. Keep only a
    // cut-off first note.
    TruncationResult {
        content: truncate_with_suffix(&formatted[0], max_bytes),
        is_truncated: true,
        reason: Some(TruncationReason::FirstLastOversized),
    }
}

/// Enforces [`MAX_DOCUMENT_BYTES_HARD`] on `content`.
///
/// Content within the cap is returned unchanged. Longer content is cut on a
/// UTF-8 boundary and suffixed with `[truncated]` so that the result stays
/// within the cap ([`TruncationReason::HardCapOversized`]).
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
    if content.len() <= MAX_DOCUMENT_BYTES_HARD {
        return TruncationResult {
            content: content.to_string(),
            is_truncated: false,
            reason: None,
        };
    }

    TruncationResult {
        content: truncate_with_suffix(content, MAX_DOCUMENT_BYTES_HARD),
        is_truncated: true,
        reason: Some(TruncationReason::HardCapOversized),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    #[test]
    fn test_no_truncation_under_limit() {
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    #[test]
    fn test_middle_notes_dropped() {
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@user0"));
        assert!(result.content.contains("@user9"));
        assert!(result.content.contains("notes omitted for length"));
    }

    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    #[test]
    fn test_first_last_oversized() {
        let big_body = "x".repeat(20_000);
        let notes = vec![make_note("alice", &big_body), make_note("bob", &big_body)];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_one_note_under_limit() {
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    #[test]
    fn test_utf8_boundary_safety() {
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    #[test]
    fn test_utf8_boundary_cjk() {
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    #[test]
    fn test_marker_count_correct() {
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(
            result
                .content
                .contains("[... 5 notes omitted for length ...]")
        );
    }

    #[test]
    fn test_pre_truncate_description_respects_budget() {
        let desc = "x".repeat(3_000);
        let result = pre_truncate_description(&desc, 1_000);
        assert!(result.was_truncated);
        assert_eq!(result.original_bytes, 3_000);
        assert!(result.content.len() <= 1_000);
        assert!(
            result
                .content
                .ends_with("[... description truncated from 3.0KB to 1.0KB ...]")
        );
    }

    #[test]
    fn test_truncation_reason_as_str() {
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}