feat(truncation): add pre-truncation for oversized descriptions

Add pre_truncate_description() to prevent unbounded memory allocation when processing pathologically large descriptions (e.g., 500MB base64 blobs in issue descriptions). Previously, the document extraction pipeline would: (1) allocate memory for the entire description, (2) append it to the content buffer, and (3) only truncate at the end via truncate_hard_cap(). For a 500MB description, this would allocate 500MB+ before truncation. New approach: (1) check the description size BEFORE appending; (2) if it is over the limit, truncate at a UTF-8 boundary immediately; (3) add a human-readable marker: "[... description truncated from 500.0MB to 2.0MB ...]"; (4) log a warning with the original size for observability. Also adds a format_bytes() helper for human-readable byte sizes (B, KB, MB). This is applied to both issue and MR document extraction in extractor.rs, protecting the embedding pipeline from OOM on malformed GitLab data. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-26 11:06:23 -05:00
parent a45c37c7e4
commit 23efb15599
2 changed files with 73 additions and 3 deletions
--- a/src/documents/truncation.rs
+++ b/src/documents/truncation.rs
@@ -48,6 +48,56 @@ pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    &s[..end]
 }

+/// Outcome of [`pre_truncate_description`]: the bounded text plus size bookkeeping.
+///
+/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so results can be logged,
+/// copied, and compared directly in tests.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DescriptionPreTruncateResult {
+    /// The description text, possibly shortened (see `was_truncated`).
+    pub content: String,
+    /// `true` when the input was larger than the limit and had to be shortened.
+    pub was_truncated: bool,
+    /// Byte length of the input description before any truncation.
+    pub original_bytes: usize,
+}
+
+/// Bound the size of a description before it is appended to document content.
+///
+/// Intended to be called BEFORE the description is copied into the content
+/// buffer, so that a pathologically large description (e.g., a 500MB base64
+/// blob) never produces an equally large owned copy.
+///
+/// # Parameters
+///
+/// * `desc` - the raw description text.
+/// * `max_bytes` - upper bound, in bytes, for the returned `content`.
+///
+/// # Returns
+///
+/// A [`DescriptionPreTruncateResult`] where:
+///
+/// * if `desc` already fits in `max_bytes`, `content` is an unmodified copy
+///   and `was_truncated` is `false`;
+/// * otherwise `content` is a prefix of `desc` cut on a UTF-8 boundary,
+///   followed by a marker of the form
+///   `"\n\n[... description truncated from <original> to <limit> ...]"`,
+///   sized so that prefix plus marker stay within `max_bytes`;
+/// * if `max_bytes` is too small to hold even the marker, the marker is
+///   omitted and `content` is just the boundary-safe prefix, so the limit is
+///   still honored.
+///
+/// `original_bytes` always reports `desc.len()`.
+pub fn pre_truncate_description(desc: &str, max_bytes: usize) -> DescriptionPreTruncateResult {
+    let original_bytes = desc.len();
+
+    if original_bytes <= max_bytes {
+        return DescriptionPreTruncateResult {
+            content: desc.to_string(),
+            was_truncated: false,
+            original_bytes,
+        };
+    }
+
+    // Build the marker first: its length depends on the formatted sizes, so a
+    // fixed reservation cannot guarantee the final string stays in budget.
+    let marker = format!(
+        "\n\n[... description truncated from {} to {} ...]",
+        format_bytes(original_bytes),
+        format_bytes(max_bytes)
+    );
+
+    let content = match max_bytes.checked_sub(marker.len()) {
+        // The marker fits: keep as much of the description as the remaining
+        // budget allows. This relies on `truncate_utf8` returning a prefix no
+        // longer than the requested byte count.
+        Some(budget) => {
+            let kept = truncate_utf8(desc, budget);
+            let mut content = String::with_capacity(kept.len() + marker.len());
+            content.push_str(kept);
+            content.push_str(&marker);
+            content
+        }
+        // The limit is smaller than the marker itself: honoring the byte
+        // limit matters more than the annotation, so emit the bare prefix.
+        None => truncate_utf8(desc, max_bytes).to_string(),
+    };
+
+    DescriptionPreTruncateResult {
+        content,
+        was_truncated: true,
+        original_bytes,
+    }
+}
+
+/// Render a byte count as a short human-readable string.
+///
+/// Uses decimal units (1KB = 1,000 bytes, 1MB = 1,000,000 bytes) with one
+/// decimal place for KB and MB, and an exact integer for plain bytes:
+/// `999` -> `"999B"`, `1_500` -> `"1.5KB"`, `2_000_000` -> `"2.0MB"`.
+fn format_bytes(bytes: usize) -> String {
+    // Counts from 999,950 up to 999,999 would round to "1000.0KB" at one
+    // decimal place; switch to MB slightly early so they print as "1.0MB".
+    const MB_DISPLAY_THRESHOLD: usize = 999_950;
+
+    if bytes >= MB_DISPLAY_THRESHOLD {
+        // The cast to f64 is for display only; one decimal place is kept.
+        format!("{:.1}MB", bytes as f64 / 1_000_000.0)
+    } else if bytes >= 1_000 {
+        format!("{:.1}KB", bytes as f64 / 1_000.0)
+    } else {
+        format!("{}B", bytes)
+    }
+}
+
 pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
    if notes.is_empty() {
        return TruncationResult {