From 23efb155992a47905ed19fe232834b1783a723a7 Mon Sep 17 00:00:00 2001
From: teernisse
Date: Thu, 26 Feb 2026 11:06:23 -0500
Subject: [PATCH] feat(truncation): add pre-truncation for oversized descriptions

Add pre_truncate_description() to prevent unbounded memory allocation when
processing pathologically large descriptions (e.g., 500MB base64 blobs in
issue descriptions).

Previously, the document extraction pipeline would:
1. Allocate memory for the entire description
2. Append to content buffer
3. Only truncate at the end via truncate_hard_cap()

For a 500MB description, this would allocate 500MB+ before truncation.

New approach:
1. Check description size BEFORE appending
2. If over limit, truncate at UTF-8 boundary immediately
3. Add human-readable marker:
   "[... description truncated from 500.0MB to 2.0MB ...]"
4. Log warning with original size for observability

Also adds format_bytes() helper for human-readable byte sizes (B, KB, MB).

This is applied to both issue and MR document extraction in extractor.rs,
protecting the embedding pipeline from OOM on malformed GitLab data.

Co-Authored-By: Claude Opus 4.5 --- src/documents/extractor.rs | 26 ++++++++++++++++--- src/documents/truncation.rs | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/src/documents/extractor.rs b/src/documents/extractor.rs index 3903cad..3537235 100644 --- a/src/documents/extractor.rs +++ b/src/documents/extractor.rs @@ -6,10 +6,12 @@ use std::collections::{BTreeSet, HashMap}; use std::fmt::Write as _; use super::truncation::{ - MAX_DISCUSSION_BYTES, NoteContent, truncate_discussion, truncate_hard_cap, + MAX_DISCUSSION_BYTES, MAX_DOCUMENT_BYTES_HARD, NoteContent, pre_truncate_description, + truncate_discussion, truncate_hard_cap, }; use crate::core::error::Result; use crate::core::time::ms_to_iso; +use tracing::warn; #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] @@ -158,7 +160,16 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result