feat(truncation): add pre-truncation for oversized descriptions
Add pre_truncate_description() to prevent unbounded memory allocation when processing pathologically large descriptions (e.g., 500MB base64 blobs in issue descriptions).

Previously, the document extraction pipeline would:
1. Allocate memory for the entire description
2. Append to content buffer
3. Only truncate at the end via truncate_hard_cap()

For a 500MB description, this would allocate 500MB+ before truncation.

New approach:
1. Check description size BEFORE appending
2. If over limit, truncate at UTF-8 boundary immediately
3. Add human-readable marker: "[... description truncated from 500.0MB to 2.0MB ...]"
4. Log warning with original size for observability

Also adds format_bytes() helper for human-readable byte sizes (B, KB, MB). This is applied to both issue and MR document extraction in extractor.rs, protecting the embedding pipeline from OOM on malformed GitLab data.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -6,10 +6,12 @@ use std::collections::{BTreeSet, HashMap};
|
||||
use std::fmt::Write as _;
|
||||
|
||||
use super::truncation::{
|
||||
MAX_DISCUSSION_BYTES, NoteContent, truncate_discussion, truncate_hard_cap,
|
||||
MAX_DISCUSSION_BYTES, MAX_DOCUMENT_BYTES_HARD, NoteContent, pre_truncate_description,
|
||||
truncate_discussion, truncate_hard_cap,
|
||||
};
|
||||
use crate::core::error::Result;
|
||||
use crate::core::time::ms_to_iso;
|
||||
use tracing::warn;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
@@ -158,7 +160,16 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
|
||||
|
||||
if let Some(ref desc) = description {
|
||||
content.push_str("\n--- Description ---\n\n");
|
||||
content.push_str(desc);
|
||||
// Pre-truncate to avoid unbounded memory allocation for huge descriptions
|
||||
let pre_trunc = pre_truncate_description(desc, MAX_DOCUMENT_BYTES_HARD);
|
||||
if pre_trunc.was_truncated {
|
||||
warn!(
|
||||
iid,
|
||||
original_bytes = pre_trunc.original_bytes,
|
||||
"Issue description truncated (oversized)"
|
||||
);
|
||||
}
|
||||
content.push_str(&pre_trunc.content);
|
||||
}
|
||||
|
||||
let labels_hash = compute_list_hash(&labels);
|
||||
@@ -268,7 +279,16 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
|
||||
|
||||
if let Some(ref desc) = description {
|
||||
content.push_str("\n--- Description ---\n\n");
|
||||
content.push_str(desc);
|
||||
// Pre-truncate to avoid unbounded memory allocation for huge descriptions
|
||||
let pre_trunc = pre_truncate_description(desc, MAX_DOCUMENT_BYTES_HARD);
|
||||
if pre_trunc.was_truncated {
|
||||
warn!(
|
||||
iid,
|
||||
original_bytes = pre_trunc.original_bytes,
|
||||
"MR description truncated (oversized)"
|
||||
);
|
||||
}
|
||||
content.push_str(&pre_trunc.content);
|
||||
}
|
||||
|
||||
let labels_hash = compute_list_hash(&labels);
|
||||
|
||||
@@ -48,6 +48,56 @@ pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
|
||||
&s[..end]
|
||||
}
|
||||
|
||||
/// Result of pre-truncating a description to avoid unbounded memory allocation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DescriptionPreTruncateResult {
    /// The (possibly truncated) description, including the truncation marker
    /// when truncation occurred.
    pub content: String,
    /// Whether the description exceeded the limit and was truncated.
    pub was_truncated: bool,
    /// Byte length of the description before any truncation.
    pub original_bytes: usize,
}
|
||||
|
||||
/// Pre-truncate a description to avoid allocating huge amounts of memory.
|
||||
///
|
||||
/// This is called BEFORE appending to the document content, so we don't
|
||||
/// allocate memory for pathologically large descriptions (e.g., 500MB base64 blob).
|
||||
///
|
||||
/// Returns the (potentially truncated) description and whether truncation occurred.
|
||||
pub fn pre_truncate_description(desc: &str, max_bytes: usize) -> DescriptionPreTruncateResult {
|
||||
let original_bytes = desc.len();
|
||||
|
||||
if original_bytes <= max_bytes {
|
||||
return DescriptionPreTruncateResult {
|
||||
content: desc.to_string(),
|
||||
was_truncated: false,
|
||||
original_bytes,
|
||||
};
|
||||
}
|
||||
|
||||
// Truncate at UTF-8 boundary and add indicator
|
||||
let truncated = truncate_utf8(desc, max_bytes.saturating_sub(50)); // Reserve space for marker
|
||||
let mut content = truncated.to_string();
|
||||
content.push_str("\n\n[... description truncated from ");
|
||||
content.push_str(&format_bytes(original_bytes));
|
||||
content.push_str(" to ");
|
||||
content.push_str(&format_bytes(max_bytes));
|
||||
content.push_str(" ...]");
|
||||
|
||||
DescriptionPreTruncateResult {
|
||||
content,
|
||||
was_truncated: true,
|
||||
original_bytes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Format a byte count as a short human-readable string.
///
/// Uses decimal (SI) units: plain bytes below 1 KB, then KB and MB with one
/// fractional digit (e.g. "999B", "1.5KB", "500.0MB").
fn format_bytes(bytes: usize) -> String {
    const KB: usize = 1_000;
    const MB: usize = 1_000_000;

    match bytes {
        b if b >= MB => format!("{:.1}MB", b as f64 / MB as f64),
        b if b >= KB => format!("{:.1}KB", b as f64 / KB as f64),
        b => format!("{}B", b),
    }
}
|
||||
|
||||
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
|
||||
if notes.is_empty() {
|
||||
return TruncationResult {
|
||||
|
||||
Reference in New Issue
Block a user