Files
gitlore/src/documents/truncation.rs
teernisse 23efb15599 feat(truncation): add pre-truncation for oversized descriptions
Add pre_truncate_description() to prevent unbounded memory allocation when
processing pathologically large descriptions (e.g., 500MB base64 blobs in
issue descriptions).

Previously, the document extraction pipeline would:
1. Allocate memory for the entire description
2. Append to content buffer
3. Only truncate at the end via truncate_hard_cap()

For a 500MB description, this would allocate 500MB+ before truncation.

New approach:
1. Check description size BEFORE appending
2. If over limit, truncate at UTF-8 boundary immediately
3. Add human-readable marker: "[... description truncated from 500.0MB to 2.0MB ...]"
4. Log warning with original size for observability

Also adds format_bytes() helper for human-readable byte sizes (B, KB, MB).

This is applied to both issue and MR document extraction in extractor.rs,
protecting the embedding pipeline from OOM on malformed GitLab data.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-26 11:06:32 -05:00

347 lines
11 KiB
Rust

/// Byte budget for a formatted discussion thread (see `truncate_discussion`).
pub const MAX_DISCUSSION_BYTES: usize = 32_000;
/// Absolute upper bound on a single document's byte length (see `truncate_hard_cap`).
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;
/// A single discussion note (comment) plus attribution metadata.
pub struct NoteContent {
    // Author username; rendered as "@author" by `format_note`.
    pub author: String,
    // Date string as supplied by the caller; not parsed here.
    pub date: String,
    // Raw note body text.
    pub body: String,
}
/// Outcome of a truncation pass: the (possibly shortened) content plus
/// metadata describing whether and why truncation happened.
pub struct TruncationResult {
    // Final content; truncated only when `is_truncated` is true.
    pub content: String,
    // True when `content` is shorter than the full input.
    pub is_truncated: bool,
    // Why truncation occurred; `None` when nothing was cut.
    pub reason: Option<TruncationReason>,
}
/// Why content was truncated; `as_str` yields a stable snake_case identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Middle notes were dropped so first + last notes fit the budget.
    TokenLimitMiddleDrop,
    /// A lone note exceeded the budget and was cut directly.
    SingleNoteOversized,
    /// Even the first and last notes together exceeded the budget.
    FirstLastOversized,
    /// Content exceeded the absolute hard cap (`MAX_DOCUMENT_BYTES_HARD`).
    HardCapOversized,
}
impl TruncationReason {
pub fn as_str(&self) -> &'static str {
match self {
Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
Self::SingleNoteOversized => "single_note_oversized",
Self::FirstLastOversized => "first_last_oversized",
Self::HardCapOversized => "hard_cap_oversized",
}
}
}
/// Render a single note as `@author (date):\nbody\n\n`.
fn format_note(note: &NoteContent) -> String {
    // Preallocate: 8 extra bytes cover '@', ' ', '(', ')', ':' and three '\n'.
    let mut rendered =
        String::with_capacity(note.author.len() + note.date.len() + note.body.len() + 8);
    rendered.push('@');
    rendered.push_str(&note.author);
    rendered.push_str(" (");
    rendered.push_str(&note.date);
    rendered.push_str("):\n");
    rendered.push_str(&note.body);
    rendered.push_str("\n\n");
    rendered
}
/// Return the longest prefix of `s` that is at most `max_bytes` bytes long
/// without splitting a UTF-8 code point.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Walk downward from `max_bytes` to the nearest char boundary.
    // Index 0 is always a boundary, so the search cannot fail.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..cut]
}
/// Result of pre-truncating a description to avoid unbounded memory allocation.
pub struct DescriptionPreTruncateResult {
    // The (possibly truncated) description; includes a size marker when cut.
    pub content: String,
    // True if the description exceeded the limit and was cut.
    pub was_truncated: bool,
    // Byte length of the original, untruncated description.
    pub original_bytes: usize,
}
/// Pre-truncate a description to avoid allocating huge amounts of memory.
///
/// This is called BEFORE appending to the document content, so we don't
/// allocate memory for pathologically large descriptions (e.g., 500MB base64 blob).
///
/// Returns the (potentially truncated) description and whether truncation occurred.
/// The returned `content` is guaranteed to be at most `max_bytes` bytes,
/// including the human-readable truncation marker.
pub fn pre_truncate_description(desc: &str, max_bytes: usize) -> DescriptionPreTruncateResult {
    let original_bytes = desc.len();
    if original_bytes <= max_bytes {
        return DescriptionPreTruncateResult {
            content: desc.to_string(),
            was_truncated: false,
            original_bytes,
        };
    }
    // Build the marker first so we can reserve exactly its length.
    // (A fixed 50-byte reserve is insufficient: the marker is already 55
    // bytes for "500.0MB to 2.0MB" and grows with the size strings, which
    // would let the result overrun `max_bytes`.)
    let marker = format!(
        "\n\n[... description truncated from {} to {} ...]",
        format_bytes(original_bytes),
        format_bytes(max_bytes)
    );
    // Truncate at a UTF-8 boundary, leaving room for the marker.
    let truncated = truncate_utf8(desc, max_bytes.saturating_sub(marker.len()));
    let mut content = String::with_capacity(truncated.len() + marker.len());
    content.push_str(truncated);
    content.push_str(&marker);
    DescriptionPreTruncateResult {
        content,
        was_truncated: true,
        original_bytes,
    }
}
/// Format a byte count in human-readable decimal units (B, KB, MB).
fn format_bytes(bytes: usize) -> String {
    const KB: usize = 1_000;
    const MB: usize = 1_000_000;
    match bytes {
        b if b >= MB => format!("{:.1}MB", b as f64 / MB as f64),
        b if b >= KB => format!("{:.1}KB", b as f64 / KB as f64),
        b => format!("{}B", b),
    }
}
/// Truncate a formatted discussion thread down to `max_bytes`.
///
/// Strategies, tried in order:
/// 1. Everything fits -> return all notes verbatim (no truncation).
/// 2. A single oversized note -> hard-truncate it with a "[truncated]" suffix.
/// 3. Middle-drop: keep the first N notes plus the last note, replacing the
///    middle with an "[... k notes omitted for length ...]" marker, using the
///    largest N that fits.
/// 4. If even (first note + marker + last note) doesn't fit, fall back to
///    a truncated first note alone.
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
    // Empty discussion: nothing to truncate.
    if notes.is_empty() {
        return TruncationResult {
            content: String::new(),
            is_truncated: false,
            reason: None,
        };
    }
    let formatted: Vec<String> = notes.iter().map(format_note).collect();
    let total_len: usize = formatted.iter().map(|s| s.len()).sum();
    // Fast path: the whole thread fits within the budget.
    if total_len <= max_bytes {
        let mut total = String::with_capacity(total_len);
        for s in &formatted {
            total.push_str(s);
        }
        return TruncationResult {
            content: total,
            is_truncated: false,
            reason: None,
        };
    }
    // Exactly one note and it is over budget: cut the note itself.
    // The 11 reserved bytes make room for the "[truncated]" suffix.
    if notes.len() == 1 {
        let truncated = truncate_utf8(&formatted[0], max_bytes.saturating_sub(11));
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::SingleNoteOversized),
        };
    }
    // Greedy middle-drop: find the largest prefix (first `n` notes) that,
    // together with the omission marker and the last note, still fits.
    let last_note = &formatted[formatted.len() - 1];
    let mut best_n = 0;
    for n in 1..formatted.len() - 1 {
        let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
        let omitted = formatted.len() - n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let candidate_len = first_n + marker.len() + last_note.len();
        if candidate_len <= max_bytes {
            best_n = n;
        } else {
            // Each step adds a whole note while the marker shrinks by at
            // most one digit, so once we overflow no larger n can fit.
            break;
        }
    }
    if best_n > 0 {
        let first_part: String = formatted[..best_n].concat();
        let omitted = formatted.len() - best_n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let content = format!("{}{}{}", first_part, marker, last_note);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::TokenLimitMiddleDrop),
        };
    }
    // No prefix fit: try just (first note + marker + last note).
    let first_note = &formatted[0];
    let omitted = formatted.len() - 2;
    let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
    if first_note.len() + marker.len() + last_note.len() > max_bytes {
        // Even first + last is too big: keep only a truncated first note.
        let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::FirstLastOversized),
        };
    }
    let content = format!("{}{}{}", formatted[0], marker, last_note);
    TruncationResult {
        content,
        is_truncated: true,
        reason: Some(TruncationReason::TokenLimitMiddleDrop),
    }
}
/// Enforce the absolute document size cap (`MAX_DOCUMENT_BYTES_HARD`).
///
/// Content over the cap is cut at a UTF-8 boundary and suffixed with
/// "[truncated]"; content at or under the cap is returned unchanged.
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
    if content.len() > MAX_DOCUMENT_BYTES_HARD {
        // Reserve 11 bytes for the "[truncated]" suffix.
        let kept = truncate_utf8(content, MAX_DOCUMENT_BYTES_HARD.saturating_sub(11));
        let mut capped = String::with_capacity(kept.len() + 11);
        capped.push_str(kept);
        capped.push_str("[truncated]");
        TruncationResult {
            content: capped,
            is_truncated: true,
            reason: Some(TruncationReason::HardCapOversized),
        }
    } else {
        TruncationResult {
            content: content.to_string(),
            is_truncated: false,
            reason: None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Convenience constructor: note with a fixed date.
    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    // All notes fit the budget: content is returned untouched.
    #[test]
    fn test_no_truncation_under_limit() {
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    // Over-budget thread keeps first and last notes, drops the middle.
    #[test]
    fn test_middle_notes_dropped() {
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@user0"));
        assert!(result.content.contains("@user9"));
        assert!(result.content.contains("notes omitted for length"));
    }

    // A lone oversized note is hard-truncated with the "[truncated]" suffix.
    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    // Even (first + last) notes exceed the budget: first note is truncated.
    #[test]
    fn test_first_last_oversized() {
        let big_body = "x".repeat(20_000);
        let notes = vec![make_note("alice", &big_body), make_note("bob", &big_body)];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    // A single small note passes through untruncated.
    #[test]
    fn test_one_note_under_limit() {
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    // Empty input yields empty, untruncated output.
    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    // 4-byte emoji: truncation backs off to a whole-codepoint boundary.
    #[test]
    fn test_utf8_boundary_safety() {
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    // 3-byte CJK characters: same boundary-safety guarantee.
    #[test]
    fn test_utf8_boundary_cjk() {
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    // Hard cap trims oversized documents and flags the reason.
    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    // Content under the hard cap is returned byte-identical.
    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    // The omission marker reports the exact number of dropped notes.
    #[test]
    fn test_marker_count_correct() {
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(
            result
                .content
                .contains("[... 5 notes omitted for length ...]")
        );
    }

    // Every variant maps to its stable snake_case identifier.
    #[test]
    fn test_truncation_reason_as_str() {
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}