Add pre_truncate_description() to prevent unbounded memory allocation when processing pathologically large descriptions (e.g., 500MB base64 blobs in issue descriptions). Previously, the document extraction pipeline would: 1. Allocate memory for the entire description 2. Append to content buffer 3. Only truncate at the end via truncate_hard_cap() For a 500MB description, this would allocate 500MB+ before truncation. New approach: 1. Check description size BEFORE appending 2. If over limit, truncate at UTF-8 boundary immediately 3. Add human-readable marker: "[... description truncated from 500.0MB to 2.0MB ...]" 4. Log warning with original size for observability Also adds format_bytes() helper for human-readable byte sizes (B, KB, MB). This is applied to both issue and MR document extraction in extractor.rs, protecting the embedding pipeline from OOM on malformed GitLab data. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
347 lines
11 KiB
Rust
347 lines
11 KiB
Rust
/// Maximum byte budget for an assembled discussion transcript.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;

/// Absolute hard cap on any single document's byte length; enforced as a
/// final safety net after all other truncation strategies have run.
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;
|
|
|
|
/// A single discussion note, ready to be formatted into a transcript entry.
pub struct NoteContent {
    /// Author username (rendered as `@author` by `format_note`).
    pub author: String,
    /// Note timestamp, already formatted as a display string
    /// (tests use "2024-01-01"; exact format is up to the caller).
    pub date: String,
    /// The note's body text.
    pub body: String,
}
|
|
|
|
/// Outcome of a truncation pass over discussion or document content.
pub struct TruncationResult {
    /// The (possibly truncated) output text.
    pub content: String,
    /// True when any content was dropped or cut.
    pub is_truncated: bool,
    /// Which truncation strategy fired; `None` when nothing was truncated.
    pub reason: Option<TruncationReason>,
}
|
|
|
|
/// Why truncation occurred; serialized to snake_case for logs via `as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Middle notes were dropped; a leading prefix and the last note were kept.
    TokenLimitMiddleDrop,
    /// A lone note exceeded the budget and was hard-truncated.
    SingleNoteOversized,
    /// Even the first and last notes together exceeded the budget.
    FirstLastOversized,
    /// Content exceeded the absolute document hard cap.
    HardCapOversized,
}
|
|
|
|
impl TruncationReason {
|
|
pub fn as_str(&self) -> &'static str {
|
|
match self {
|
|
Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
|
|
Self::SingleNoteOversized => "single_note_oversized",
|
|
Self::FirstLastOversized => "first_last_oversized",
|
|
Self::HardCapOversized => "hard_cap_oversized",
|
|
}
|
|
}
|
|
}
|
|
|
|
fn format_note(note: &NoteContent) -> String {
|
|
format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
|
|
}
|
|
|
|
/// Truncate `s` to at most `max_bytes` bytes without splitting a UTF-8
/// code point. Returns the input unchanged when it already fits.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Walk backwards to the nearest char boundary at or below `max_bytes`.
    // `is_char_boundary(0)` is always true, so this terminates with end >= 0.
    let mut end = max_bytes;
    while end > 0 && !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}

/// Result of pre-truncating a description to avoid unbounded memory allocation.
pub struct DescriptionPreTruncateResult {
    /// Possibly-truncated description; a human-readable marker is appended
    /// when truncation occurred.
    pub content: String,
    /// True when the input exceeded `max_bytes` and was cut down.
    pub was_truncated: bool,
    /// Byte length of the input before truncation (for observability/logging).
    pub original_bytes: usize,
}

/// Pre-truncate a description to avoid allocating huge amounts of memory.
///
/// This is called BEFORE appending to the document content, so we don't
/// allocate memory for pathologically large descriptions (e.g., 500MB base64 blob).
///
/// Returns the (potentially truncated) description and whether truncation occurred.
/// The returned `content` is guaranteed not to exceed `max_bytes` whenever
/// `max_bytes` is large enough to hold the marker itself.
pub fn pre_truncate_description(desc: &str, max_bytes: usize) -> DescriptionPreTruncateResult {
    let original_bytes = desc.len();

    if original_bytes <= max_bytes {
        return DescriptionPreTruncateResult {
            content: desc.to_string(),
            was_truncated: false,
            original_bytes,
        };
    }

    // Build the marker first so we can reserve exactly the space it needs.
    // A fixed 50-byte reservation under-estimates: truncating 500MB to 2MB
    // yields a 55-byte marker, pushing the result past `max_bytes`.
    let marker = format!(
        "\n\n[... description truncated from {} to {} ...]",
        format_bytes(original_bytes),
        format_bytes(max_bytes)
    );

    // Truncate at a UTF-8 boundary, leaving room for the marker.
    let truncated = truncate_utf8(desc, max_bytes.saturating_sub(marker.len()));
    let mut content = String::with_capacity(truncated.len() + marker.len());
    content.push_str(truncated);
    content.push_str(&marker);

    DescriptionPreTruncateResult {
        content,
        was_truncated: true,
        original_bytes,
    }
}

/// Format a byte count as a short human-readable size using decimal units:
/// "500B", "1.5KB", "2.0MB".
fn format_bytes(bytes: usize) -> String {
    if bytes >= 1_000_000 {
        format!("{:.1}MB", bytes as f64 / 1_000_000.0)
    } else if bytes >= 1_000 {
        format!("{:.1}KB", bytes as f64 / 1_000.0)
    } else {
        format!("{}B", bytes)
    }
}
|
|
|
|
/// Assemble a bounded discussion transcript from `notes`.
///
/// Strategy, tried in order:
/// 1. Everything fits within `max_bytes` -> concatenate all notes verbatim.
/// 2. A single oversized note -> hard-truncate it at a UTF-8 boundary.
/// 3. Keep the largest prefix of notes plus the last note, with an
///    "[... N notes omitted ...]" marker in between.
/// 4. Even first + last alone overflow -> truncate the first note.
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
    if notes.is_empty() {
        return TruncationResult {
            content: String::new(),
            is_truncated: false,
            reason: None,
        };
    }

    // Format every note once up front so length accounting is exact.
    let formatted: Vec<String> = notes.iter().map(format_note).collect();
    let total_len: usize = formatted.iter().map(|s| s.len()).sum();

    if total_len <= max_bytes {
        // Fast path: everything fits; concatenate with a single allocation.
        let mut total = String::with_capacity(total_len);
        for s in &formatted {
            total.push_str(s);
        }
        return TruncationResult {
            content: total,
            is_truncated: false,
            reason: None,
        };
    }

    if notes.len() == 1 {
        // One note that is itself over the limit: hard-truncate it.
        // 11 bytes are reserved for the "[truncated]" suffix.
        let truncated = truncate_utf8(&formatted[0], max_bytes.saturating_sub(11));
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::SingleNoteOversized),
        };
    }

    // The last note is always a candidate to keep alongside a leading prefix.
    let last_note = &formatted[formatted.len() - 1];

    // Greedily grow the kept prefix. Prefix length is monotonically
    // increasing in n, so we can stop at the first n that overflows.
    let mut best_n = 0;
    for n in 1..formatted.len() - 1 {
        let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
        let omitted = formatted.len() - n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let candidate_len = first_n + marker.len() + last_note.len();
        if candidate_len <= max_bytes {
            best_n = n;
        } else {
            break;
        }
    }

    if best_n > 0 {
        // Keep the first `best_n` notes, drop the middle, keep the last.
        let first_part: String = formatted[..best_n].concat();
        let omitted = formatted.len() - best_n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let content = format!("{}{}{}", first_part, marker, last_note);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::TokenLimitMiddleDrop),
        };
    }

    // Not even one leading note fit. Check whether first + last alone fit.
    let first_note = &formatted[0];
    let omitted = formatted.len() - 2;
    let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
    if first_note.len() + marker.len() + last_note.len() > max_bytes {
        // Even first + last overflow: fall back to truncating the first note.
        let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::FirstLastOversized),
        };
    }

    // First + last fit with everything in between omitted.
    let content = format!("{}{}{}", formatted[0], marker, last_note);
    TruncationResult {
        content,
        is_truncated: true,
        reason: Some(TruncationReason::TokenLimitMiddleDrop),
    }
}
|
|
|
|
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
|
|
if content.len() <= MAX_DOCUMENT_BYTES_HARD {
|
|
return TruncationResult {
|
|
content: content.to_string(),
|
|
is_truncated: false,
|
|
reason: None,
|
|
};
|
|
}
|
|
|
|
let truncated = truncate_utf8(content, MAX_DOCUMENT_BYTES_HARD.saturating_sub(11));
|
|
TruncationResult {
|
|
content: format!("{}[truncated]", truncated),
|
|
is_truncated: true,
|
|
reason: Some(TruncationReason::HardCapOversized),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `NoteContent` with a fixed date so transcripts are predictable.
    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    #[test]
    fn test_no_truncation_under_limit() {
        // Three short notes fit comfortably; everything is preserved.
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    #[test]
    fn test_middle_notes_dropped() {
        // Ten ~4KB notes against a 10KB budget: keep first and last,
        // drop the middle behind an omission marker.
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@user0"));
        assert!(result.content.contains("@user9"));
        assert!(result.content.contains("notes omitted for length"));
    }

    #[test]
    fn test_single_note_oversized() {
        // A lone 50KB note is hard-truncated with a "[truncated]" suffix.
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        // Small slack allowed for the suffix itself.
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    #[test]
    fn test_first_last_oversized() {
        // Two 20KB notes cannot both fit a 10KB budget, even alone.
        let big_body = "x".repeat(20_000);
        let notes = vec![make_note("alice", &big_body), make_note("bob", &big_body)];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_one_note_under_limit() {
        // Single small note passes through untouched.
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    #[test]
    fn test_empty_notes() {
        // Empty input yields empty, non-truncated output.
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    #[test]
    fn test_utf8_boundary_safety() {
        // Each emoji is 4 bytes; a 10-byte cut must land on a boundary (8).
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    #[test]
    fn test_utf8_boundary_cjk() {
        // Each CJK char is 3 bytes; a 7-byte cut rounds down to 6 bytes.
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    #[test]
    fn test_hard_cap() {
        // 3MB exceeds the 2MB hard cap; the suffix marks the cut.
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_hard_cap_under_limit() {
        // Content under the cap passes through byte-for-byte.
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    #[test]
    fn test_marker_count_correct() {
        // 7 notes of ~5KB against 12KB: first + last kept, 5 omitted.
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(
            result
                .content
                .contains("[... 5 notes omitted for length ...]")
        );
    }

    #[test]
    fn test_truncation_reason_as_str() {
        // Each variant maps to its stable snake_case log identifier.
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}
|