Change detection queries (embedding/change_detector.rs):
- Replace triple-EXISTS subquery pattern with LEFT JOIN + NULL check
- SQLite now scans embedding_metadata once instead of three times
- Semantically identical: returns docs needing embedding when no embedding exists, hash changed, or config mismatch

Count queries (cli/commands/count.rs):
- Consolidate 3 separate COUNT queries for issues into single query using conditional aggregation (CASE WHEN state = 'x' THEN 1)
- Same optimization for MRs: 5 queries reduced to 1

Search filter queries (search/filters.rs):
- Replace N separate EXISTS clauses for label filtering with single IN() clause with COUNT/GROUP BY HAVING pattern
- For multi-label AND queries, this reduces N subqueries to 1

FTS tokenization (search/fts.rs):
- Replace collect-into-Vec-then-join pattern with direct String building
- Pre-allocate capacity hint for result string

Discussion truncation (documents/truncation.rs):
- Calculate total length without allocating concatenated string first
- Only allocate full string when we know it fits within limit

Embedding pipeline (embedding/pipeline.rs):
- Add Vec::with_capacity hints for chunk work and cleared_docs hashset
- Reduces reallocations during embedding batch processing

Backoff calculation (core/backoff.rs):
- Replace unchecked addition with saturating_add to prevent overflow
- Add test case verifying overflow protection

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
297 lines
9.2 KiB
Rust
297 lines
9.2 KiB
Rust
/// Byte budget for a formatted discussion thread; `truncate_discussion`
/// drops middle notes once the formatted total exceeds this.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;

/// Absolute upper bound on any document's byte length, enforced by
/// `truncate_hard_cap` as a last line of defense.
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;
|
|
|
|
/// A single note (comment) in a discussion thread, already resolved to
/// display-ready strings.
#[derive(Debug, Clone)]
pub struct NoteContent {
    /// Author handle; rendered as `@author` in the formatted output.
    pub author: String,
    /// Date string as supplied by the caller (not parsed here).
    pub date: String,
    /// Note body text.
    pub body: String,
}
|
|
|
|
pub struct TruncationResult {
|
|
pub content: String,
|
|
pub is_truncated: bool,
|
|
pub reason: Option<TruncationReason>,
|
|
}
|
|
|
|
/// Why content was truncated; serialized via `as_str` for storage/logging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Middle notes of a discussion were dropped to fit the byte budget.
    TokenLimitMiddleDrop,
    /// A single note alone exceeded the budget and was hard-truncated.
    SingleNoteOversized,
    /// Even the first and last notes together exceeded the budget.
    FirstLastOversized,
    /// Content exceeded the absolute document hard cap.
    HardCapOversized,
}
|
|
|
|
impl TruncationReason {
    /// Stable snake_case identifier for this reason, suitable for a
    /// database column or log field.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
            Self::SingleNoteOversized => "single_note_oversized",
            Self::FirstLastOversized => "first_last_oversized",
            Self::HardCapOversized => "hard_cap_oversized",
        }
    }
}
|
|
|
|
fn format_note(note: &NoteContent) -> String {
|
|
format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
|
|
}
|
|
|
|
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes and
/// ends on a UTF-8 character boundary (never splits a multi-byte char).
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Scan downward for the first boundary at or below the cap; index 0 is
    // always a boundary, so `find` cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..cut]
}
|
|
|
|
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
|
|
if notes.is_empty() {
|
|
return TruncationResult {
|
|
content: String::new(),
|
|
is_truncated: false,
|
|
reason: None,
|
|
};
|
|
}
|
|
|
|
let formatted: Vec<String> = notes.iter().map(format_note).collect();
|
|
let total_len: usize = formatted.iter().map(|s| s.len()).sum();
|
|
|
|
if total_len <= max_bytes {
|
|
let mut total = String::with_capacity(total_len);
|
|
for s in &formatted {
|
|
total.push_str(s);
|
|
}
|
|
return TruncationResult {
|
|
content: total,
|
|
is_truncated: false,
|
|
reason: None,
|
|
};
|
|
}
|
|
|
|
if notes.len() == 1 {
|
|
let truncated = truncate_utf8(&formatted[0], max_bytes.saturating_sub(11));
|
|
let content = format!("{}[truncated]", truncated);
|
|
return TruncationResult {
|
|
content,
|
|
is_truncated: true,
|
|
reason: Some(TruncationReason::SingleNoteOversized),
|
|
};
|
|
}
|
|
|
|
let last_note = &formatted[formatted.len() - 1];
|
|
|
|
let mut best_n = 0;
|
|
for n in 1..formatted.len() - 1 {
|
|
let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
|
|
let omitted = formatted.len() - n - 1;
|
|
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
|
let candidate_len = first_n + marker.len() + last_note.len();
|
|
if candidate_len <= max_bytes {
|
|
best_n = n;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if best_n > 0 {
|
|
let first_part: String = formatted[..best_n].concat();
|
|
let omitted = formatted.len() - best_n - 1;
|
|
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
|
let content = format!("{}{}{}", first_part, marker, last_note);
|
|
return TruncationResult {
|
|
content,
|
|
is_truncated: true,
|
|
reason: Some(TruncationReason::TokenLimitMiddleDrop),
|
|
};
|
|
}
|
|
|
|
let first_note = &formatted[0];
|
|
if first_note.len() + last_note.len() > max_bytes {
|
|
let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
|
|
let content = format!("{}[truncated]", truncated);
|
|
return TruncationResult {
|
|
content,
|
|
is_truncated: true,
|
|
reason: Some(TruncationReason::FirstLastOversized),
|
|
};
|
|
}
|
|
|
|
let omitted = formatted.len() - 2;
|
|
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
|
let content = format!("{}{}{}", formatted[0], marker, last_note);
|
|
TruncationResult {
|
|
content,
|
|
is_truncated: true,
|
|
reason: Some(TruncationReason::TokenLimitMiddleDrop),
|
|
}
|
|
}
|
|
|
|
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
|
|
if content.len() <= MAX_DOCUMENT_BYTES_HARD {
|
|
return TruncationResult {
|
|
content: content.to_string(),
|
|
is_truncated: false,
|
|
reason: None,
|
|
};
|
|
}
|
|
|
|
let truncated = truncate_utf8(content, MAX_DOCUMENT_BYTES_HARD.saturating_sub(11));
|
|
TruncationResult {
|
|
content: format!("{}[truncated]", truncated),
|
|
is_truncated: true,
|
|
reason: Some(TruncationReason::HardCapOversized),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a NoteContent with a fixed date so tests vary only author/body.
    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    // Small notes under the budget pass through untouched.
    #[test]
    fn test_no_truncation_under_limit() {
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    // Oversized thread keeps first and last notes, drops the middle.
    #[test]
    fn test_middle_notes_dropped() {
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@user0"));
        assert!(result.content.contains("@user9"));
        assert!(result.content.contains("notes omitted for length"));
    }

    // A lone note over the budget is hard-truncated with a marker suffix.
    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        // Small slack allowed for the "[truncated]" suffix.
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    // When even first + last exceed the budget, only a truncated first note survives.
    #[test]
    fn test_first_last_oversized() {
        let big_body = "x".repeat(20_000);
        let notes = vec![make_note("alice", &big_body), make_note("bob", &big_body)];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    // Single small note: no truncation at all.
    #[test]
    fn test_one_note_under_limit() {
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    // Empty input yields empty, untruncated output.
    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    // Cutting at byte 10 inside a 4-byte emoji must back off to byte 8.
    #[test]
    fn test_utf8_boundary_safety() {
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    // Same boundary safety for 3-byte CJK characters.
    #[test]
    fn test_utf8_boundary_cjk() {
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    // Content over the absolute cap is cut and suffixed.
    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    // Content under the cap passes through verbatim.
    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    // 7 notes with 1 kept at the front + the last => "5 notes omitted".
    #[test]
    fn test_marker_count_correct() {
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(
            result
                .content
                .contains("[... 5 notes omitted for length ...]")
        );
    }

    // as_str must stay stable: these strings are persisted identifiers.
    #[test]
    fn test_truncation_reason_as_str() {
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}
|