feat(documents): Add document generation pipeline with dirty tracking
Implements the documents module that transforms raw ingested entities (issues, MRs, discussions) into searchable document blobs stored in the documents table. This is the foundation for both FTS5 lexical search and vector embedding. Key components: - documents::extractor: Renders entities into structured text documents. Issues include title, description, labels, milestone, assignees, and threaded discussion summaries. MRs additionally include source/target branches, reviewers, and approval status. Discussions are rendered with full note threading. - documents::regenerator: Drains the dirty_queue table to regenerate only documents whose source entities changed since last sync. Supports full rebuild mode (seeds all entities into dirty queue first) and project-scoped regeneration. - documents::truncation: Safety cap at 2MB per document to prevent pathological outliers from degrading FTS or embedding performance. - ingestion::dirty_tracker: Marks entities as dirty inside the ingestion transaction so document regeneration stays consistent with data changes. Uses INSERT OR IGNORE to deduplicate. - ingestion::discussion_queue: Queue-based discussion fetching that isolates individual discussion failures from the broader ingestion pipeline, preventing a single corrupt discussion from blocking an entire project sync. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
329
src/documents/truncation.rs
Normal file
329
src/documents/truncation.rs
Normal file
@@ -0,0 +1,329 @@
/// Maximum byte limit for discussion documents (suitable for embedding chunking).
/// Note: uses `.len()` (byte count), not char count — consistent with `CHUNK_MAX_BYTES`.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;

/// Hard safety cap (bytes) for any document type (pathological content: pasted logs, base64).
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;
/// A single note's content for truncation processing.
///
/// Derives `Debug`/`Clone`/`PartialEq`/`Eq` so notes can be logged,
/// duplicated in tests, and compared directly (all fields are `String`s).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteContent {
    /// Note author's username (rendered as `@author` by `format_note`).
    pub author: String,
    /// Note timestamp, already formatted for display.
    pub date: String,
    /// Raw note body text.
    pub body: String,
}
/// Result of truncation processing.
pub struct TruncationResult {
    /// The (possibly truncated) document text.
    pub content: String,
    /// True when any content was dropped or cut to fit a byte limit.
    pub is_truncated: bool,
    /// Why truncation happened; `None` when `is_truncated` is false.
    pub reason: Option<TruncationReason>,
}
/// Why a document was truncated (matches DB CHECK constraint values).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Middle notes were dropped; the first N notes plus the last note were kept.
    TokenLimitMiddleDrop,
    /// A discussion with a single note exceeded the limit; the note was cut
    /// at a UTF-8 boundary with a `[truncated]` suffix.
    SingleNoteOversized,
    /// First + last notes alone exceeded the limit; only the first note
    /// (truncated) was kept.
    FirstLastOversized,
    /// Content exceeded the hard per-document cap (`MAX_DOCUMENT_BYTES_HARD`).
    HardCapOversized,
}
impl TruncationReason {
|
||||
/// Returns the DB-compatible string matching the CHECK constraint.
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
|
||||
Self::SingleNoteOversized => "single_note_oversized",
|
||||
Self::FirstLastOversized => "first_last_oversized",
|
||||
Self::HardCapOversized => "hard_cap_oversized",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Format a single note as `@author (date):\nbody\n\n`.
|
||||
fn format_note(note: &NoteContent) -> String {
|
||||
format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
|
||||
}
|
||||
|
||||
/// Truncate a string at a UTF-8-safe byte boundary.
/// Returns a slice no longer than `max_bytes` bytes, walking backward
/// to find the nearest char boundary if needed.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Highest index <= max_bytes that is a char boundary. Index 0 is always
    // a boundary, so `find` cannot come up empty; `unwrap_or(0)` is belt-and-braces.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..cut]
}
/// Truncate discussion notes to fit within `max_bytes`.
|
||||
///
|
||||
/// Algorithm:
|
||||
/// 1. Format all notes
|
||||
/// 2. If total fits, return as-is
|
||||
/// 3. Single note: truncate at UTF-8 boundary, append [truncated]
|
||||
/// 4. Try to keep first N notes + last note + marker within limit
|
||||
/// 5. If first + last > limit: keep only first (truncated)
|
||||
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
|
||||
if notes.is_empty() {
|
||||
return TruncationResult {
|
||||
content: String::new(),
|
||||
is_truncated: false,
|
||||
reason: None,
|
||||
};
|
||||
}
|
||||
|
||||
let formatted: Vec<String> = notes.iter().map(format_note).collect();
|
||||
let total: String = formatted.concat();
|
||||
|
||||
// Case 1: fits within limit
|
||||
if total.len() <= max_bytes {
|
||||
return TruncationResult {
|
||||
content: total,
|
||||
is_truncated: false,
|
||||
reason: None,
|
||||
};
|
||||
}
|
||||
|
||||
// Case 2: single note — truncate it
|
||||
if notes.len() == 1 {
|
||||
let truncated = truncate_utf8(&total, max_bytes.saturating_sub(11)); // room for [truncated]
|
||||
let content = format!("{}[truncated]", truncated);
|
||||
return TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::SingleNoteOversized),
|
||||
};
|
||||
}
|
||||
|
||||
// Case 3: multiple notes — try first N + marker + last
|
||||
let last_note = &formatted[formatted.len() - 1];
|
||||
|
||||
// Binary search for max N where first N notes + marker + last note fit
|
||||
let mut best_n = 0;
|
||||
for n in 1..formatted.len() - 1 {
|
||||
let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
|
||||
let omitted = formatted.len() - n - 1;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let candidate_len = first_n + marker.len() + last_note.len();
|
||||
if candidate_len <= max_bytes {
|
||||
best_n = n;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if best_n > 0 {
|
||||
// We can keep first best_n notes + marker + last note
|
||||
let first_part: String = formatted[..best_n].concat();
|
||||
let omitted = formatted.len() - best_n - 1;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let content = format!("{}{}{}", first_part, marker, last_note);
|
||||
return TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::TokenLimitMiddleDrop),
|
||||
};
|
||||
}
|
||||
|
||||
// Case 4: even first + last don't fit — keep only first (truncated)
|
||||
let first_note = &formatted[0];
|
||||
if first_note.len() + last_note.len() > max_bytes {
|
||||
let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
|
||||
let content = format!("{}[truncated]", truncated);
|
||||
return TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::FirstLastOversized),
|
||||
};
|
||||
}
|
||||
|
||||
// Fallback: first + marker + last (0 middle notes kept)
|
||||
let omitted = formatted.len() - 2;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let content = format!("{}{}{}", formatted[0], marker, last_note);
|
||||
TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::TokenLimitMiddleDrop),
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply hard cap truncation to any document type.
|
||||
/// Truncates at UTF-8-safe boundary if content exceeds 2MB.
|
||||
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
|
||||
if content.len() <= MAX_DOCUMENT_BYTES_HARD {
|
||||
return TruncationResult {
|
||||
content: content.to_string(),
|
||||
is_truncated: false,
|
||||
reason: None,
|
||||
};
|
||||
}
|
||||
|
||||
let truncated = truncate_utf8(content, MAX_DOCUMENT_BYTES_HARD.saturating_sub(11));
|
||||
TruncationResult {
|
||||
content: format!("{}[truncated]", truncated),
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::HardCapOversized),
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): the scraped source contained HTML-entity mangling — every
// `&notes` had been corrupted to `¬es` (`&not;` → `¬`), which does not
// compile. Restored `&notes` throughout; test logic is otherwise unchanged.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `NoteContent` with a fixed date for deterministic formatting.
    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    #[test]
    fn test_no_truncation_under_limit() {
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    #[test]
    fn test_middle_notes_dropped() {
        // Create 10 notes where total exceeds limit
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        // First note preserved
        assert!(result.content.contains("@user0"));
        // Last note preserved
        assert!(result.content.contains("@user9"));
        // Marker present
        assert!(result.content.contains("notes omitted for length"));
    }

    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    #[test]
    fn test_first_last_oversized() {
        let big_body = "x".repeat(20_000);
        let notes = vec![
            make_note("alice", &big_body),
            make_note("bob", &big_body),
        ];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_one_note_under_limit() {
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    #[test]
    fn test_utf8_boundary_safety() {
        // Emoji are 4 bytes each
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        // 10 bytes should hold 2 emoji (8 bytes) with 2 bytes left over (not enough for another)
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    #[test]
    fn test_utf8_boundary_cjk() {
        // CJK characters are 3 bytes each
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        // 7 bytes: 2 full chars (6 bytes), 1 byte left (not enough for another)
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    #[test]
    fn test_marker_count_correct() {
        // 7 notes, keep first 1 + last 1, drop middle 5
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(result.content.contains("[... 5 notes omitted for length ...]"));
    }

    #[test]
    fn test_truncation_reason_as_str() {
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}
Reference in New Issue
Block a user