feat(documents): Add document generation pipeline with dirty tracking

Implements the documents module that transforms raw ingested entities
(issues, MRs, discussions) into searchable document blobs stored in
the documents table. This is the foundation for both FTS5 lexical
search and vector embedding.

Key components:

- documents::extractor: Renders entities into structured text documents.
  Issues include title, description, labels, milestone, assignees, and
  threaded discussion summaries. MRs additionally include source/target
  branches, reviewers, and approval status. Discussions are rendered
  with full note threading.

- documents::regenerator: Drains the dirty_queue table to regenerate
  only documents whose source entities changed since last sync. Supports
  full rebuild mode (seeds all entities into dirty queue first) and
  project-scoped regeneration.

- documents::truncation: Safety cap at 2MB per document to prevent
  pathological outliers from degrading FTS or embedding performance.

- ingestion::dirty_tracker: Marks entities as dirty inside the
  ingestion transaction so document regeneration stays consistent
  with data changes. Uses INSERT OR IGNORE to deduplicate.

- ingestion::discussion_queue: Queue-based discussion fetching that
  isolates individual discussion failures from the broader ingestion
  pipeline, preventing a single corrupt discussion from blocking
  an entire project sync.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:46:18 -05:00
parent d31d5292f2
commit 20edff4ab1
7 changed files with 2431 additions and 0 deletions

329
src/documents/truncation.rs Normal file
View File

@@ -0,0 +1,329 @@
/// Maximum byte limit for discussion documents (suitable for embedding chunking).
/// Note: uses `.len()` (byte count), not char count — consistent with `CHUNK_MAX_BYTES`.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;
/// Hard safety cap (bytes) applied to any document type by `truncate_hard_cap`,
/// guarding against pathological content (pasted logs, base64 blobs).
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;
/// A single note's content for truncation processing.
pub struct NoteContent {
    /// Note author's username (rendered as `@author` by `format_note`).
    pub author: String,
    /// Date string, rendered verbatim — formatting is the caller's responsibility.
    pub date: String,
    /// Note body text.
    pub body: String,
}
/// Result of truncation processing.
pub struct TruncationResult {
    /// The (possibly truncated) document text.
    pub content: String,
    /// True when `content` is not the full original text.
    pub is_truncated: bool,
    /// Why truncation happened; `None` when `is_truncated` is false.
    pub reason: Option<TruncationReason>,
}
/// Why a document was truncated (matches DB CHECK constraint values).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    /// Middle notes dropped to fit the discussion byte budget.
    TokenLimitMiddleDrop,
    /// A lone note exceeded the limit and was cut.
    SingleNoteOversized,
    /// Even first + last notes exceeded the limit; only first kept.
    FirstLastOversized,
    /// Document exceeded the 2MB hard cap.
    HardCapOversized,
}

impl TruncationReason {
    /// Returns the DB-compatible string matching the CHECK constraint.
    pub fn as_str(&self) -> &'static str {
        use TruncationReason::*;
        match *self {
            TokenLimitMiddleDrop => "token_limit_middle_drop",
            SingleNoteOversized => "single_note_oversized",
            FirstLastOversized => "first_last_oversized",
            HardCapOversized => "hard_cap_oversized",
        }
    }
}
/// Format a single note as `@author (date):\nbody\n\n`.
fn format_note(note: &NoteContent) -> String {
    // Build directly into one pre-sized buffer (8 bytes of punctuation/newlines).
    let mut rendered =
        String::with_capacity(note.author.len() + note.date.len() + note.body.len() + 8);
    rendered.push('@');
    rendered.push_str(&note.author);
    rendered.push_str(" (");
    rendered.push_str(&note.date);
    rendered.push_str("):\n");
    rendered.push_str(&note.body);
    rendered.push_str("\n\n");
    rendered
}
/// Truncate a string at a UTF-8-safe byte boundary.
/// Returns a slice no longer than `max_bytes` bytes, walking backward
/// to find the nearest char boundary if needed.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        s
    } else {
        // Scan downward from max_bytes for the first valid char boundary.
        // Index 0 is always a boundary, so the search cannot fail.
        let cut = (0..=max_bytes)
            .rev()
            .find(|&i| s.is_char_boundary(i))
            .unwrap_or(0);
        &s[..cut]
    }
}
/// Truncate discussion notes to fit within `max_bytes`.
///
/// Algorithm:
/// 1. Format all notes
/// 2. If total fits, return as-is
/// 3. Single note: truncate at UTF-8 boundary, append [truncated]
/// 4. Keep first N notes + omission marker + last note within limit
/// 5. Otherwise keep only the first note (truncated)
///
/// Invariant: the returned `content` never exceeds `max_bytes`
/// (given `max_bytes >= 11`, room for the "[truncated]" suffix).
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
    if notes.is_empty() {
        return TruncationResult {
            content: String::new(),
            is_truncated: false,
            reason: None,
        };
    }
    let formatted: Vec<String> = notes.iter().map(format_note).collect();
    let total: String = formatted.concat();
    // Case 1: fits within limit
    if total.len() <= max_bytes {
        return TruncationResult {
            content: total,
            is_truncated: false,
            reason: None,
        };
    }
    // Case 2: single note — truncate it at a UTF-8-safe boundary
    if notes.len() == 1 {
        let truncated = truncate_utf8(&total, max_bytes.saturating_sub(11)); // room for [truncated]
        let content = format!("{}[truncated]", truncated);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::SingleNoteOversized),
        };
    }
    // Case 3: multiple notes — keep first N notes + marker + last note.
    // Linear scan for the largest N that fits: the prefix grows by a full
    // note (>= 8 bytes) per step while the marker shrinks by at most one
    // digit, so the first candidate that overflows is final.
    let last_note = &formatted[formatted.len() - 1];
    let mut best_n = 0;
    for n in 1..formatted.len() - 1 {
        let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
        let omitted = formatted.len() - n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        if first_n + marker.len() + last_note.len() <= max_bytes {
            best_n = n;
        } else {
            break;
        }
    }
    if best_n > 0 {
        // Keep first best_n notes + marker + last note
        let first_part: String = formatted[..best_n].concat();
        let omitted = formatted.len() - best_n - 1;
        let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
        let content = format!("{}{}{}", first_part, marker, last_note);
        return TruncationResult {
            content,
            is_truncated: true,
            reason: Some(TruncationReason::TokenLimitMiddleDrop),
        };
    }
    // Case 4: best_n == 0 means even the n == 1 candidate — which is exactly
    // first note + marker + last note — exceeded max_bytes. (A previous
    // version assembled that same string here as a "fallback", which always
    // overflowed the limit.) Keep only the first note, truncated to fit.
    let truncated = truncate_utf8(&formatted[0], max_bytes.saturating_sub(11));
    TruncationResult {
        content: format!("{}[truncated]", truncated),
        is_truncated: true,
        reason: Some(TruncationReason::FirstLastOversized),
    }
}
/// Apply hard cap truncation to any document type.
/// Truncates at a UTF-8-safe boundary if content exceeds `MAX_DOCUMENT_BYTES_HARD` (2MB).
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
    if content.len() > MAX_DOCUMENT_BYTES_HARD {
        // Reserve 11 bytes for the "[truncated]" suffix so the final
        // string still fits under the hard cap.
        let kept = truncate_utf8(content, MAX_DOCUMENT_BYTES_HARD.saturating_sub(11));
        TruncationResult {
            content: format!("{}[truncated]", kept),
            is_truncated: true,
            reason: Some(TruncationReason::HardCapOversized),
        }
    } else {
        TruncationResult {
            content: content.to_string(),
            is_truncated: false,
            reason: None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a note with a fixed date so formatted output is deterministic.
    fn note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.into(),
            date: "2024-01-01".into(),
            body: body.into(),
        }
    }

    #[test]
    fn test_no_truncation_under_limit() {
        let notes = [
            note("alice", "Short note 1"),
            note("bob", "Short note 2"),
            note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        for handle in ["@alice", "@bob", "@carol"] {
            assert!(result.content.contains(handle));
        }
    }

    #[test]
    fn test_middle_notes_dropped() {
        // Ten ~4 KB notes: the total blows past the 10 KB limit.
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        assert!(result.content.contains("@user0")); // first note preserved
        assert!(result.content.contains("@user9")); // last note preserved
        assert!(result.content.contains("notes omitted for length")); // marker present
    }

    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let result = truncate_discussion(&[note("alice", &big_body)], MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    #[test]
    fn test_first_last_oversized() {
        let big_body = "x".repeat(20_000);
        let notes = [note("alice", &big_body), note("bob", &big_body)];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_one_note_under_limit() {
        let result = truncate_discussion(&[note("alice", "Short note")], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    #[test]
    fn test_utf8_boundary_safety() {
        // Each emoji is 4 bytes; 10 bytes holds exactly two of them.
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    #[test]
    fn test_utf8_boundary_cjk() {
        // Each CJK character is 3 bytes; 7 bytes holds exactly two.
        let truncated = truncate_utf8("中文字符测试", 7);
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    #[test]
    fn test_marker_count_correct() {
        // Seven ~5 KB notes at a 12 KB limit: keep first + last, drop five.
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(result.content.contains("[... 5 notes omitted for length ...]"));
    }

    #[test]
    fn test_truncation_reason_as_str() {
        let cases = [
            (TruncationReason::TokenLimitMiddleDrop, "token_limit_middle_drop"),
            (TruncationReason::SingleNoteOversized, "single_note_oversized"),
            (TruncationReason::FirstLastOversized, "first_last_oversized"),
            (TruncationReason::HardCapOversized, "hard_cap_oversized"),
        ];
        for (reason, expected) in cases {
            assert_eq!(reason.as_str(), expected);
        }
    }
}