refactor: Remove redundant doc comments throughout codebase
Removes module-level doc comments (`//!` lines) and excessive inline doc comments that were duplicating information already evident from:

- Function/struct names (self-documenting code)
- Type signatures (the "what" is clear from types)
- Implementation context (the "how" is clear from code)

Affected modules:

- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain "why" (business decisions, non-obvious constraints), not "what" (which the code itself shows). This change reduces noise and maintenance burden while keeping the codebase just as understandable.

Retains comments for:

- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,25 +1,19 @@
|
||||
/// Upper bound, in bytes, on the text of a discussion document, sized so the
/// result is suitable for embedding chunking.
///
/// The limit is compared against `str::len()`, i.e. a byte count rather than a
/// character count, which keeps it consistent with `CHUNK_MAX_BYTES`.
pub const MAX_DISCUSSION_BYTES: usize = 32 * 1_000;
|
||||
|
||||
/// Absolute safety ceiling, in bytes, applied to a document of any type.
///
/// Exists to guard against pathological content such as pasted logs or
/// base64 blobs.
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2 * 1_000 * 1_000;
|
||||
|
||||
/// A single note's content, as consumed by the truncation routines.
///
/// `format_note` renders a value of this type as
/// `@author (date):\nbody\n\n`.
///
/// Derives the common traits so values can be logged, duplicated, and
/// compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteContent {
    /// Author handle; rendered after a leading `@`.
    pub author: String,
    /// Note date; rendered verbatim inside parentheses. No particular format
    /// is enforced by this type.
    pub date: String,
    /// Note text; rendered verbatim on the line after the header.
    pub body: String,
}
|
||||
|
||||
/// Result of truncation processing.
|
||||
pub struct TruncationResult {
|
||||
pub content: String,
|
||||
pub is_truncated: bool,
|
||||
pub reason: Option<TruncationReason>,
|
||||
}
|
||||
|
||||
/// Why a document was truncated (matches DB CHECK constraint values).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TruncationReason {
|
||||
TokenLimitMiddleDrop,
|
||||
@@ -29,7 +23,6 @@ pub enum TruncationReason {
|
||||
}
|
||||
|
||||
impl TruncationReason {
|
||||
/// Returns the DB-compatible string matching the CHECK constraint.
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::TokenLimitMiddleDrop => "token_limit_middle_drop",
|
||||
@@ -40,19 +33,14 @@ impl TruncationReason {
|
||||
}
|
||||
}
|
||||
|
||||
/// Format a single note as `@author (date):\nbody\n\n`.
|
||||
fn format_note(note: &NoteContent) -> String {
|
||||
format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
|
||||
}
|
||||
|
||||
/// Truncate a string at a UTF-8-safe byte boundary.
|
||||
/// Returns a slice no longer than `max_bytes` bytes, walking backward
|
||||
/// to find the nearest char boundary if needed.
|
||||
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
|
||||
if s.len() <= max_bytes {
|
||||
return s;
|
||||
}
|
||||
// Walk backward from max_bytes to find a char boundary
|
||||
let mut end = max_bytes;
|
||||
while end > 0 && !s.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
@@ -60,14 +48,6 @@ pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
|
||||
&s[..end]
|
||||
}
|
||||
|
||||
/// Truncate discussion notes to fit within `max_bytes`.
|
||||
///
|
||||
/// Algorithm:
|
||||
/// 1. Format all notes
|
||||
/// 2. If total fits, return as-is
|
||||
/// 3. Single note: truncate at UTF-8 boundary, append [truncated]
|
||||
/// 4. Try to keep first N notes + last note + marker within limit
|
||||
/// 5. If first + last > limit: keep only first (truncated)
|
||||
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
|
||||
if notes.is_empty() {
|
||||
return TruncationResult {
|
||||
@@ -80,7 +60,6 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
let formatted: Vec<String> = notes.iter().map(format_note).collect();
|
||||
let total: String = formatted.concat();
|
||||
|
||||
// Case 1: fits within limit
|
||||
if total.len() <= max_bytes {
|
||||
return TruncationResult {
|
||||
content: total,
|
||||
@@ -89,9 +68,8 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
};
|
||||
}
|
||||
|
||||
// Case 2: single note — truncate it
|
||||
if notes.len() == 1 {
|
||||
let truncated = truncate_utf8(&total, max_bytes.saturating_sub(11)); // room for [truncated]
|
||||
let truncated = truncate_utf8(&total, max_bytes.saturating_sub(11));
|
||||
let content = format!("{}[truncated]", truncated);
|
||||
return TruncationResult {
|
||||
content,
|
||||
@@ -100,10 +78,8 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
};
|
||||
}
|
||||
|
||||
// Case 3: multiple notes — try first N + marker + last
|
||||
let last_note = &formatted[formatted.len() - 1];
|
||||
|
||||
// Linear scan for the largest N where the first N notes + marker + last note fit
|
||||
let mut best_n = 0;
|
||||
for n in 1..formatted.len() - 1 {
|
||||
let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
|
||||
@@ -118,7 +94,6 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
}
|
||||
|
||||
if best_n > 0 {
|
||||
// We can keep first best_n notes + marker + last note
|
||||
let first_part: String = formatted[..best_n].concat();
|
||||
let omitted = formatted.len() - best_n - 1;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
@@ -130,7 +105,6 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
};
|
||||
}
|
||||
|
||||
// Case 4: even first + last don't fit — keep only first (truncated)
|
||||
let first_note = &formatted[0];
|
||||
if first_note.len() + last_note.len() > max_bytes {
|
||||
let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
|
||||
@@ -142,7 +116,6 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
};
|
||||
}
|
||||
|
||||
// Fallback: first + marker + last (0 middle notes kept)
|
||||
let omitted = formatted.len() - 2;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let content = format!("{}{}{}", formatted[0], marker, last_note);
|
||||
@@ -153,8 +126,6 @@ pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> Truncatio
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply hard cap truncation to any document type.
|
||||
/// Truncates at UTF-8-safe boundary if content exceeds 2MB.
|
||||
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
|
||||
if content.len() <= MAX_DOCUMENT_BYTES_HARD {
|
||||
return TruncationResult {
|
||||
@@ -201,7 +172,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_middle_notes_dropped() {
|
||||
// Create 10 notes where total exceeds limit
|
||||
let big_body = "x".repeat(4000);
|
||||
let notes: Vec<NoteContent> = (0..10)
|
||||
.map(|i| make_note(&format!("user{}", i), &big_body))
|
||||
@@ -209,11 +179,8 @@ mod tests {
|
||||
let result = truncate_discussion(&notes, 10_000);
|
||||
assert!(result.is_truncated);
|
||||
assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
|
||||
// First note preserved
|
||||
assert!(result.content.contains("@user0"));
|
||||
// Last note preserved
|
||||
assert!(result.content.contains("@user9"));
|
||||
// Marker present
|
||||
assert!(result.content.contains("notes omitted for length"));
|
||||
}
|
||||
|
||||
@@ -256,20 +223,16 @@ mod tests {
|
||||
|
||||
#[test]
fn test_utf8_boundary_safety() {
    // Each 🎉 encodes to 4 bytes in UTF-8, so a 10-byte budget ends two bytes
    // into the third emoji and must back off to the 8-byte boundary.
    let input = "🎉".repeat(10);
    let clipped = truncate_utf8(&input, 10);
    assert_eq!(clipped.len(), 8);
    assert_eq!(clipped, "🎉🎉");
}
|
||||
|
||||
#[test]
fn test_utf8_boundary_cjk() {
    // Each of these CJK characters encodes to 3 bytes in UTF-8, so a 7-byte
    // budget covers two whole characters (6 bytes) and one stray byte of the
    // third, which must be dropped.
    let input = "中文字符测试";
    let clipped = truncate_utf8(input, 7);
    assert_eq!(clipped, "中文");
    assert_eq!(clipped.len(), 6);
}
|
||||
@@ -294,7 +257,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_marker_count_correct() {
|
||||
// 7 notes, keep first 1 + last 1, drop middle 5
|
||||
let big_body = "x".repeat(5000);
|
||||
let notes: Vec<NoteContent> = (0..7)
|
||||
.map(|i| make_note(&format!("user{}", i), &big_body))
|
||||
|
||||
Reference in New Issue
Block a user