refactor: extract unit tests into separate _tests.rs files
Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files
into dedicated _tests.rs companion files, wired via:
#[cfg(test)]
#[path = "module_tests.rs"]
mod tests;
This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.
Modules extracted:
core: db, note_parser, payloads, project, references, sync_run,
timeline_collect, timeline_expand, timeline_seed
cli: list (55 tests), who (75 tests)
documents: extractor (43 tests), regenerator
embedding: change_detector, chunking
gitlab: graphql (wiremock async tests), transformers/issue
ingestion: dirty_tracker, discussions, issues, mr_diffs
Also adds conflicts_with("explain_score") to the --detail flag in the
who command to prevent mutually exclusive flags from being combined.
All 629 unit tests pass. No behavior changes.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -103,231 +103,5 @@ fn floor_char_boundary(s: &str, idx: usize) -> usize {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_empty_content() {
|
||||
let chunks = split_into_chunks("");
|
||||
assert!(chunks.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_short_document_single_chunk() {
|
||||
let content = "Short document content.";
|
||||
let chunks = split_into_chunks(content);
|
||||
assert_eq!(chunks.len(), 1);
|
||||
assert_eq!(chunks[0].0, 0);
|
||||
assert_eq!(chunks[0].1, content);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exactly_max_chars() {
|
||||
let content = "a".repeat(CHUNK_MAX_BYTES);
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert_eq!(chunks.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_long_document_multiple_chunks() {
|
||||
let paragraph = "This is a paragraph of text.\n\n";
|
||||
let mut content = String::new();
|
||||
while content.len() < CHUNK_MAX_BYTES * 2 {
|
||||
content.push_str(paragraph);
|
||||
}
|
||||
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"Expected multiple chunks, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
for (i, (idx, _)) in chunks.iter().enumerate() {
|
||||
assert_eq!(*idx, i);
|
||||
}
|
||||
|
||||
assert!(!chunks.last().unwrap().1.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_chunk_overlap() {
|
||||
let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
|
||||
let mut content = String::new();
|
||||
while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
|
||||
content.push_str(paragraph);
|
||||
}
|
||||
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
|
||||
if chunks.len() >= 2 {
|
||||
let end_of_first = &chunks[0].1;
|
||||
let start_of_second = &chunks[1].1;
|
||||
let overlap_region =
|
||||
&end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
|
||||
assert!(
|
||||
start_of_second.starts_with(overlap_region)
|
||||
|| overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
|
||||
"Expected overlap between chunks"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_no_paragraph_boundary() {
|
||||
let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
for (_, chunk) in &chunks {
|
||||
assert!(!chunk.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_chunk_indices_sequential() {
|
||||
let content = "a ".repeat(CHUNK_MAX_BYTES);
|
||||
let chunks = split_into_chunks(&content);
|
||||
for (i, (idx, _)) in chunks.iter().enumerate() {
|
||||
assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multibyte_characters_no_panic() {
|
||||
// Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK)
|
||||
// placed at positions likely to hit len()*2/3 and len()/2 boundaries
|
||||
let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
|
||||
let mut content = String::new();
|
||||
while content.len() < CHUNK_MAX_BYTES * 3 {
|
||||
content.push_str(segment);
|
||||
}
|
||||
// Should not panic on multi-byte boundary
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
for (_, chunk) in &chunks {
|
||||
assert!(!chunk.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_nbsp_at_overlap_boundary() {
|
||||
// Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
|
||||
// placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
|
||||
let mut content = String::new();
|
||||
// Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
|
||||
// near where the overlap subtraction would land
|
||||
let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
|
||||
while content.len() < target - 2 {
|
||||
content.push('a');
|
||||
}
|
||||
content.push('\u{a0}'); // 2-byte char right at the overlap boundary
|
||||
while content.len() < CHUNK_MAX_BYTES * 3 {
|
||||
content.push('b');
|
||||
}
|
||||
// Should not panic
|
||||
let chunks = split_into_chunks(&content);
|
||||
assert!(chunks.len() >= 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_box_drawing_heavy_content() {
|
||||
// Simulates a document with many box-drawing characters (3-byte UTF-8)
|
||||
// like the ─ (U+2500) character found in markdown tables
|
||||
let mut content = String::new();
|
||||
// Normal text header
|
||||
content.push_str("# Title\n\nSome description text.\n\n");
|
||||
// Table header with box drawing
|
||||
content.push('┌');
|
||||
for _ in 0..200 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push('┬');
|
||||
for _ in 0..200 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push_str("┐\n"); // clippy: push_str is correct here (multi-char)
|
||||
// Table rows
|
||||
for row in 0..50 {
|
||||
content.push_str(&format!("│ row {:<194}│ data {:<193}│\n", row, row));
|
||||
content.push('├');
|
||||
for _ in 0..200 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push('┼');
|
||||
for _ in 0..200 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push_str("┤\n"); // push_str for multi-char
|
||||
}
|
||||
content.push('└');
|
||||
for _ in 0..200 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push('┴');
|
||||
for _ in 0..200 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push_str("┘\n"); // push_str for multi-char
|
||||
|
||||
eprintln!(
|
||||
"Content size: {} bytes, {} chars",
|
||||
content.len(),
|
||||
content.chars().count()
|
||||
);
|
||||
let start = std::time::Instant::now();
|
||||
let chunks = split_into_chunks(&content);
|
||||
let elapsed = start.elapsed();
|
||||
eprintln!(
|
||||
"Chunking took {:?}, produced {} chunks",
|
||||
elapsed,
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
// Should complete in reasonable time
|
||||
assert!(
|
||||
elapsed.as_secs() < 5,
|
||||
"Chunking took too long: {:?}",
|
||||
elapsed
|
||||
);
|
||||
assert!(!chunks.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_real_doc_18526_pattern() {
|
||||
// Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces)
|
||||
// followed by newlines, creating a pattern where chunk windows
|
||||
// land in spaceless regions
|
||||
let mut content = String::new();
|
||||
content.push_str("Header text with spaces\n\n");
|
||||
// Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES)
|
||||
for _ in 0..800 {
|
||||
content.push('─'); // 3 bytes each = 2400 bytes
|
||||
}
|
||||
content.push('\n');
|
||||
content.push_str("Some more text.\n\n");
|
||||
// Another long run
|
||||
for _ in 0..800 {
|
||||
content.push('─');
|
||||
}
|
||||
content.push('\n');
|
||||
content.push_str("End text.\n");
|
||||
|
||||
eprintln!("Content size: {} bytes", content.len());
|
||||
let start = std::time::Instant::now();
|
||||
let chunks = split_into_chunks(&content);
|
||||
let elapsed = start.elapsed();
|
||||
eprintln!(
|
||||
"Chunking took {:?}, produced {} chunks",
|
||||
elapsed,
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
assert!(
|
||||
elapsed.as_secs() < 2,
|
||||
"Chunking took too long: {:?}",
|
||||
elapsed
|
||||
);
|
||||
assert!(!chunks.is_empty());
|
||||
}
|
||||
}
|
||||
#[path = "chunking_tests.rs"]
|
||||
mod tests;
|
||||
|
||||
Reference in New Issue
Block a user