refactor: extract unit tests into separate _tests.rs files

Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files into dedicated _tests.rs companion files, wired via:

    #[cfg(test)]
    #[path = "module_tests.rs"]
    mod tests;

This keeps implementation-focused source files leaner and more scannable while preserving full access to private items through `use super::*;`.

Modules extracted:

- core: db, note_parser, payloads, project, references, sync_run, timeline_collect, timeline_expand, timeline_seed
- cli: list (55 tests), who (75 tests)
- documents: extractor (43 tests), regenerator
- embedding: change_detector, chunking
- gitlab: graphql (wiremock async tests), transformers/issue
- ingestion: dirty_tracker, discussions, issues, mr_diffs

Also adds conflicts_with("explain_score") to the --detail flag in the who command to prevent mutually exclusive flags from being combined.

All 629 unit tests pass. No behavior changes other than the new --detail / explain_score flag conflict.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 10:54:02 -05:00
parent 5c2df3df3b
commit 7e0e6a91f2
43 changed files with 11672 additions and 11942 deletions
--- a/src/embedding/chunking.rs
+++ b/src/embedding/chunking.rs
@@ -103,231 +103,5 @@ fn floor_char_boundary(s: &str, idx: usize) -> usize {
 }

 #[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_empty_content() {
-        let chunks = split_into_chunks("");
-        assert!(chunks.is_empty());
-    }
-
-    #[test]
-    fn test_short_document_single_chunk() {
-        let content = "Short document content.";
-        let chunks = split_into_chunks(content);
-        assert_eq!(chunks.len(), 1);
-        assert_eq!(chunks[0].0, 0);
-        assert_eq!(chunks[0].1, content);
-    }
-
-    #[test]
-    fn test_exactly_max_chars() {
-        let content = "a".repeat(CHUNK_MAX_BYTES);
-        let chunks = split_into_chunks(&content);
-        assert_eq!(chunks.len(), 1);
-    }
-
-    #[test]
-    fn test_long_document_multiple_chunks() {
-        let paragraph = "This is a paragraph of text.\n\n";
-        let mut content = String::new();
-        while content.len() < CHUNK_MAX_BYTES * 2 {
-            content.push_str(paragraph);
-        }
-
-        let chunks = split_into_chunks(&content);
-        assert!(
-            chunks.len() >= 2,
-            "Expected multiple chunks, got {}",
-            chunks.len()
-        );
-
-        for (i, (idx, _)) in chunks.iter().enumerate() {
-            assert_eq!(*idx, i);
-        }
-
-        assert!(!chunks.last().unwrap().1.is_empty());
-    }
-
-    #[test]
-    fn test_chunk_overlap() {
-        let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
-        let mut content = String::new();
-        while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
-            content.push_str(paragraph);
-        }
-
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-
-        if chunks.len() >= 2 {
-            let end_of_first = &chunks[0].1;
-            let start_of_second = &chunks[1].1;
-            let overlap_region =
-                &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
-            assert!(
-                start_of_second.starts_with(overlap_region)
-                    || overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
-                "Expected overlap between chunks"
-            );
-        }
-    }
-
-    #[test]
-    fn test_no_paragraph_boundary() {
-        let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-        for (_, chunk) in &chunks {
-            assert!(!chunk.is_empty());
-        }
-    }
-
-    #[test]
-    fn test_chunk_indices_sequential() {
-        let content = "a ".repeat(CHUNK_MAX_BYTES);
-        let chunks = split_into_chunks(&content);
-        for (i, (idx, _)) in chunks.iter().enumerate() {
-            assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
-        }
-    }
-
-    #[test]
-    fn test_multibyte_characters_no_panic() {
-        // Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK)
-        // placed at positions likely to hit len()*2/3 and len()/2 boundaries
-        let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
-        let mut content = String::new();
-        while content.len() < CHUNK_MAX_BYTES * 3 {
-            content.push_str(segment);
-        }
-        // Should not panic on multi-byte boundary
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-        for (_, chunk) in &chunks {
-            assert!(!chunk.is_empty());
-        }
-    }
-
-    #[test]
-    fn test_nbsp_at_overlap_boundary() {
-        // Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
-        // placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
-        let mut content = String::new();
-        // Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
-        // near where the overlap subtraction would land
-        let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
-        while content.len() < target - 2 {
-            content.push('a');
-        }
-        content.push('\u{a0}'); // 2-byte char right at the overlap boundary
-        while content.len() < CHUNK_MAX_BYTES * 3 {
-            content.push('b');
-        }
-        // Should not panic
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-    }
-
-    #[test]
-    fn test_box_drawing_heavy_content() {
-        // Simulates a document with many box-drawing characters (3-byte UTF-8)
-        // like the ─ (U+2500) character found in markdown tables
-        let mut content = String::new();
-        // Normal text header
-        content.push_str("# Title\n\nSome description text.\n\n");
-        // Table header with box drawing
-        content.push('┌');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push('┬');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push_str("┐\n"); // clippy: push_str is correct here (multi-char)
-        // Table rows
-        for row in 0..50 {
-            content.push_str(&format!("│ row {:<194}│ data {:<193}│\n", row, row));
-            content.push('├');
-            for _ in 0..200 {
-                content.push('─');
-            }
-            content.push('┼');
-            for _ in 0..200 {
-                content.push('─');
-            }
-            content.push_str("┤\n"); // push_str for multi-char
-        }
-        content.push('└');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push('┴');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push_str("┘\n"); // push_str for multi-char
-
-        eprintln!(
-            "Content size: {} bytes, {} chars",
-            content.len(),
-            content.chars().count()
-        );
-        let start = std::time::Instant::now();
-        let chunks = split_into_chunks(&content);
-        let elapsed = start.elapsed();
-        eprintln!(
-            "Chunking took {:?}, produced {} chunks",
-            elapsed,
-            chunks.len()
-        );
-
-        // Should complete in reasonable time
-        assert!(
-            elapsed.as_secs() < 5,
-            "Chunking took too long: {:?}",
-            elapsed
-        );
-        assert!(!chunks.is_empty());
-    }
-
-    #[test]
-    fn test_real_doc_18526_pattern() {
-        // Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces)
-        // followed by newlines, creating a pattern where chunk windows
-        // land in spaceless regions
-        let mut content = String::new();
-        content.push_str("Header text with spaces\n\n");
-        // Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES)
-        for _ in 0..800 {
-            content.push('─'); // 3 bytes each = 2400 bytes
-        }
-        content.push('\n');
-        content.push_str("Some more text.\n\n");
-        // Another long run
-        for _ in 0..800 {
-            content.push('─');
-        }
-        content.push('\n');
-        content.push_str("End text.\n");
-
-        eprintln!("Content size: {} bytes", content.len());
-        let start = std::time::Instant::now();
-        let chunks = split_into_chunks(&content);
-        let elapsed = start.elapsed();
-        eprintln!(
-            "Chunking took {:?}, produced {} chunks",
-            elapsed,
-            chunks.len()
-        );
-
-        assert!(
-            elapsed.as_secs() < 2,
-            "Chunking took too long: {:?}",
-            elapsed
-        );
-        assert!(!chunks.is_empty());
-    }
-}
+#[path = "chunking_tests.rs"]
+mod tests;