use super::*; #[test] fn test_empty_content() { let chunks = split_into_chunks(""); assert!(chunks.is_empty()); } #[test] fn test_short_document_single_chunk() { let content = "Short document content."; let chunks = split_into_chunks(content); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].0, 0); assert_eq!(chunks[0].1, content); } #[test] fn test_exactly_max_chars() { let content = "a".repeat(CHUNK_MAX_BYTES); let chunks = split_into_chunks(&content); assert_eq!(chunks.len(), 1); } #[test] fn test_long_document_multiple_chunks() { let paragraph = "This is a paragraph of text.\n\n"; let mut content = String::new(); while content.len() < CHUNK_MAX_BYTES * 2 { content.push_str(paragraph); } let chunks = split_into_chunks(&content); assert!( chunks.len() >= 2, "Expected multiple chunks, got {}", chunks.len() ); for (i, (idx, _)) in chunks.iter().enumerate() { assert_eq!(*idx, i); } assert!(!chunks.last().unwrap().1.is_empty()); } #[test] fn test_chunk_overlap() { let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n"; let mut content = String::new(); while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 { content.push_str(paragraph); } let chunks = split_into_chunks(&content); assert!(chunks.len() >= 2); if chunks.len() >= 2 { let end_of_first = &chunks[0].1; let start_of_second = &chunks[1].1; let overlap_region = &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..]; assert!( start_of_second.starts_with(overlap_region) || overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]), "Expected overlap between chunks" ); } } #[test] fn test_no_paragraph_boundary() { let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3); let chunks = split_into_chunks(&content); assert!(chunks.len() >= 2); for (_, chunk) in &chunks { assert!(!chunk.is_empty()); } } #[test] fn test_chunk_indices_sequential() { let content = "a ".repeat(CHUNK_MAX_BYTES); let chunks = split_into_chunks(&content); for (i, (idx, _)) in chunks.iter().enumerate() { assert_eq!(*idx, i, "Chunk index mismatch at position {}", i); } } #[test] fn test_multibyte_characters_no_panic() { // Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK) // placed at positions likely to hit len()*2/3 and len()/2 boundaries let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. "; let mut content = String::new(); while content.len() < CHUNK_MAX_BYTES * 3 { content.push_str(segment); } // Should not panic on multi-byte boundary let chunks = split_into_chunks(&content); assert!(chunks.len() >= 2); for (_, chunk) in &chunks { assert!(!chunk.is_empty()); } } #[test] fn test_nbsp_at_overlap_boundary() { // Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8) // placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character let mut content = String::new(); // Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0} // near where the overlap subtraction would land let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS; while content.len() < target - 2 { content.push('a'); } content.push('\u{a0}'); // 2-byte char right at the overlap boundary while content.len() < CHUNK_MAX_BYTES * 3 { content.push('b'); } // Should not panic let chunks = split_into_chunks(&content); assert!(chunks.len() >= 2); } #[test] fn test_box_drawing_heavy_content() { // Simulates a document with many box-drawing characters (3-byte UTF-8) // like the ─ (U+2500) character found in markdown tables let mut content = String::new(); // Normal text header content.push_str("# Title\n\nSome description text.\n\n"); // Table header with box drawing content.push('┌'); for _ in 0..200 { content.push('─'); } content.push('┬'); for _ in 0..200 { content.push('─'); } content.push_str("┐\n"); // clippy: push_str is correct here (multi-char) // Table rows for row in 0..50 { content.push_str(&format!("│ row {:<194}│ data {:<193}│\n", row, row)); content.push('├'); for _ in 0..200 { content.push('─'); } content.push('┼'); for _ in 0..200 { content.push('─'); } content.push_str("┤\n"); // push_str for multi-char } content.push('└'); for _ in 0..200 { content.push('─'); } content.push('┴'); for _ in 0..200 { content.push('─'); } content.push_str("┘\n"); // push_str for multi-char eprintln!( "Content size: {} bytes, {} chars", content.len(), content.chars().count() ); let start = std::time::Instant::now(); let chunks = split_into_chunks(&content); let elapsed = start.elapsed(); eprintln!( "Chunking took {:?}, produced {} chunks", elapsed, chunks.len() ); // Should complete in reasonable time assert!( elapsed.as_secs() < 5, "Chunking took too long: {:?}", elapsed ); assert!(!chunks.is_empty()); } #[test] fn test_real_doc_18526_pattern() { // Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces) // followed by newlines, creating a pattern where chunk windows // land in spaceless regions let mut content = String::new(); content.push_str("Header text with spaces\n\n"); // Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES) for _ in 0..800 { content.push('─'); // 3 bytes each = 2400 bytes } content.push('\n'); content.push_str("Some more text.\n\n"); // Another long run for _ in 0..800 { content.push('─'); } content.push('\n'); content.push_str("End text.\n"); eprintln!("Content size: {} bytes", content.len()); let start = std::time::Instant::now(); let chunks = split_into_chunks(&content); let elapsed = start.elapsed(); eprintln!( "Chunking took {:?}, produced {} chunks", elapsed, chunks.len() ); assert!( elapsed.as_secs() < 2, "Chunking took too long: {:?}", elapsed ); assert!(!chunks.is_empty()); }