Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files
into dedicated _tests.rs companion files, wired via:
#[cfg(test)]
#[path = "module_tests.rs"]
mod tests;
This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.
Modules extracted:
core: db, note_parser, payloads, project, references, sync_run,
timeline_collect, timeline_expand, timeline_seed
cli: list (55 tests), who (75 tests)
documents: extractor (43 tests), regenerator
embedding: change_detector, chunking
gitlab: graphql (wiremock async tests), transformers/issue
ingestion: dirty_tracker, discussions, issues, mr_diffs
Also adds conflicts_with("explain_score") to the --detail flag in the
who command to prevent mutually exclusive flags from being combined.
All 629 unit tests pass. No behavior changes.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
227 lines · 6.5 KiB · Rust
use super::*;
|
|
|
|
/// An empty input must produce no chunks at all.
#[test]
fn test_empty_content() {
    assert!(split_into_chunks("").is_empty());
}
/// A document shorter than the chunk limit comes back as exactly one
/// chunk, carrying index 0 and the full, unmodified text.
#[test]
fn test_short_document_single_chunk() {
    let text = "Short document content.";
    let result = split_into_chunks(text);
    assert_eq!(result.len(), 1);
    let (index, body) = &result[0];
    assert_eq!(*index, 0);
    assert_eq!(*body, text);
}
/// Content that is exactly CHUNK_MAX_BYTES long still fits in a single
/// chunk — the limit is inclusive.
#[test]
fn test_exactly_max_chars() {
    let body = "a".repeat(CHUNK_MAX_BYTES);
    assert_eq!(split_into_chunks(&body).len(), 1);
}
/// A document well past the chunk limit splits into several chunks,
/// chunk indices count up from zero, and the final chunk is non-empty.
#[test]
fn test_long_document_multiple_chunks() {
    let para = "This is a paragraph of text.\n\n";
    let mut doc = String::new();
    while doc.len() < CHUNK_MAX_BYTES * 2 {
        doc.push_str(para);
    }

    let result = split_into_chunks(&doc);
    assert!(
        result.len() >= 2,
        "Expected multiple chunks, got {}",
        result.len()
    );

    for (position, (chunk_index, _)) in result.iter().enumerate() {
        assert_eq!(*chunk_index, position);
    }

    assert!(!result.last().unwrap().1.is_empty());
}
/// Adjacent chunks should share an overlap region: the tail of one
/// chunk reappears at (or near) the head of the next.
#[test]
fn test_chunk_overlap() {
    let para = "This is paragraph content for testing chunk overlap behavior.\n\n";
    let mut doc = String::new();
    while doc.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
        doc.push_str(para);
    }

    let result = split_into_chunks(&doc);
    assert!(result.len() >= 2);

    if result.len() >= 2 {
        let first = &result[0].1;
        let second = &result[1].1;
        // Tail of the first chunk, at most CHUNK_OVERLAP_CHARS bytes.
        // Byte slicing is safe here because the content is pure ASCII.
        let tail = &first[first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
        let head = &second[..100.min(second.len())];
        assert!(
            second.starts_with(tail) || tail.contains(head),
            "Expected overlap between chunks"
        );
    }
}
/// Content with no paragraph breaks (space-separated words only) must
/// still split into multiple non-empty chunks.
#[test]
fn test_no_paragraph_boundary() {
    let doc = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
    let result = split_into_chunks(&doc);
    assert!(result.len() >= 2);
    assert!(result.iter().all(|(_, text)| !text.is_empty()));
}
/// Chunk indices must run 0, 1, 2, ... with no gaps or reordering.
#[test]
fn test_chunk_indices_sequential() {
    let doc = "a ".repeat(CHUNK_MAX_BYTES);
    for (expected, (actual, _)) in split_into_chunks(&doc).iter().enumerate() {
        assert_eq!(
            *actual, expected,
            "Chunk index mismatch at position {}",
            expected
        );
    }
}
/// Multi-byte UTF-8 characters (smart quote, em dash) positioned so
/// they are likely to land on the len()*2/3 and len()/2 split points
/// must not cause a mid-character slice panic.
#[test]
fn test_multibyte_characters_no_panic() {
    let piece = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
    let mut doc = String::new();
    while doc.len() < CHUNK_MAX_BYTES * 3 {
        doc.push_str(piece);
    }

    // The call itself is the main assertion: it must not panic.
    let result = split_into_chunks(&doc);
    assert!(result.len() >= 2);
    for (_, text) in &result {
        assert!(!text.is_empty());
    }
}
/// Regression test: a 2-byte \u{a0} (non-breaking space) placed so that
/// `split_at - CHUNK_OVERLAP_CHARS` lands in the middle of the
/// character must not panic the splitter.
#[test]
fn test_nbsp_at_overlap_boundary() {
    let boundary = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
    let mut doc = String::new();
    // ASCII filler up to just before the overlap boundary...
    while doc.len() < boundary - 2 {
        doc.push('a');
    }
    // ...then the 2-byte character straddling that boundary...
    doc.push('\u{a0}');
    // ...then enough trailing ASCII to force multiple chunks.
    while doc.len() < CHUNK_MAX_BYTES * 3 {
        doc.push('b');
    }

    // Must not panic.
    assert!(split_into_chunks(&doc).len() >= 2);
}
/// A markdown-table-heavy document full of 3-byte box-drawing
/// characters (e.g. U+2500 '─') must chunk without pathological
/// slowdown. The generous 5-second ceiling is a "not quadratic" guard.
#[test]
fn test_box_drawing_heavy_content() {
    // One 200-character horizontal run of '─' (3 bytes each in UTF-8),
    // reused for every border segment below.
    let rule = "─".repeat(200);

    let mut doc = String::new();
    doc.push_str("# Title\n\nSome description text.\n\n");

    // Top border: ┌───┬───┐
    doc.push('┌');
    doc.push_str(&rule);
    doc.push('┬');
    doc.push_str(&rule);
    doc.push_str("┐\n");

    // Body rows, each followed by a separator: ├───┼───┤
    for row in 0..50 {
        doc.push_str(&format!("│ row {:<194}│ data {:<193}│\n", row, row));
        doc.push('├');
        doc.push_str(&rule);
        doc.push('┼');
        doc.push_str(&rule);
        doc.push_str("┤\n");
    }

    // Bottom border: └───┴───┘
    doc.push('└');
    doc.push_str(&rule);
    doc.push('┴');
    doc.push_str(&rule);
    doc.push_str("┘\n");

    eprintln!(
        "Content size: {} bytes, {} chars",
        doc.len(),
        doc.chars().count()
    );
    let started = std::time::Instant::now();
    let chunks = split_into_chunks(&doc);
    let took = started.elapsed();
    eprintln!(
        "Chunking took {:?}, produced {} chunks",
        took,
        chunks.len()
    );

    // Should complete in reasonable time
    assert!(
        took.as_secs() < 5,
        "Chunking took too long: {:?}",
        took
    );
    assert!(!chunks.is_empty());
}
/// Regression for a real document (id 18526): very long spaceless runs
/// of '─' (3 bytes each, exceeding CHUNK_MAX_BYTES per line) force the
/// chunk windows into regions with no whitespace; chunking must still
/// finish in bounded time.
#[test]
fn test_real_doc_18526_pattern() {
    // 800 × 3-byte '─' = 2400 bytes with no spaces inside the run.
    let dash_run = "─".repeat(800);

    let mut doc = String::new();
    doc.push_str("Header text with spaces\n\n");
    doc.push_str(&dash_run);
    doc.push('\n');
    doc.push_str("Some more text.\n\n");
    doc.push_str(&dash_run);
    doc.push('\n');
    doc.push_str("End text.\n");

    eprintln!("Content size: {} bytes", doc.len());
    let started = std::time::Instant::now();
    let chunks = split_into_chunks(&doc);
    let took = started.elapsed();
    eprintln!(
        "Chunking took {:?}, produced {} chunks",
        took,
        chunks.len()
    );

    assert!(
        took.as_secs() < 2,
        "Chunking took too long: {:?}",
        took
    );
    assert!(!chunks.is_empty());
}