refactor: extract unit tests into separate _tests.rs files

Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files into
dedicated _tests.rs companion files, wired via:

    #[cfg(test)]
    #[path = "module_tests.rs"]
    mod tests;

This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.

Modules extracted:
  core: db, note_parser, payloads, project, references, sync_run,
        timeline_collect, timeline_expand, timeline_seed
  cli: list (55 tests), who (75 tests)
  documents: extractor (43 tests), regenerator
  embedding: change_detector, chunking
  gitlab: graphql (wiremock async tests), transformers/issue
  ingestion: dirty_tracker, discussions, issues, mr_diffs

Also adds conflicts_with("explain_score") to the --detail flag in the who
command to prevent mutually exclusive flags from being combined.

All 629 unit tests pass. No behavior changes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 10:54:02 -05:00
parent 5c2df3df3b
commit 7e0e6a91f2
43 changed files with 11672 additions and 11942 deletions
--- a/src/embedding/change_detector.rs
+++ b/src/embedding/change_detector.rs
@@ -85,146 +85,5 @@ pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i6
 }

 #[cfg(test)]
-mod tests {
-    use std::path::Path;
-
-    use super::*;
-    use crate::core::db::{create_connection, run_migrations};
-    use crate::embedding::pipeline::record_embedding_error;
-
-    const MODEL: &str = "nomic-embed-text";
-
-    fn setup_db() -> Connection {
-        let conn = create_connection(Path::new(":memory:")).unwrap();
-        run_migrations(&conn).unwrap();
-        conn
-    }
-
-    fn insert_test_project(conn: &Connection) -> i64 {
-        conn.execute(
-            "INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url)
-             VALUES (1, 'group/test', 'https://gitlab.example.com/group/test')",
-            [],
-        )
-        .unwrap();
-        conn.last_insert_rowid()
-    }
-
-    fn insert_test_document(conn: &Connection, project_id: i64, content: &str) -> i64 {
-        conn.execute(
-            "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash)
-             VALUES ('issue', 1, ?1, ?2, 'hash123')",
-            rusqlite::params![project_id, content],
-        )
-        .unwrap();
-        conn.last_insert_rowid()
-    }
-
-    #[test]
-    fn retry_failed_delete_makes_doc_pending_again() {
-        let conn = setup_db();
-        let proj_id = insert_test_project(&conn);
-        let doc_id = insert_test_document(&conn, proj_id, "some text content");
-
-        // Doc starts as pending
-        let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
-        assert_eq!(pending.len(), 1, "Doc should be pending initially");
-
-        // Record an error — doc should no longer be pending
-        record_embedding_error(
-            &conn,
-            doc_id,
-            0,
-            "hash123",
-            "chunkhash",
-            MODEL,
-            "test error",
-        )
-        .unwrap();
-        let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
-        assert!(
-            pending.is_empty(),
-            "Doc with error metadata should not be pending"
-        );
-
-        // DELETE error rows (mimicking --retry-failed) — doc should become pending again
-        conn.execute_batch(
-            "DELETE FROM embeddings WHERE rowid / 1000 IN (
-               SELECT DISTINCT document_id FROM embedding_metadata
-               WHERE last_error IS NOT NULL
-             );
-             DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;",
-        )
-        .unwrap();
-        let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
-        assert_eq!(pending.len(), 1, "Doc should be pending again after DELETE");
-        assert_eq!(pending[0].document_id, doc_id);
-    }
-
-    #[test]
-    fn empty_doc_with_error_not_pending() {
-        let conn = setup_db();
-        let proj_id = insert_test_project(&conn);
-        let doc_id = insert_test_document(&conn, proj_id, "");
-
-        // Empty doc starts as pending
-        let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
-        assert_eq!(pending.len(), 1, "Empty doc should be pending initially");
-
-        // Record an error for the empty doc
-        record_embedding_error(
-            &conn,
-            doc_id,
-            0,
-            "hash123",
-            "empty",
-            MODEL,
-            "Document has empty content",
-        )
-        .unwrap();
-
-        // Should no longer be pending
-        let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
-        assert!(
-            pending.is_empty(),
-            "Empty doc with error metadata should not be pending"
-        );
-    }
-
-    #[test]
-    fn old_update_approach_leaves_doc_invisible() {
-        // This test demonstrates WHY we use DELETE instead of UPDATE.
-        // UPDATE clears last_error but the row still matches config params,
-        // so the doc stays "not pending" — permanently invisible.
-        let conn = setup_db();
-        let proj_id = insert_test_project(&conn);
-        let doc_id = insert_test_document(&conn, proj_id, "some text content");
-
-        // Record an error
-        record_embedding_error(
-            &conn,
-            doc_id,
-            0,
-            "hash123",
-            "chunkhash",
-            MODEL,
-            "test error",
-        )
-        .unwrap();
-
-        // Old approach: UPDATE to clear error
-        conn.execute(
-            "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
-             WHERE last_error IS NOT NULL",
-            [],
-        )
-        .unwrap();
-
-        // Doc is NOT pending — it's permanently invisible! This is the bug.
-        let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
-        assert!(
-            pending.is_empty(),
-            "UPDATE approach leaves doc invisible (this proves the bug)"
-        );
-    }
-}
+#[path = "change_detector_tests.rs"]
+mod tests;
--- a/src/embedding/change_detector_tests.rs
+++ b/src/embedding/change_detector_tests.rs
@@ -0,0 +1,141 @@
+use std::path::Path;
+
+use super::*;
+use crate::core::db::{create_connection, run_migrations};
+use crate::embedding::pipeline::record_embedding_error;
+
+const MODEL: &str = "nomic-embed-text";
+
+fn setup_db() -> Connection {
+    let conn = create_connection(Path::new(":memory:")).unwrap();
+    run_migrations(&conn).unwrap();
+    conn
+}
+
+fn insert_test_project(conn: &Connection) -> i64 {
+    conn.execute(
+        "INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url)
+         VALUES (1, 'group/test', 'https://gitlab.example.com/group/test')",
+        [],
+    )
+    .unwrap();
+    conn.last_insert_rowid()
+}
+
+fn insert_test_document(conn: &Connection, project_id: i64, content: &str) -> i64 {
+    conn.execute(
+        "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash)
+         VALUES ('issue', 1, ?1, ?2, 'hash123')",
+        rusqlite::params![project_id, content],
+    )
+    .unwrap();
+    conn.last_insert_rowid()
+}
+
+#[test]
+fn retry_failed_delete_makes_doc_pending_again() {
+    let conn = setup_db();
+    let proj_id = insert_test_project(&conn);
+    let doc_id = insert_test_document(&conn, proj_id, "some text content");
+
+    // Doc starts as pending
+    let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
+    assert_eq!(pending.len(), 1, "Doc should be pending initially");
+
+    // Record an error — doc should no longer be pending
+    record_embedding_error(
+        &conn,
+        doc_id,
+        0,
+        "hash123",
+        "chunkhash",
+        MODEL,
+        "test error",
+    )
+    .unwrap();
+    let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
+    assert!(
+        pending.is_empty(),
+        "Doc with error metadata should not be pending"
+    );
+
+    // DELETE error rows (mimicking --retry-failed) — doc should become pending again
+    conn.execute_batch(
+        "DELETE FROM embeddings WHERE rowid / 1000 IN (
+           SELECT DISTINCT document_id FROM embedding_metadata
+           WHERE last_error IS NOT NULL
+         );
+         DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;",
+    )
+    .unwrap();
+    let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
+    assert_eq!(pending.len(), 1, "Doc should be pending again after DELETE");
+    assert_eq!(pending[0].document_id, doc_id);
+}
+
+#[test]
+fn empty_doc_with_error_not_pending() {
+    let conn = setup_db();
+    let proj_id = insert_test_project(&conn);
+    let doc_id = insert_test_document(&conn, proj_id, "");
+
+    // Empty doc starts as pending
+    let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
+    assert_eq!(pending.len(), 1, "Empty doc should be pending initially");
+
+    // Record an error for the empty doc
+    record_embedding_error(
+        &conn,
+        doc_id,
+        0,
+        "hash123",
+        "empty",
+        MODEL,
+        "Document has empty content",
+    )
+    .unwrap();
+
+    // Should no longer be pending
+    let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
+    assert!(
+        pending.is_empty(),
+        "Empty doc with error metadata should not be pending"
+    );
+}
+
+#[test]
+fn old_update_approach_leaves_doc_invisible() {
+    // This test demonstrates WHY we use DELETE instead of UPDATE.
+    // UPDATE clears last_error but the row still matches config params,
+    // so the doc stays "not pending" — permanently invisible.
+    let conn = setup_db();
+    let proj_id = insert_test_project(&conn);
+    let doc_id = insert_test_document(&conn, proj_id, "some text content");
+
+    // Record an error
+    record_embedding_error(
+        &conn,
+        doc_id,
+        0,
+        "hash123",
+        "chunkhash",
+        MODEL,
+        "test error",
+    )
+    .unwrap();
+
+    // Old approach: UPDATE to clear error
+    conn.execute(
+        "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
+         WHERE last_error IS NOT NULL",
+        [],
+    )
+    .unwrap();
+
+    // Doc is NOT pending — it's permanently invisible! This is the bug.
+    let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
+    assert!(
+        pending.is_empty(),
+        "UPDATE approach leaves doc invisible (this proves the bug)"
+    );
+}
--- a/src/embedding/chunking.rs
+++ b/src/embedding/chunking.rs
@@ -103,231 +103,5 @@ fn floor_char_boundary(s: &str, idx: usize) -> usize {
 }

 #[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_empty_content() {
-        let chunks = split_into_chunks("");
-        assert!(chunks.is_empty());
-    }
-
-    #[test]
-    fn test_short_document_single_chunk() {
-        let content = "Short document content.";
-        let chunks = split_into_chunks(content);
-        assert_eq!(chunks.len(), 1);
-        assert_eq!(chunks[0].0, 0);
-        assert_eq!(chunks[0].1, content);
-    }
-
-    #[test]
-    fn test_exactly_max_chars() {
-        let content = "a".repeat(CHUNK_MAX_BYTES);
-        let chunks = split_into_chunks(&content);
-        assert_eq!(chunks.len(), 1);
-    }
-
-    #[test]
-    fn test_long_document_multiple_chunks() {
-        let paragraph = "This is a paragraph of text.\n\n";
-        let mut content = String::new();
-        while content.len() < CHUNK_MAX_BYTES * 2 {
-            content.push_str(paragraph);
-        }
-
-        let chunks = split_into_chunks(&content);
-        assert!(
-            chunks.len() >= 2,
-            "Expected multiple chunks, got {}",
-            chunks.len()
-        );
-
-        for (i, (idx, _)) in chunks.iter().enumerate() {
-            assert_eq!(*idx, i);
-        }
-
-        assert!(!chunks.last().unwrap().1.is_empty());
-    }
-
-    #[test]
-    fn test_chunk_overlap() {
-        let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
-        let mut content = String::new();
-        while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
-            content.push_str(paragraph);
-        }
-
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-
-        if chunks.len() >= 2 {
-            let end_of_first = &chunks[0].1;
-            let start_of_second = &chunks[1].1;
-            let overlap_region =
-                &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
-            assert!(
-                start_of_second.starts_with(overlap_region)
-                    || overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
-                "Expected overlap between chunks"
-            );
-        }
-    }
-
-    #[test]
-    fn test_no_paragraph_boundary() {
-        let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-        for (_, chunk) in &chunks {
-            assert!(!chunk.is_empty());
-        }
-    }
-
-    #[test]
-    fn test_chunk_indices_sequential() {
-        let content = "a ".repeat(CHUNK_MAX_BYTES);
-        let chunks = split_into_chunks(&content);
-        for (i, (idx, _)) in chunks.iter().enumerate() {
-            assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
-        }
-    }
-
-    #[test]
-    fn test_multibyte_characters_no_panic() {
-        // Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK)
-        // placed at positions likely to hit len()*2/3 and len()/2 boundaries
-        let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
-        let mut content = String::new();
-        while content.len() < CHUNK_MAX_BYTES * 3 {
-            content.push_str(segment);
-        }
-        // Should not panic on multi-byte boundary
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-        for (_, chunk) in &chunks {
-            assert!(!chunk.is_empty());
-        }
-    }
-
-    #[test]
-    fn test_nbsp_at_overlap_boundary() {
-        // Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
-        // placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
-        let mut content = String::new();
-        // Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
-        // near where the overlap subtraction would land
-        let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
-        while content.len() < target - 2 {
-            content.push('a');
-        }
-        content.push('\u{a0}'); // 2-byte char right at the overlap boundary
-        while content.len() < CHUNK_MAX_BYTES * 3 {
-            content.push('b');
-        }
-        // Should not panic
-        let chunks = split_into_chunks(&content);
-        assert!(chunks.len() >= 2);
-    }
-
-    #[test]
-    fn test_box_drawing_heavy_content() {
-        // Simulates a document with many box-drawing characters (3-byte UTF-8)
-        // like the ─ (U+2500) character found in markdown tables
-        let mut content = String::new();
-        // Normal text header
-        content.push_str("# Title\n\nSome description text.\n\n");
-        // Table header with box drawing
-        content.push('┌');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push('┬');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push_str("┐\n"); // clippy: push_str is correct here (multi-char)
-        // Table rows
-        for row in 0..50 {
-            content.push_str(&format!("│ row {:<194}│ data {:<193}│\n", row, row));
-            content.push('├');
-            for _ in 0..200 {
-                content.push('─');
-            }
-            content.push('┼');
-            for _ in 0..200 {
-                content.push('─');
-            }
-            content.push_str("┤\n"); // push_str for multi-char
-        }
-        content.push('└');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push('┴');
-        for _ in 0..200 {
-            content.push('─');
-        }
-        content.push_str("┘\n"); // push_str for multi-char
-
-        eprintln!(
-            "Content size: {} bytes, {} chars",
-            content.len(),
-            content.chars().count()
-        );
-        let start = std::time::Instant::now();
-        let chunks = split_into_chunks(&content);
-        let elapsed = start.elapsed();
-        eprintln!(
-            "Chunking took {:?}, produced {} chunks",
-            elapsed,
-            chunks.len()
-        );
-
-        // Should complete in reasonable time
-        assert!(
-            elapsed.as_secs() < 5,
-            "Chunking took too long: {:?}",
-            elapsed
-        );
-        assert!(!chunks.is_empty());
-    }
-
-    #[test]
-    fn test_real_doc_18526_pattern() {
-        // Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces)
-        // followed by newlines, creating a pattern where chunk windows
-        // land in spaceless regions
-        let mut content = String::new();
-        content.push_str("Header text with spaces\n\n");
-        // Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES)
-        for _ in 0..800 {
-            content.push('─'); // 3 bytes each = 2400 bytes
-        }
-        content.push('\n');
-        content.push_str("Some more text.\n\n");
-        // Another long run
-        for _ in 0..800 {
-            content.push('─');
-        }
-        content.push('\n');
-        content.push_str("End text.\n");
-
-        eprintln!("Content size: {} bytes", content.len());
-        let start = std::time::Instant::now();
-        let chunks = split_into_chunks(&content);
-        let elapsed = start.elapsed();
-        eprintln!(
-            "Chunking took {:?}, produced {} chunks",
-            elapsed,
-            chunks.len()
-        );
-
-        assert!(
-            elapsed.as_secs() < 2,
-            "Chunking took too long: {:?}",
-            elapsed
-        );
-        assert!(!chunks.is_empty());
-    }
-}
+#[path = "chunking_tests.rs"]
+mod tests;
--- a/src/embedding/chunking_tests.rs
+++ b/src/embedding/chunking_tests.rs
@@ -0,0 +1,226 @@
+use super::*;
+
+#[test]
+fn test_empty_content() {
+    let chunks = split_into_chunks("");
+    assert!(chunks.is_empty());
+}
+
+#[test]
+fn test_short_document_single_chunk() {
+    let content = "Short document content.";
+    let chunks = split_into_chunks(content);
+    assert_eq!(chunks.len(), 1);
+    assert_eq!(chunks[0].0, 0);
+    assert_eq!(chunks[0].1, content);
+}
+
+#[test]
+fn test_exactly_max_chars() {
+    let content = "a".repeat(CHUNK_MAX_BYTES);
+    let chunks = split_into_chunks(&content);
+    assert_eq!(chunks.len(), 1);
+}
+
+#[test]
+fn test_long_document_multiple_chunks() {
+    let paragraph = "This is a paragraph of text.\n\n";
+    let mut content = String::new();
+    while content.len() < CHUNK_MAX_BYTES * 2 {
+        content.push_str(paragraph);
+    }
+
+    let chunks = split_into_chunks(&content);
+    assert!(
+        chunks.len() >= 2,
+        "Expected multiple chunks, got {}",
+        chunks.len()
+    );
+
+    for (i, (idx, _)) in chunks.iter().enumerate() {
+        assert_eq!(*idx, i);
+    }
+
+    assert!(!chunks.last().unwrap().1.is_empty());
+}
+
+#[test]
+fn test_chunk_overlap() {
+    let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
+    let mut content = String::new();
+    while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
+        content.push_str(paragraph);
+    }
+
+    let chunks = split_into_chunks(&content);
+    assert!(chunks.len() >= 2);
+
+    if chunks.len() >= 2 {
+        let end_of_first = &chunks[0].1;
+        let start_of_second = &chunks[1].1;
+        let overlap_region =
+            &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
+        assert!(
+            start_of_second.starts_with(overlap_region)
+                || overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
+            "Expected overlap between chunks"
+        );
+    }
+}
+
+#[test]
+fn test_no_paragraph_boundary() {
+    let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
+    let chunks = split_into_chunks(&content);
+    assert!(chunks.len() >= 2);
+    for (_, chunk) in &chunks {
+        assert!(!chunk.is_empty());
+    }
+}
+
+#[test]
+fn test_chunk_indices_sequential() {
+    let content = "a ".repeat(CHUNK_MAX_BYTES);
+    let chunks = split_into_chunks(&content);
+    for (i, (idx, _)) in chunks.iter().enumerate() {
+        assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
+    }
+}
+
+#[test]
+fn test_multibyte_characters_no_panic() {
+    // Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK)
+    // placed at positions likely to hit len()*2/3 and len()/2 boundaries
+    let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
+    let mut content = String::new();
+    while content.len() < CHUNK_MAX_BYTES * 3 {
+        content.push_str(segment);
+    }
+    // Should not panic on multi-byte boundary
+    let chunks = split_into_chunks(&content);
+    assert!(chunks.len() >= 2);
+    for (_, chunk) in &chunks {
+        assert!(!chunk.is_empty());
+    }
+}
+
+#[test]
+fn test_nbsp_at_overlap_boundary() {
+    // Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
+    // placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
+    let mut content = String::new();
+    // Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
+    // near where the overlap subtraction would land
+    let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
+    while content.len() < target - 2 {
+        content.push('a');
+    }
+    content.push('\u{a0}'); // 2-byte char right at the overlap boundary
+    while content.len() < CHUNK_MAX_BYTES * 3 {
+        content.push('b');
+    }
+    // Should not panic
+    let chunks = split_into_chunks(&content);
+    assert!(chunks.len() >= 2);
+}
+
+#[test]
+fn test_box_drawing_heavy_content() {
+    // Simulates a document with many box-drawing characters (3-byte UTF-8)
+    // like the ─ (U+2500) character found in markdown tables
+    let mut content = String::new();
+    // Normal text header
+    content.push_str("# Title\n\nSome description text.\n\n");
+    // Table header with box drawing
+    content.push('┌');
+    for _ in 0..200 {
+        content.push('─');
+    }
+    content.push('┬');
+    for _ in 0..200 {
+        content.push('─');
+    }
+    content.push_str("┐\n"); // clippy: push_str is correct here (multi-char)
+    // Table rows
+    for row in 0..50 {
+        content.push_str(&format!("│ row {:<194}│ data {:<193}│\n", row, row));
+        content.push('├');
+        for _ in 0..200 {
+            content.push('─');
+        }
+        content.push('┼');
+        for _ in 0..200 {
+            content.push('─');
+        }
+        content.push_str("┤\n"); // push_str for multi-char
+    }
+    content.push('└');
+    for _ in 0..200 {
+        content.push('─');
+    }
+    content.push('┴');
+    for _ in 0..200 {
+        content.push('─');
+    }
+    content.push_str("┘\n"); // push_str for multi-char
+
+    eprintln!(
+        "Content size: {} bytes, {} chars",
+        content.len(),
+        content.chars().count()
+    );
+    let start = std::time::Instant::now();
+    let chunks = split_into_chunks(&content);
+    let elapsed = start.elapsed();
+    eprintln!(
+        "Chunking took {:?}, produced {} chunks",
+        elapsed,
+        chunks.len()
+    );
+
+    // Should complete in reasonable time
+    assert!(
+        elapsed.as_secs() < 5,
+        "Chunking took too long: {:?}",
+        elapsed
+    );
+    assert!(!chunks.is_empty());
+}
+
+#[test]
+fn test_real_doc_18526_pattern() {
+    // Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces)
+    // followed by newlines, creating a pattern where chunk windows
+    // land in spaceless regions
+    let mut content = String::new();
+    content.push_str("Header text with spaces\n\n");
+    // Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES)
+    for _ in 0..800 {
+        content.push('─'); // 3 bytes each = 2400 bytes
+    }
+    content.push('\n');
+    content.push_str("Some more text.\n\n");
+    // Another long run
+    for _ in 0..800 {
+        content.push('─');
+    }
+    content.push('\n');
+    content.push_str("End text.\n");
+
+    eprintln!("Content size: {} bytes", content.len());
+    let start = std::time::Instant::now();
+    let chunks = split_into_chunks(&content);
+    let elapsed = start.elapsed();
+    eprintln!(
+        "Chunking took {:?}, produced {} chunks",
+        elapsed,
+        chunks.len()
+    );
+
+    assert!(
+        elapsed.as_secs() < 2,
+        "Chunking took too long: {:?}",
+        elapsed
+    );
+    assert!(!chunks.is_empty());
+}