refactor: extract unit tests into separate _tests.rs files

Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files
into dedicated _tests.rs companion files, wired via:

    #[cfg(test)]
    #[path = "module_tests.rs"]
    mod tests;

This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.

Modules extracted:
  core:      db, note_parser, payloads, project, references, sync_run,
             timeline_collect, timeline_expand, timeline_seed
  cli:       list (55 tests), who (75 tests)
  documents: extractor (43 tests), regenerator
  embedding: change_detector, chunking
  gitlab:    graphql (wiremock async tests), transformers/issue
  ingestion: dirty_tracker, discussions, issues, mr_diffs

Also adds conflicts_with("explain_score") to the --detail flag in the
who command to prevent mutually exclusive flags from being combined.

All 629 unit tests pass. No behavior changes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-13 10:54:02 -05:00
parent 5c2df3df3b
commit 7e0e6a91f2
43 changed files with 11672 additions and 11942 deletions

View File

@@ -85,146 +85,5 @@ pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i6
}
#[cfg(test)]
mod tests {
use std::path::Path;
use super::*;
use crate::core::db::{create_connection, run_migrations};
use crate::embedding::pipeline::record_embedding_error;
const MODEL: &str = "nomic-embed-text";
fn setup_db() -> Connection {
let conn = create_connection(Path::new(":memory:")).unwrap();
run_migrations(&conn).unwrap();
conn
}
fn insert_test_project(conn: &Connection) -> i64 {
conn.execute(
"INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url)
VALUES (1, 'group/test', 'https://gitlab.example.com/group/test')",
[],
)
.unwrap();
conn.last_insert_rowid()
}
fn insert_test_document(conn: &Connection, project_id: i64, content: &str) -> i64 {
conn.execute(
"INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash)
VALUES ('issue', 1, ?1, ?2, 'hash123')",
rusqlite::params![project_id, content],
)
.unwrap();
conn.last_insert_rowid()
}
#[test]
fn retry_failed_delete_makes_doc_pending_again() {
let conn = setup_db();
let proj_id = insert_test_project(&conn);
let doc_id = insert_test_document(&conn, proj_id, "some text content");
// Doc starts as pending
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert_eq!(pending.len(), 1, "Doc should be pending initially");
// Record an error — doc should no longer be pending
record_embedding_error(
&conn,
doc_id,
0,
"hash123",
"chunkhash",
MODEL,
"test error",
)
.unwrap();
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert!(
pending.is_empty(),
"Doc with error metadata should not be pending"
);
// DELETE error rows (mimicking --retry-failed) — doc should become pending again
conn.execute_batch(
"DELETE FROM embeddings WHERE rowid / 1000 IN (
SELECT DISTINCT document_id FROM embedding_metadata
WHERE last_error IS NOT NULL
);
DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;",
)
.unwrap();
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert_eq!(pending.len(), 1, "Doc should be pending again after DELETE");
assert_eq!(pending[0].document_id, doc_id);
}
#[test]
fn empty_doc_with_error_not_pending() {
let conn = setup_db();
let proj_id = insert_test_project(&conn);
let doc_id = insert_test_document(&conn, proj_id, "");
// Empty doc starts as pending
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert_eq!(pending.len(), 1, "Empty doc should be pending initially");
// Record an error for the empty doc
record_embedding_error(
&conn,
doc_id,
0,
"hash123",
"empty",
MODEL,
"Document has empty content",
)
.unwrap();
// Should no longer be pending
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert!(
pending.is_empty(),
"Empty doc with error metadata should not be pending"
);
}
#[test]
fn old_update_approach_leaves_doc_invisible() {
// This test demonstrates WHY we use DELETE instead of UPDATE.
// UPDATE clears last_error but the row still matches config params,
// so the doc stays "not pending" — permanently invisible.
let conn = setup_db();
let proj_id = insert_test_project(&conn);
let doc_id = insert_test_document(&conn, proj_id, "some text content");
// Record an error
record_embedding_error(
&conn,
doc_id,
0,
"hash123",
"chunkhash",
MODEL,
"test error",
)
.unwrap();
// Old approach: UPDATE to clear error
conn.execute(
"UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
WHERE last_error IS NOT NULL",
[],
)
.unwrap();
// Doc is NOT pending — it's permanently invisible! This is the bug.
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert!(
pending.is_empty(),
"UPDATE approach leaves doc invisible (this proves the bug)"
);
}
}
#[path = "change_detector_tests.rs"]
mod tests;

View File

@@ -0,0 +1,141 @@
use std::path::Path;
use super::*;
use crate::core::db::{create_connection, run_migrations};
use crate::embedding::pipeline::record_embedding_error;
const MODEL: &str = "nomic-embed-text";
fn setup_db() -> Connection {
let conn = create_connection(Path::new(":memory:")).unwrap();
run_migrations(&conn).unwrap();
conn
}
fn insert_test_project(conn: &Connection) -> i64 {
conn.execute(
"INSERT INTO projects (gitlab_project_id, path_with_namespace, web_url)
VALUES (1, 'group/test', 'https://gitlab.example.com/group/test')",
[],
)
.unwrap();
conn.last_insert_rowid()
}
fn insert_test_document(conn: &Connection, project_id: i64, content: &str) -> i64 {
conn.execute(
"INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash)
VALUES ('issue', 1, ?1, ?2, 'hash123')",
rusqlite::params![project_id, content],
)
.unwrap();
conn.last_insert_rowid()
}
#[test]
fn retry_failed_delete_makes_doc_pending_again() {
let conn = setup_db();
let proj_id = insert_test_project(&conn);
let doc_id = insert_test_document(&conn, proj_id, "some text content");
// Doc starts as pending
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert_eq!(pending.len(), 1, "Doc should be pending initially");
// Record an error — doc should no longer be pending
record_embedding_error(
&conn,
doc_id,
0,
"hash123",
"chunkhash",
MODEL,
"test error",
)
.unwrap();
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert!(
pending.is_empty(),
"Doc with error metadata should not be pending"
);
// DELETE error rows (mimicking --retry-failed) — doc should become pending again
conn.execute_batch(
"DELETE FROM embeddings WHERE rowid / 1000 IN (
SELECT DISTINCT document_id FROM embedding_metadata
WHERE last_error IS NOT NULL
);
DELETE FROM embedding_metadata WHERE last_error IS NOT NULL;",
)
.unwrap();
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert_eq!(pending.len(), 1, "Doc should be pending again after DELETE");
assert_eq!(pending[0].document_id, doc_id);
}
#[test]
fn empty_doc_with_error_not_pending() {
let conn = setup_db();
let proj_id = insert_test_project(&conn);
let doc_id = insert_test_document(&conn, proj_id, "");
// Empty doc starts as pending
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert_eq!(pending.len(), 1, "Empty doc should be pending initially");
// Record an error for the empty doc
record_embedding_error(
&conn,
doc_id,
0,
"hash123",
"empty",
MODEL,
"Document has empty content",
)
.unwrap();
// Should no longer be pending
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert!(
pending.is_empty(),
"Empty doc with error metadata should not be pending"
);
}
#[test]
fn old_update_approach_leaves_doc_invisible() {
// This test demonstrates WHY we use DELETE instead of UPDATE.
// UPDATE clears last_error but the row still matches config params,
// so the doc stays "not pending" — permanently invisible.
let conn = setup_db();
let proj_id = insert_test_project(&conn);
let doc_id = insert_test_document(&conn, proj_id, "some text content");
// Record an error
record_embedding_error(
&conn,
doc_id,
0,
"hash123",
"chunkhash",
MODEL,
"test error",
)
.unwrap();
// Old approach: UPDATE to clear error
conn.execute(
"UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
WHERE last_error IS NOT NULL",
[],
)
.unwrap();
// Doc is NOT pending — it's permanently invisible! This is the bug.
let pending = find_pending_documents(&conn, 100, 0, MODEL).unwrap();
assert!(
pending.is_empty(),
"UPDATE approach leaves doc invisible (this proves the bug)"
);
}

View File

@@ -103,231 +103,5 @@ fn floor_char_boundary(s: &str, idx: usize) -> usize {
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty_content() {
let chunks = split_into_chunks("");
assert!(chunks.is_empty());
}
#[test]
fn test_short_document_single_chunk() {
let content = "Short document content.";
let chunks = split_into_chunks(content);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].0, 0);
assert_eq!(chunks[0].1, content);
}
#[test]
fn test_exactly_max_chars() {
let content = "a".repeat(CHUNK_MAX_BYTES);
let chunks = split_into_chunks(&content);
assert_eq!(chunks.len(), 1);
}
#[test]
fn test_long_document_multiple_chunks() {
let paragraph = "This is a paragraph of text.\n\n";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES * 2 {
content.push_str(paragraph);
}
let chunks = split_into_chunks(&content);
assert!(
chunks.len() >= 2,
"Expected multiple chunks, got {}",
chunks.len()
);
for (i, (idx, _)) in chunks.iter().enumerate() {
assert_eq!(*idx, i);
}
assert!(!chunks.last().unwrap().1.is_empty());
}
#[test]
fn test_chunk_overlap() {
let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
content.push_str(paragraph);
}
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
if chunks.len() >= 2 {
let end_of_first = &chunks[0].1;
let start_of_second = &chunks[1].1;
let overlap_region =
&end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
assert!(
start_of_second.starts_with(overlap_region)
|| overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
"Expected overlap between chunks"
);
}
}
#[test]
fn test_no_paragraph_boundary() {
let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
for (_, chunk) in &chunks {
assert!(!chunk.is_empty());
}
}
#[test]
fn test_chunk_indices_sequential() {
let content = "a ".repeat(CHUNK_MAX_BYTES);
let chunks = split_into_chunks(&content);
for (i, (idx, _)) in chunks.iter().enumerate() {
assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
}
}
#[test]
fn test_multibyte_characters_no_panic() {
// Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK)
// placed at positions likely to hit len()*2/3 and len()/2 boundaries
let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES * 3 {
content.push_str(segment);
}
// Should not panic on multi-byte boundary
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
for (_, chunk) in &chunks {
assert!(!chunk.is_empty());
}
}
#[test]
fn test_nbsp_at_overlap_boundary() {
// Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
// placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
let mut content = String::new();
// Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
// near where the overlap subtraction would land
let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
while content.len() < target - 2 {
content.push('a');
}
content.push('\u{a0}'); // 2-byte char right at the overlap boundary
while content.len() < CHUNK_MAX_BYTES * 3 {
content.push('b');
}
// Should not panic
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
}
#[test]
fn test_box_drawing_heavy_content() {
// Simulates a document with many box-drawing characters (3-byte UTF-8)
// like the ─ (U+2500) character found in markdown tables
let mut content = String::new();
// Normal text header
content.push_str("# Title\n\nSome description text.\n\n");
// Table header with box drawing
content.push('┌');
for _ in 0..200 {
content.push('─');
}
content.push('┬');
for _ in 0..200 {
content.push('─');
}
content.push_str("\n"); // clippy: push_str is correct here (multi-char)
// Table rows
for row in 0..50 {
content.push_str(&format!("│ row {:<194}│ data {:<193}\n", row, row));
content.push('├');
for _ in 0..200 {
content.push('─');
}
content.push('┼');
for _ in 0..200 {
content.push('─');
}
content.push_str("\n"); // push_str for multi-char
}
content.push('└');
for _ in 0..200 {
content.push('─');
}
content.push('┴');
for _ in 0..200 {
content.push('─');
}
content.push_str("\n"); // push_str for multi-char
eprintln!(
"Content size: {} bytes, {} chars",
content.len(),
content.chars().count()
);
let start = std::time::Instant::now();
let chunks = split_into_chunks(&content);
let elapsed = start.elapsed();
eprintln!(
"Chunking took {:?}, produced {} chunks",
elapsed,
chunks.len()
);
// Should complete in reasonable time
assert!(
elapsed.as_secs() < 5,
"Chunking took too long: {:?}",
elapsed
);
assert!(!chunks.is_empty());
}
#[test]
fn test_real_doc_18526_pattern() {
// Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces)
// followed by newlines, creating a pattern where chunk windows
// land in spaceless regions
let mut content = String::new();
content.push_str("Header text with spaces\n\n");
// Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES)
for _ in 0..800 {
content.push('─'); // 3 bytes each = 2400 bytes
}
content.push('\n');
content.push_str("Some more text.\n\n");
// Another long run
for _ in 0..800 {
content.push('─');
}
content.push('\n');
content.push_str("End text.\n");
eprintln!("Content size: {} bytes", content.len());
let start = std::time::Instant::now();
let chunks = split_into_chunks(&content);
let elapsed = start.elapsed();
eprintln!(
"Chunking took {:?}, produced {} chunks",
elapsed,
chunks.len()
);
assert!(
elapsed.as_secs() < 2,
"Chunking took too long: {:?}",
elapsed
);
assert!(!chunks.is_empty());
}
}
#[path = "chunking_tests.rs"]
mod tests;

View File

@@ -0,0 +1,226 @@
use super::*;
#[test]
fn test_empty_content() {
let chunks = split_into_chunks("");
assert!(chunks.is_empty());
}
#[test]
fn test_short_document_single_chunk() {
let content = "Short document content.";
let chunks = split_into_chunks(content);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].0, 0);
assert_eq!(chunks[0].1, content);
}
#[test]
fn test_exactly_max_chars() {
let content = "a".repeat(CHUNK_MAX_BYTES);
let chunks = split_into_chunks(&content);
assert_eq!(chunks.len(), 1);
}
#[test]
fn test_long_document_multiple_chunks() {
let paragraph = "This is a paragraph of text.\n\n";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES * 2 {
content.push_str(paragraph);
}
let chunks = split_into_chunks(&content);
assert!(
chunks.len() >= 2,
"Expected multiple chunks, got {}",
chunks.len()
);
for (i, (idx, _)) in chunks.iter().enumerate() {
assert_eq!(*idx, i);
}
assert!(!chunks.last().unwrap().1.is_empty());
}
#[test]
fn test_chunk_overlap() {
let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
content.push_str(paragraph);
}
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
if chunks.len() >= 2 {
let end_of_first = &chunks[0].1;
let start_of_second = &chunks[1].1;
let overlap_region =
&end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
assert!(
start_of_second.starts_with(overlap_region)
|| overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
"Expected overlap between chunks"
);
}
}
#[test]
fn test_no_paragraph_boundary() {
let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
for (_, chunk) in &chunks {
assert!(!chunk.is_empty());
}
}
#[test]
fn test_chunk_indices_sequential() {
let content = "a ".repeat(CHUNK_MAX_BYTES);
let chunks = split_into_chunks(&content);
for (i, (idx, _)) in chunks.iter().enumerate() {
assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
}
}
#[test]
fn test_multibyte_characters_no_panic() {
// Build content with multi-byte UTF-8 chars (smart quotes, emoji, CJK)
// placed at positions likely to hit len()*2/3 and len()/2 boundaries
let segment = "We\u{2019}ve gradually ar\u{2014}ranged the components. ";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES * 3 {
content.push_str(segment);
}
// Should not panic on multi-byte boundary
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
for (_, chunk) in &chunks {
assert!(!chunk.is_empty());
}
}
#[test]
fn test_nbsp_at_overlap_boundary() {
// Reproduce the exact crash: \u{a0} (non-breaking space, 2-byte UTF-8)
// placed so that split_at - CHUNK_OVERLAP_CHARS lands mid-character
let mut content = String::new();
// Fill with ASCII up to near CHUNK_MAX_BYTES, then place \u{a0}
// near where the overlap subtraction would land
let target = CHUNK_MAX_BYTES - CHUNK_OVERLAP_CHARS;
while content.len() < target - 2 {
content.push('a');
}
content.push('\u{a0}'); // 2-byte char right at the overlap boundary
while content.len() < CHUNK_MAX_BYTES * 3 {
content.push('b');
}
// Should not panic
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
}
#[test]
fn test_box_drawing_heavy_content() {
// Simulates a document with many box-drawing characters (3-byte UTF-8)
// like the ─ (U+2500) character found in markdown tables
let mut content = String::new();
// Normal text header
content.push_str("# Title\n\nSome description text.\n\n");
// Table header with box drawing
content.push('┌');
for _ in 0..200 {
content.push('─');
}
content.push('┬');
for _ in 0..200 {
content.push('─');
}
content.push_str("\n"); // clippy: push_str is correct here (multi-char)
// Table rows
for row in 0..50 {
content.push_str(&format!("│ row {:<194}│ data {:<193}\n", row, row));
content.push('├');
for _ in 0..200 {
content.push('─');
}
content.push('┼');
for _ in 0..200 {
content.push('─');
}
content.push_str("\n"); // push_str for multi-char
}
content.push('└');
for _ in 0..200 {
content.push('─');
}
content.push('┴');
for _ in 0..200 {
content.push('─');
}
content.push_str("\n"); // push_str for multi-char
eprintln!(
"Content size: {} bytes, {} chars",
content.len(),
content.chars().count()
);
let start = std::time::Instant::now();
let chunks = split_into_chunks(&content);
let elapsed = start.elapsed();
eprintln!(
"Chunking took {:?}, produced {} chunks",
elapsed,
chunks.len()
);
// Should complete in reasonable time
assert!(
elapsed.as_secs() < 5,
"Chunking took too long: {:?}",
elapsed
);
assert!(!chunks.is_empty());
}
#[test]
fn test_real_doc_18526_pattern() {
// Reproduce exact pattern: long lines of ─ (3 bytes each, no spaces)
// followed by newlines, creating a pattern where chunk windows
// land in spaceless regions
let mut content = String::new();
content.push_str("Header text with spaces\n\n");
// Create a very long line of ─ chars (2000+ bytes, exceeding CHUNK_MAX_BYTES)
for _ in 0..800 {
content.push('─'); // 3 bytes each = 2400 bytes
}
content.push('\n');
content.push_str("Some more text.\n\n");
// Another long run
for _ in 0..800 {
content.push('─');
}
content.push('\n');
content.push_str("End text.\n");
eprintln!("Content size: {} bytes", content.len());
let start = std::time::Instant::now();
let chunks = split_into_chunks(&content);
let elapsed = start.elapsed();
eprintln!(
"Chunking took {:?}, produced {} chunks",
elapsed,
chunks.len()
);
assert!(
elapsed.as_secs() < 2,
"Chunking took too long: {:?}",
elapsed
);
assert!(!chunks.is_empty());
}