Move inline #[cfg(test)] mod tests { ... } blocks from 22 source files
into dedicated _tests.rs companion files, wired via:
#[cfg(test)]
#[path = "module_tests.rs"]
mod tests;
This keeps implementation-focused source files leaner and more scannable
while preserving full access to private items through `use super::*;`.
Modules extracted:
core: db, note_parser, payloads, project, references, sync_run,
timeline_collect, timeline_expand, timeline_seed
cli: list (55 tests), who (75 tests)
documents: extractor (43 tests), regenerator
embedding: change_detector, chunking
gitlab: graphql (wiremock async tests), transformers/issue
ingestion: dirty_tracker, discussions, issues, mr_diffs
Also adds conflicts_with("explain_score") to the --detail flag in the
who command to prevent mutually exclusive flags from being combined.
All 629 unit tests pass. No behavior changes.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
108 lines
3.1 KiB
Rust
/// Maximum size of a single chunk, in bytes. Inputs at or below this size are
/// returned as one chunk; larger inputs are split by `split_into_chunks`.
pub const CHUNK_MAX_BYTES: usize = 1_500;

/// Expected dimensionality of embedding vectors. Not referenced in this file;
/// presumably validated against the embedding model's output at call sites —
/// TODO confirm (768 matches many BERT-family models).
pub const EXPECTED_DIMS: usize = 768;

/// Overlap carried from the end of one chunk into the start of the next.
/// NOTE(review): despite the `_CHARS` name, `split_into_chunks` applies this
/// value to byte offsets, so the overlap is measured in bytes.
pub const CHUNK_OVERLAP_CHARS: usize = 200;
/// Splits `content` into overlapping chunks of at most `CHUNK_MAX_BYTES` bytes,
/// returning `(chunk_index, chunk_text)` pairs with indices starting at 0.
///
/// Split points are chosen by preference: a paragraph break, then a sentence
/// break, then a word break, then a hard cut at the byte limit (rounded down
/// to a `char` boundary so slicing never panics). Consecutive chunks overlap
/// by up to `CHUNK_OVERLAP_CHARS` bytes so context spanning a boundary appears
/// in both chunks.
///
/// Returns an empty `Vec` for empty input.
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
    if content.is_empty() {
        return Vec::new();
    }

    // Fast path: the whole input fits in a single chunk.
    if content.len() <= CHUNK_MAX_BYTES {
        return vec![(0, content.to_string())];
    }

    let mut chunks: Vec<(usize, String)> = Vec::new();
    let mut start = 0;
    let mut chunk_index = 0;

    while start < content.len() {
        let remaining = &content[start..];
        // Everything left fits in one final chunk — emit it and stop.
        if remaining.len() <= CHUNK_MAX_BYTES {
            chunks.push((chunk_index, remaining.to_string()));
            break;
        }

        // Candidate window: at most CHUNK_MAX_BYTES bytes, end trimmed back
        // to a char boundary so the slice below cannot panic.
        let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
        let window = &content[start..end];

        // Prefer natural break points; fall back to a hard cut at window end.
        // Each finder returns a byte offset relative to `window` (== relative
        // to `start`).
        let split_at = find_paragraph_break(window)
            .or_else(|| find_sentence_break(window))
            .or_else(|| find_word_break(window))
            .unwrap_or(window.len());

        let chunk_text = &content[start..start + split_at];
        chunks.push((chunk_index, chunk_text.to_string()));

        // Step forward by the chunk size minus the overlap, but always by at
        // least one byte so the loop cannot stall on tiny chunks.
        // NOTE(review): `split_at` is a byte offset, so the overlap is applied
        // in bytes despite the `_CHARS` constant name.
        let advance = if split_at > CHUNK_OVERLAP_CHARS {
            split_at - CHUNK_OVERLAP_CHARS
        } else {
            split_at
        }
        .max(1);

        let old_start = start;
        start += advance;
        // Ensure start lands on a char boundary after overlap subtraction
        start = floor_char_boundary(content, start);
        // Guarantee forward progress: multi-byte chars can cause
        // floor_char_boundary to round back to old_start
        if start <= old_start {
            start = old_start
                + content[old_start..]
                    .chars()
                    .next()
                    .map_or(1, |c| c.len_utf8());
        }
        chunk_index += 1;
    }

    chunks
}
fn find_paragraph_break(window: &str) -> Option<usize> {
|
|
let search_start = floor_char_boundary(window, window.len() * 2 / 3);
|
|
window[search_start..]
|
|
.rfind("\n\n")
|
|
.map(|pos| search_start + pos + 2)
|
|
.or_else(|| window[..search_start].rfind("\n\n").map(|pos| pos + 2))
|
|
}
|
|
|
|
fn find_sentence_break(window: &str) -> Option<usize> {
|
|
let search_start = floor_char_boundary(window, window.len() / 2);
|
|
for pat in &[". ", "? ", "! "] {
|
|
if let Some(pos) = window[search_start..].rfind(pat) {
|
|
return Some(search_start + pos + pat.len());
|
|
}
|
|
}
|
|
for pat in &[". ", "? ", "! "] {
|
|
if let Some(pos) = window[..search_start].rfind(pat) {
|
|
return Some(pos + pat.len());
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
fn find_word_break(window: &str) -> Option<usize> {
|
|
let search_start = floor_char_boundary(window, window.len() / 2);
|
|
window[search_start..]
|
|
.rfind(' ')
|
|
.map(|pos| search_start + pos + 1)
|
|
.or_else(|| window[..search_start].rfind(' ').map(|pos| pos + 1))
|
|
}
|
|
|
|
/// Rounds `idx` down to the nearest `char` boundary in `s`.
///
/// Indices at or past the end of the string clamp to `s.len()`. Otherwise the
/// result is the largest boundary `<= idx`; index 0 is always a boundary, so a
/// valid answer always exists. Stand-in for the unstable
/// `str::floor_char_boundary`.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    // Walk down from idx; 0 is always a char boundary, so this terminates
    // with a valid result.
    (0..=idx)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0)
}
// Unit tests live in a companion file (`chunking_tests.rs`) to keep this
// module lean; `use super::*;` there still grants access to private items.
#[cfg(test)]
#[path = "chunking_tests.rs"]
mod tests;