/// Upper bound, in bytes of UTF-8 text, on the size of any chunk produced by
/// [`split_into_chunks`].
pub const CHUNK_MAX_BYTES: usize = 1_500;

/// Not referenced anywhere in this file.
/// NOTE(review): presumably the expected embedding vector width — confirm
/// against the callers that import it.
pub const EXPECTED_DIMS: usize = 768;

/// How far each chunk reaches back into the text of the previous chunk.
///
/// Despite the `_CHARS` suffix, this value is subtracted from a *byte* offset
/// in [`split_into_chunks`], so the overlap is measured in bytes.
pub const CHUNK_OVERLAP_CHARS: usize = 200;

/// Splits `content` into overlapping chunks of at most [`CHUNK_MAX_BYTES`] bytes.
///
/// Returns `(chunk_index, chunk_text)` pairs; indices start at 0 and increase
/// by one per chunk.
///
/// * Empty input yields an empty vector.
/// * Input that already fits in [`CHUNK_MAX_BYTES`] is returned as one chunk.
/// * Otherwise each chunk is cut at the best break found inside the next
///   `CHUNK_MAX_BYTES`-byte window: a blank line, else a sentence end, else a
///   space, else the end of the window. The next chunk then starts
///   [`CHUNK_OVERLAP_CHARS`] bytes before that cut (rounded down to a char
///   boundary) when the chunk was longer than the overlap, and exactly at the
///   cut otherwise.
///
/// Every slice is taken on a UTF-8 char boundary, so multi-byte text does not
/// cause a slicing panic.
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
    if content.is_empty() {
        return Vec::new();
    }
    if content.len() <= CHUNK_MAX_BYTES {
        return vec![(0, content.to_string())];
    }

    let mut chunks: Vec<(usize, String)> = Vec::new();
    let mut start = 0;
    let mut chunk_index = 0;

    while start < content.len() {
        let remaining = &content[start..];
        if remaining.len() <= CHUNK_MAX_BYTES {
            // The tail fits in one chunk: emit it and stop.
            chunks.push((chunk_index, remaining.to_string()));
            break;
        }

        let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
        let window = &content[start..end];

        // Offsets returned by the helpers are relative to `window`.
        let split_at = find_paragraph_break(window)
            .or_else(|| find_sentence_break(window))
            .or_else(|| find_word_break(window))
            .unwrap_or(window.len());

        chunks.push((chunk_index, window[..split_at].to_string()));

        // Step back by the overlap only when the chunk is longer than the
        // overlap itself; otherwise continue right after the chunk.
        let advance = if split_at > CHUNK_OVERLAP_CHARS {
            split_at - CHUNK_OVERLAP_CHARS
        } else {
            split_at
        }
        .max(1);

        let previous_start = start;
        // Ensure start lands on a char boundary after overlap subtraction.
        start = floor_char_boundary(content, previous_start + advance);

        // Guarantee forward progress: multi-byte chars can cause
        // floor_char_boundary to round back to the previous start.
        if start <= previous_start {
            start = previous_start
                + content[previous_start..]
                    .chars()
                    .next()
                    .map_or(1, char::len_utf8);
        }

        chunk_index += 1;
    }

    chunks
}

/// Returns the offset just past the last blank line (`"\n\n"`) in `window`,
/// or `None` if there is none.
///
/// A single right-to-left search over the whole window already prefers the
/// latest break. Searching the last third and the first two thirds as two
/// separate slices gives the same answer, except that it cannot see a
/// `"\n\n"` lying across the point where the slices meet.
fn find_paragraph_break(window: &str) -> Option<usize> {
    window.rfind("\n\n").map(|pos| pos + 2)
}

/// Returns the offset just past a sentence terminator (`". "`, `"? "` or
/// `"! "`) in `window`, or `None` if none occurs.
///
/// Terminators starting in the second half of the window are preferred. Within
/// each pass the terminators are tried in the order listed, and the first one
/// that matches wins at its last occurrence, even if a different terminator
/// occurs later in the window.
fn find_sentence_break(window: &str) -> Option<usize> {
    const TERMINATORS: [&str; 3] = [". ", "? ", "! "];

    let midpoint = floor_char_boundary(window, window.len() / 2);

    // End offset of the last occurrence of `pat` that starts at or after
    // `min_pos`. Searching the whole window (not a sub-slice) means a
    // terminator lying across the midpoint is still seen by the fallback pass.
    let last_end_from = |pat: &str, min_pos: usize| {
        window
            .rfind(pat)
            .filter(|&pos| pos >= min_pos)
            .map(|pos| pos + pat.len())
    };

    TERMINATORS
        .iter()
        .find_map(|pat| last_end_from(pat, midpoint))
        .or_else(|| TERMINATORS.iter().find_map(|pat| last_end_from(pat, 0)))
}

/// Returns the offset just past the last space in `window`, or `None` if the
/// window contains no space.
fn find_word_break(window: &str) -> Option<usize> {
    window.rfind(' ').map(|pos| pos + 1)
}

/// Returns the largest char boundary of `s` that is `<= idx`.
///
/// An `idx` at or past the end of `s` is clamped to `s.len()`.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    // Offset 0 is always a char boundary, so the search cannot come up empty;
    // `unwrap_or(0)` only spells that out.
    (0..=idx)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0)
}

#[cfg(test)]
#[path = "chunking_tests.rs"]
mod tests;