gitlore/src/embedding/chunking.rs

/// Upper bound, in bytes, on the window `split_into_chunks` examines for each chunk.
/// Content whose byte length is at or below this value is returned as a single chunk.
pub const CHUNK_MAX_BYTES: usize = 1_500;

/// NOTE(review): not referenced anywhere in this file — presumably the expected
/// dimensionality of an embedding vector; confirm against the code that consumes it.
pub const EXPECTED_DIMS: usize = 768;

/// How much of the tail of one chunk is repeated at the start of the next one.
///
/// Despite the `_CHARS` suffix, `split_into_chunks` subtracts this value from a *byte*
/// offset (and then rounds down to a char boundary), so it is effectively a byte count.
pub const CHUNK_OVERLAP_CHARS: usize = 200;

/// Splits `content` into overlapping chunks whose byte length never exceeds
/// [`CHUNK_MAX_BYTES`].
///
/// Returns `(chunk_index, chunk_text)` pairs; indices start at 0 and increase by one.
///
/// * Empty input yields an empty vector.
/// * Input of at most [`CHUNK_MAX_BYTES`] bytes yields exactly one chunk.
/// * Longer input is cut window by window. Inside each window the split point is, in order
///   of preference, a paragraph break, a sentence break, a word break, or (failing all of
///   those) the end of the window.
/// * After each chunk the cursor steps back by [`CHUNK_OVERLAP_CHARS`] bytes (rounded down
///   to a char boundary) so neighbouring chunks share context.
///
/// All slicing happens on UTF-8 char boundaries, so multi-byte text never causes a panic.
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
    if content.is_empty() {
        return Vec::new();
    }

    if content.len() <= CHUNK_MAX_BYTES {
        return vec![(0, content.to_string())];
    }

    let mut chunks: Vec<(usize, String)> = Vec::new();
    let mut start = 0;
    let mut chunk_index = 0;
    // Number of bytes at the front of the current window that the previous chunk already
    // contains (the overlap carried over from the last step-back).
    let mut carried = 0;

    while start < content.len() {
        let remaining = &content[start..];
        if remaining.len() <= CHUNK_MAX_BYTES {
            chunks.push((chunk_index, remaining.to_string()));
            break;
        }

        let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
        let window = &content[start..end];

        // A break that lies inside the carried-over prefix would produce a chunk made up
        // entirely of text the previous chunk already holds (typically the very break that
        // ended the previous chunk, rediscovered by a helper's fallback search). Such
        // candidates are discarded so every chunk contributes at least one new byte.
        let split_at = find_paragraph_break(window)
            .filter(|&pos| pos > carried)
            .or_else(|| find_sentence_break(window).filter(|&pos| pos > carried))
            .or_else(|| find_word_break(window).filter(|&pos| pos > carried))
            .unwrap_or(window.len());

        let chunk_end = start + split_at;
        chunks.push((chunk_index, content[start..chunk_end].to_string()));

        // Step forward, leaving an overlap when the chunk is long enough to afford one.
        let advance = if split_at > CHUNK_OVERLAP_CHARS {
            split_at - CHUNK_OVERLAP_CHARS
        } else {
            split_at
        }
        .max(1);
        let old_start = start;
        // The overlap subtraction works in bytes, so snap back onto a char boundary.
        start = floor_char_boundary(content, old_start + advance);
        // Guarantee forward progress: rounding down inside a multi-byte char can land on
        // (or before) the previous start, in which case skip exactly one character.
        if start <= old_start {
            start = old_start
                + content[old_start..]
                    .chars()
                    .next()
                    .map_or(1, |c| c.len_utf8());
        }
        carried = chunk_end.saturating_sub(start);
        chunk_index += 1;
    }

    chunks
}

/// Finds the byte offset just past the preferred blank-line (`"\n\n"`) separator in `window`.
///
/// The last third of the window is searched first so chunks stay as large as possible; if
/// nothing is found there, the latest separator anywhere earlier in the window is used.
/// Returns `None` when the window contains no blank line at all.
fn find_paragraph_break(window: &str) -> Option<usize> {
    const SEPARATOR: &str = "\n\n";

    let search_start = floor_char_boundary(window, window.len() * 2 / 3);
    window[search_start..]
        .rfind(SEPARATOR)
        .map(|pos| search_start + pos)
        // Fall back to the whole window rather than `window[..search_start]`: a separator
        // whose two newlines sit on either side of `search_start` is invisible to both
        // halves when they are searched separately.
        .or_else(|| window.rfind(SEPARATOR))
        .map(|pos| pos + SEPARATOR.len())
}

/// Finds the byte offset just past the preferred sentence terminator (`". "`, `"? "` or
/// `"! "`) in `window`.
///
/// The second half of the window is searched first so chunks stay as large as possible; if
/// it holds no terminator, the latest one anywhere earlier in the window is used. Within a
/// search region the *latest* terminator wins regardless of which punctuation mark it is.
/// Returns `None` when the window contains no terminator.
fn find_sentence_break(window: &str) -> Option<usize> {
    const TERMINATORS: [&str; 3] = [". ", "? ", "! "];

    // End offset (relative to `region`) of the last terminator of any kind in `region`.
    let latest_break = |region: &str| -> Option<usize> {
        TERMINATORS
            .iter()
            .filter_map(|pat| region.rfind(*pat).map(|pos| pos + pat.len()))
            .max()
    };

    let search_start = floor_char_boundary(window, window.len() / 2);
    latest_break(&window[search_start..])
        .map(|end| search_start + end)
        // Fall back to the whole window rather than `window[..search_start]` so that a
        // terminator straddling `search_start` is not missed by both searches.
        .or_else(|| latest_break(window))
}

/// Finds the byte offset just past the preferred space character in `window`.
///
/// The second half of the window is examined first; only when it holds no space is the
/// first half consulted. Returns `None` when the window contains no space.
fn find_word_break(window: &str) -> Option<usize> {
    let midpoint = floor_char_boundary(window, window.len() / 2);
    let (front, back) = window.split_at(midpoint);

    if let Some(offset) = back.rfind(' ') {
        return Some(midpoint + offset + 1);
    }
    front.rfind(' ').map(|offset| offset + 1)
}

/// Returns the largest UTF-8 char boundary in `s` that is less than or equal to `idx`.
///
/// An `idx` at or beyond the end of the string is clamped to `s.len()`.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    // Offset 0 is always a boundary, so the search cannot come up empty; the fallback only
    // exists to keep the expression total.
    (0..=idx)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}

// Unit tests are kept in the sibling file `chunking_tests.rs` and are only compiled for
// test builds.
#[cfg(test)]
#[path = "chunking_tests.rs"]
mod tests;