feat(embedding): Add Ollama-powered vector embedding pipeline
Implements the embedding module that generates vector representations of documents using a local Ollama instance with the nomic-embed-text model. These embeddings enable semantic (vector) search and the hybrid search mode that fuses lexical and semantic results via RRF.

Key components:

- embedding::ollama: HTTP client for the Ollama /api/embeddings endpoint. Handles connection errors with actionable error messages (OllamaUnavailable, OllamaModelNotFound) and validates response dimensions.
- embedding::chunking: Splits long documents into overlapping, paragraph-aware chunks for embedding. Uses a configurable max token estimate (8192 default for nomic-embed-text) with 10% overlap to preserve cross-chunk context.
- embedding::chunk_ids: Encodes chunk identity as doc_id * 1000 + chunk_index for the embeddings table rowid, so vector search can map results back to documents and deduplicate by doc_id efficiently (see the sketch below).
- embedding::change_detector: Compares each document's content_hash against the stored embedding hash to skip re-embedding unchanged documents, keeping incremental embedding runs fast.
- embedding::pipeline: Orchestrates the full embedding flow: detect changed documents, chunk them, call Ollama with configurable concurrency (default 4), and store the results. Supports --retry-failed to re-attempt previously failed embeddings.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
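A minimal sketch of the chunk-id scheme described above (doc_id * 1000 + chunk_index), assuming 64-bit ids and fewer than 1000 chunks per document; the names below are illustrative and not necessarily the actual embedding::chunk_ids API:

// Illustrative sketch only; the real embedding::chunk_ids may differ.
const CHUNKS_PER_DOC: i64 = 1000;

/// Map (doc_id, chunk_index) to a single embeddings-table rowid.
fn encode_chunk_id(doc_id: i64, chunk_index: i64) -> i64 {
    debug_assert!((0..CHUNKS_PER_DOC).contains(&chunk_index));
    doc_id * CHUNKS_PER_DOC + chunk_index
}

/// Recover (doc_id, chunk_index) from a rowid returned by vector search,
/// so hits can be mapped back to documents and deduplicated by doc_id.
fn decode_chunk_id(rowid: i64) -> (i64, i64) {
    (rowid / CHUNKS_PER_DOC, rowid % CHUNKS_PER_DOC)
}

fn main() {
    assert_eq!(encode_chunk_id(42, 3), 42_003);
    assert_eq!(decode_chunk_id(42_003), (42, 3));
}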
src/embedding/chunking.rs (new file, 207 lines)
@@ -0,0 +1,207 @@
//! Text chunking for embedding: split documents at paragraph boundaries with overlap.

/// Maximum bytes per chunk.
/// Named `_BYTES` because `str::len()` returns byte count; multi-byte UTF-8
/// sequences mean byte length ≥ char count.
pub const CHUNK_MAX_BYTES: usize = 32_000;

/// Character overlap between adjacent chunks.
pub const CHUNK_OVERLAP_CHARS: usize = 500;

/// Split document content into chunks suitable for embedding.
///
/// Documents <= CHUNK_MAX_BYTES produce a single chunk.
/// Longer documents are split at paragraph boundaries (`\n\n`), falling back
/// to sentence boundaries, then word boundaries, then hard character cut.
/// Adjacent chunks share CHUNK_OVERLAP_CHARS of overlap.
///
/// Returns Vec<(chunk_index, chunk_text)>.
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
    if content.is_empty() {
        return Vec::new();
    }

    if content.len() <= CHUNK_MAX_BYTES {
        return vec![(0, content.to_string())];
    }

    let mut chunks: Vec<(usize, String)> = Vec::new();
    let mut start = 0;
    let mut chunk_index = 0;

    while start < content.len() {
        let remaining = &content[start..];
        if remaining.len() <= CHUNK_MAX_BYTES {
            chunks.push((chunk_index, remaining.to_string()));
            break;
        }

        // Find a split point within CHUNK_MAX_BYTES (char-boundary-safe)
        let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
        let window = &content[start..end];

        // Try paragraph boundary (\n\n) first, searching backward from the end
        let split_at = find_paragraph_break(window)
            .or_else(|| find_sentence_break(window))
            .or_else(|| find_word_break(window))
            .unwrap_or(window.len());

        let chunk_text = &content[start..start + split_at];
        chunks.push((chunk_index, chunk_text.to_string()));

        // Advance with overlap, guaranteeing forward progress to prevent infinite loops.
        // If split_at <= CHUNK_OVERLAP_CHARS we skip overlap to avoid stalling.
        // floor_char_boundary keeps the next start on a UTF-8 char boundary, and
        // the .max(1) ensures we always advance at least 1 byte.
        let advance = if split_at > CHUNK_OVERLAP_CHARS {
            floor_char_boundary(window, split_at - CHUNK_OVERLAP_CHARS)
        } else {
            split_at
        }
        .max(1);
        start += advance;
        chunk_index += 1;
    }

    chunks
}

/// Find the last paragraph break (`\n\n`) in the window, preferring the
/// last third for balanced chunks.
fn find_paragraph_break(window: &str) -> Option<usize> {
    // Search backward from 2/3 of the way through to find a good split.
    // Snap search_start to a char boundary so slicing cannot panic on multi-byte UTF-8.
    let search_start = floor_char_boundary(window, window.len() * 2 / 3);
    window[search_start..].rfind("\n\n").map(|pos| search_start + pos + 2)
        .or_else(|| window[..search_start].rfind("\n\n").map(|pos| pos + 2))
}

/// Find the last sentence boundary (`. `, `? `, `! `) in the window.
fn find_sentence_break(window: &str) -> Option<usize> {
    let search_start = floor_char_boundary(window, window.len() / 2);
    for pat in &[". ", "? ", "! "] {
        if let Some(pos) = window[search_start..].rfind(pat) {
            return Some(search_start + pos + pat.len());
        }
    }
    // Try the first half
    for pat in &[". ", "? ", "! "] {
        if let Some(pos) = window[..search_start].rfind(pat) {
            return Some(pos + pat.len());
        }
    }
    None
}

/// Find the last word boundary (space) in the window.
fn find_word_break(window: &str) -> Option<usize> {
    let search_start = floor_char_boundary(window, window.len() / 2);
    window[search_start..].rfind(' ').map(|pos| search_start + pos + 1)
        .or_else(|| window[..search_start].rfind(' ').map(|pos| pos + 1))
}

/// Find the largest byte index <= `idx` that is a valid char boundary in `s`.
/// Hand-rolled equivalent of `str::floor_char_boundary`, which is still unstable at the time of writing.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    let mut i = idx;
    while i > 0 && !s.is_char_boundary(i) {
        i -= 1;
    }
    i
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_content() {
        let chunks = split_into_chunks("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_short_document_single_chunk() {
        let content = "Short document content.";
        let chunks = split_into_chunks(content);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].0, 0);
        assert_eq!(chunks[0].1, content);
    }

    #[test]
    fn test_exactly_max_chars() {
        let content = "a".repeat(CHUNK_MAX_BYTES);
        let chunks = split_into_chunks(&content);
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_long_document_multiple_chunks() {
        // Create content > CHUNK_MAX_BYTES with paragraph boundaries
        let paragraph = "This is a paragraph of text.\n\n";
        let mut content = String::new();
        while content.len() < CHUNK_MAX_BYTES * 2 {
            content.push_str(paragraph);
        }

        let chunks = split_into_chunks(&content);
        assert!(chunks.len() >= 2, "Expected multiple chunks, got {}", chunks.len());

        // Verify indices are sequential
        for (i, (idx, _)) in chunks.iter().enumerate() {
            assert_eq!(*idx, i);
        }

        // Verify the final chunk is non-empty, i.e. the content runs through to the end
        assert!(!chunks.last().unwrap().1.is_empty());
    }

    #[test]
    fn test_chunk_overlap() {
        // Create content that will produce 2+ chunks
        let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
        let mut content = String::new();
        while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
            content.push_str(paragraph);
        }

        let chunks = split_into_chunks(&content);
        assert!(chunks.len() >= 2);

        // Check that adjacent chunks share some content (overlap)
        if chunks.len() >= 2 {
            let end_of_first = &chunks[0].1;
            let start_of_second = &chunks[1].1;
            // The end of first chunk should overlap with start of second
            let overlap_region = &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
            assert!(
                start_of_second.starts_with(overlap_region)
                    || overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
                "Expected overlap between chunks"
            );
        }
    }

    #[test]
    fn test_no_paragraph_boundary() {
        // Create content without paragraph breaks
        let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
        let chunks = split_into_chunks(&content);
        assert!(chunks.len() >= 2);
        // Should still split (at word boundaries)
        for (_, chunk) in &chunks {
            assert!(!chunk.is_empty());
        }
    }

    #[test]
    fn test_chunk_indices_sequential() {
        let content = "a ".repeat(CHUNK_MAX_BYTES);
        let chunks = split_into_chunks(&content);
        for (i, (idx, _)) in chunks.iter().enumerate() {
            assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
        }
    }
}
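For orientation, a rough sketch of how the embedding pipeline described in the commit message might drive split_into_chunks; everything except split_into_chunks and the doc_id * 1000 + chunk_index rowid scheme is illustrative and not part of this commit:

// Illustrative only: embed_chunk stands in for the real Ollama /api/embeddings call,
// and the rowid arithmetic mirrors the embedding::chunk_ids scheme described above.
fn embed_document(doc_id: i64, content: &str) -> Vec<(i64, Vec<f32>)> {
    split_into_chunks(content)
        .into_iter()
        .map(|(chunk_index, text)| {
            let rowid = doc_id * 1000 + chunk_index as i64; // doc_id * 1000 + chunk_index
            let vector = embed_chunk(&text);                // would POST to Ollama in the real pipeline
            (rowid, vector)
        })
        .collect()
}

// Placeholder embedding call; the real client validates response dimensions.
fn embed_chunk(_text: &str) -> Vec<f32> {
    vec![0.0; 768] // nomic-embed-text typically returns 768-dimensional vectors
}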