feat(embedding): Add Ollama-powered vector embedding pipeline

Implements the embedding module that generates vector representations
of documents using a local Ollama instance with the nomic-embed-text
model. These embeddings enable semantic (vector) search and the hybrid
search mode that fuses lexical and semantic results via RRF.

Key components:

- embedding::ollama: HTTP client for the Ollama /api/embeddings
  endpoint. Handles connection errors with actionable error messages
  (OllamaUnavailable, OllamaModelNotFound) and validates response
  dimensions.

- embedding::chunking: Splits long documents into overlapping
  paragraph-aware chunks for embedding. Uses a configurable max token
  estimate (8192 default for nomic-embed-text) with 10% overlap to
  preserve cross-chunk context.

- embedding::chunk_ids: Encodes chunk identity as
  doc_id * 1000 + chunk_index for the embeddings table rowid. This
  allows vector search to map results back to documents and
  deduplicate by doc_id efficiently.

- embedding::change_detector: Compares document content_hash against
  stored embedding hashes to skip re-embedding unchanged documents,
  making incremental embedding runs fast.

- embedding::pipeline: Orchestrates the full embedding flow: detect
  changed documents, chunk them, call Ollama in configurable
  concurrency (default 4), store results. Supports --retry-failed
  to re-attempt previously failed embeddings.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:46:30 -05:00
parent 20edff4ab1
commit 723703bed9
6 changed files with 810 additions and 0 deletions

207
src/embedding/chunking.rs Normal file
View File

@@ -0,0 +1,207 @@
//! Text chunking for embedding: split documents at paragraph boundaries with overlap.
/// Maximum bytes per chunk.
/// Named `_BYTES` because `str::len()` returns byte count; multi-byte UTF-8
/// sequences mean byte length ≥ char count.
pub const CHUNK_MAX_BYTES: usize = 32_000;
/// Character overlap between adjacent chunks.
pub const CHUNK_OVERLAP_CHARS: usize = 500;
/// Split document content into chunks suitable for embedding.
///
/// Documents <= CHUNK_MAX_BYTES produce a single chunk.
/// Longer documents are split at paragraph boundaries (`\n\n`), falling back
/// to sentence boundaries, then word boundaries, then hard character cut.
/// Adjacent chunks share CHUNK_OVERLAP_CHARS of overlap.
///
/// Returns Vec<(chunk_index, chunk_text)>.
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
if content.is_empty() {
return Vec::new();
}
if content.len() <= CHUNK_MAX_BYTES {
return vec![(0, content.to_string())];
}
let mut chunks: Vec<(usize, String)> = Vec::new();
let mut start = 0;
let mut chunk_index = 0;
while start < content.len() {
let remaining = &content[start..];
if remaining.len() <= CHUNK_MAX_BYTES {
chunks.push((chunk_index, remaining.to_string()));
break;
}
// Find a split point within CHUNK_MAX_BYTES (char-boundary-safe)
let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
let window = &content[start..end];
// Try paragraph boundary (\n\n) — search backward from end
let split_at = find_paragraph_break(window)
.or_else(|| find_sentence_break(window))
.or_else(|| find_word_break(window))
.unwrap_or(window.len());
let chunk_text = &content[start..start + split_at];
chunks.push((chunk_index, chunk_text.to_string()));
// Advance with overlap, guaranteeing forward progress to prevent infinite loops.
// If split_at <= CHUNK_OVERLAP_CHARS we skip overlap to avoid stalling.
// The .max(1) ensures we always advance at least 1 byte.
let advance = if split_at > CHUNK_OVERLAP_CHARS {
split_at - CHUNK_OVERLAP_CHARS
} else {
split_at
}
.max(1);
start += advance;
chunk_index += 1;
}
chunks
}
/// Find the last paragraph break (`\n\n`) in the window, preferring the
/// last third for balanced chunks.
fn find_paragraph_break(window: &str) -> Option<usize> {
// Search backward from 2/3 of the way through to find a good split
let search_start = window.len() * 2 / 3;
window[search_start..].rfind("\n\n").map(|pos| search_start + pos + 2)
.or_else(|| window[..search_start].rfind("\n\n").map(|pos| pos + 2))
}
/// Find the last sentence boundary (`. `, `? `, `! `) in the window.
fn find_sentence_break(window: &str) -> Option<usize> {
let search_start = window.len() / 2;
for pat in &[". ", "? ", "! "] {
if let Some(pos) = window[search_start..].rfind(pat) {
return Some(search_start + pos + pat.len());
}
}
// Try first half
for pat in &[". ", "? ", "! "] {
if let Some(pos) = window[..search_start].rfind(pat) {
return Some(pos + pat.len());
}
}
None
}
/// Find the last word boundary (space) in the window.
fn find_word_break(window: &str) -> Option<usize> {
let search_start = window.len() / 2;
window[search_start..].rfind(' ').map(|pos| search_start + pos + 1)
.or_else(|| window[..search_start].rfind(' ').map(|pos| pos + 1))
}
/// Find the largest byte index <= `idx` that is a valid char boundary in `s`.
/// Equivalent to `str::floor_char_boundary` (stabilized in Rust 1.82).
fn floor_char_boundary(s: &str, idx: usize) -> usize {
if idx >= s.len() {
return s.len();
}
let mut i = idx;
while i > 0 && !s.is_char_boundary(i) {
i -= 1;
}
i
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty_content() {
let chunks = split_into_chunks("");
assert!(chunks.is_empty());
}
#[test]
fn test_short_document_single_chunk() {
let content = "Short document content.";
let chunks = split_into_chunks(content);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].0, 0);
assert_eq!(chunks[0].1, content);
}
#[test]
fn test_exactly_max_chars() {
let content = "a".repeat(CHUNK_MAX_BYTES);
let chunks = split_into_chunks(&content);
assert_eq!(chunks.len(), 1);
}
#[test]
fn test_long_document_multiple_chunks() {
// Create content > CHUNK_MAX_BYTES with paragraph boundaries
let paragraph = "This is a paragraph of text.\n\n";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES * 2 {
content.push_str(paragraph);
}
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2, "Expected multiple chunks, got {}", chunks.len());
// Verify indices are sequential
for (i, (idx, _)) in chunks.iter().enumerate() {
assert_eq!(*idx, i);
}
// Verify all content is covered (no gaps)
assert!(!chunks.last().unwrap().1.is_empty());
}
#[test]
fn test_chunk_overlap() {
// Create content that will produce 2+ chunks
let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
let mut content = String::new();
while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
content.push_str(paragraph);
}
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
// Check that adjacent chunks share some content (overlap)
if chunks.len() >= 2 {
let end_of_first = &chunks[0].1;
let start_of_second = &chunks[1].1;
// The end of first chunk should overlap with start of second
let overlap_region = &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
assert!(
start_of_second.starts_with(overlap_region)
|| overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
"Expected overlap between chunks"
);
}
}
#[test]
fn test_no_paragraph_boundary() {
// Create content without paragraph breaks
let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
let chunks = split_into_chunks(&content);
assert!(chunks.len() >= 2);
// Should still split (at word boundaries)
for (_, chunk) in &chunks {
assert!(!chunk.is_empty());
}
}
#[test]
fn test_chunk_indices_sequential() {
let content = "a ".repeat(CHUNK_MAX_BYTES);
let chunks = split_into_chunks(&content);
for (i, (idx, _)) in chunks.iter().enumerate() {
assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
}
}
}