feat(embedding): Add Ollama-powered vector embedding pipeline
Implements the embedding module that generates vector representations of documents using a local Ollama instance with the nomic-embed-text model. These embeddings enable semantic (vector) search and the hybrid search mode that fuses lexical and semantic results via RRF. Key components: - embedding::ollama: HTTP client for the Ollama /api/embeddings endpoint. Handles connection errors with actionable error messages (OllamaUnavailable, OllamaModelNotFound) and validates response dimensions. - embedding::chunking: Splits long documents into overlapping paragraph-aware chunks for embedding. Uses a configurable max token estimate (8192 default for nomic-embed-text) with 10% overlap to preserve cross-chunk context. - embedding::chunk_ids: Encodes chunk identity as doc_id * 1000 + chunk_index for the embeddings table rowid. This allows vector search to map results back to documents and deduplicate by doc_id efficiently. - embedding::change_detector: Compares document content_hash against stored embedding hashes to skip re-embedding unchanged documents, making incremental embedding runs fast. - embedding::pipeline: Orchestrates the full embedding flow: detect changed documents, chunk them, call Ollama with configurable concurrency (default 4), store results. Supports --retry-failed to re-attempt previously failed embeddings. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
63
src/embedding/chunk_ids.rs
Normal file
63
src/embedding/chunk_ids.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
/// Multiplier for encoding (document_id, chunk_index) into a single rowid.
///
/// Supports up to 1000 chunks per document (32M chars at 32k/chunk).
/// Every valid `chunk_index` must lie in `0..CHUNK_ROWID_MULTIPLIER`;
/// a larger index would collide with the next document's rowid range.
pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000;

/// Encode (document_id, chunk_index) into a sqlite-vec rowid.
///
/// rowid = document_id * CHUNK_ROWID_MULTIPLIER + chunk_index
///
/// # Panics
///
/// In debug builds, panics when `chunk_index` is outside
/// `0..CHUNK_ROWID_MULTIPLIER` or `document_id` is negative. Such inputs
/// would produce a rowid that decodes to a *different* document (e.g.
/// `encode_rowid(1, 1500)` would decode as document 2, chunk 500),
/// silently corrupting the rowid -> document mapping used by vector search.
pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
    debug_assert!(
        (0..CHUNK_ROWID_MULTIPLIER).contains(&chunk_index),
        "chunk_index {chunk_index} out of range 0..{CHUNK_ROWID_MULTIPLIER}"
    );
    debug_assert!(
        document_id >= 0,
        "document_id must be non-negative, got {document_id}"
    );
    document_id * CHUNK_ROWID_MULTIPLIER + chunk_index
}

/// Decode a sqlite-vec rowid back into (document_id, chunk_index).
///
/// Inverse of [`encode_rowid`] for rowids produced by it. Only meaningful
/// for non-negative rowids: Rust's `/` and `%` truncate toward zero, so a
/// negative rowid (which `encode_rowid` never produces) would not decode
/// to a valid (document_id, chunk_index) pair.
pub fn decode_rowid(rowid: i64) -> (i64, i64) {
    let document_id = rowid / CHUNK_ROWID_MULTIPLIER;
    let chunk_index = rowid % CHUNK_ROWID_MULTIPLIER;
    (document_id, chunk_index)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Known (document_id, chunk_index) -> rowid pairs.
    #[test]
    fn encode_produces_expected_rowids() {
        assert_eq!(encode_rowid(1, 0), 1000);
        assert_eq!(encode_rowid(1, 5), 1005);
        assert_eq!(encode_rowid(42, 0), 42000);
        assert_eq!(encode_rowid(42, 5), 42005);
    }

    /// Decoding a chunk-zero rowid recovers both components.
    #[test]
    fn decode_recovers_components() {
        assert_eq!(decode_rowid(42000), (42, 0));
    }

    /// decode(encode(..)) is the identity across a spread of ids,
    /// including the boundary chunk index 999.
    #[test]
    fn encode_decode_roundtrip() {
        for doc_id in [0, 1, 42, 100, 999, 10000] {
            for chunk_idx in [0, 1, 5, 99, 999] {
                assert_eq!(
                    decode_rowid(encode_rowid(doc_id, chunk_idx)),
                    (doc_id, chunk_idx),
                    "Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}"
                );
            }
        }
    }

    /// Pin the multiplier: changing it would invalidate stored rowids.
    #[test]
    fn multiplier_is_one_thousand() {
        assert_eq!(CHUNK_ROWID_MULTIPLIER, 1000);
    }
}
|
||||
Reference in New Issue
Block a user