feat(embedding): Add Ollama-powered vector embedding pipeline

Implements the embedding module that generates vector representations of documents using a local Ollama instance with the nomic-embed-text model. These embeddings enable semantic (vector) search and the hybrid search mode that fuses lexical and semantic results via RRF.

Key components:

- embedding::ollama: HTTP client for the Ollama /api/embeddings endpoint. Handles connection errors with actionable error messages (OllamaUnavailable, OllamaModelNotFound) and validates response dimensions.
- embedding::chunking: Splits long documents into overlapping paragraph-aware chunks for embedding. Uses a configurable max token estimate (8192 default for nomic-embed-text) with 10% overlap to preserve cross-chunk context.
- embedding::chunk_ids: Encodes chunk identity as doc_id * 1000 + chunk_index for the embeddings table rowid. This allows vector search to map results back to documents and deduplicate by doc_id efficiently.
- embedding::change_detector: Compares document content_hash against stored embedding hashes to skip re-embedding unchanged documents, making incremental embedding runs fast.
- embedding::pipeline: Orchestrates the full embedding flow: detect changed documents, chunk them, call Ollama with configurable concurrency (default 4), and store the results. Supports --retry-failed to re-attempt previously failed embeddings.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 15:46:30 -05:00
parent 20edff4ab1
commit 723703bed9
6 changed files with 810 additions and 0 deletions
--- a/src/embedding/change_detector.rs
+++ b/src/embedding/change_detector.rs
@@ -0,0 +1,79 @@
+//! Detect documents needing (re-)embedding based on content hash changes.
+
+use rusqlite::Connection;
+
+use crate::core::error::Result;
+
+/// A document row (id, text, content hash) that needs embedding or re-embedding.
+#[derive(Debug)]
+pub struct PendingDocument {
+    pub document_id: i64, // read from `documents.id`
+    pub content_text: String, // read from `documents.content_text`
+    pub content_hash: String, // read from `documents.content_hash`
+}
+
+/// Return one page of documents that need embedding: new (no metadata) or changed (hash mismatch).
+///
+/// Keyset-paginated: rows satisfy `d.id > last_id`, are ordered by id, and number at most `page_size`.
+pub fn find_pending_documents(
+    conn: &Connection,
+    page_size: usize, // bound to `LIMIT ?2`
+    last_id: i64, // exclusive lower bound on `d.id` (`?1`)
+) -> Result<Vec<PendingDocument>> {
+    // A document is pending when either:
+    // 1. It has no embedding_metadata row with chunk_index = 0 (new)
+    // 2. Its chunk_index = 0 row has document_hash != content_hash (changed)
+    let sql = r#"
+        SELECT d.id, d.content_text, d.content_hash
+        FROM documents d
+        WHERE d.id > ?1
+          AND (
+            NOT EXISTS (
+                SELECT 1 FROM embedding_metadata em
+                WHERE em.document_id = d.id AND em.chunk_index = 0
+            )
+            OR EXISTS (
+                SELECT 1 FROM embedding_metadata em
+                WHERE em.document_id = d.id AND em.chunk_index = 0
+                  AND em.document_hash != d.content_hash
+            )
+          )
+        ORDER BY d.id
+        LIMIT ?2
+    "#;
+
+    let mut stmt = conn.prepare(sql)?;
+    let rows = stmt
+        .query_map(rusqlite::params![last_id, page_size as i64], |row| {
+            Ok(PendingDocument {
+                document_id: row.get(0)?,
+                content_text: row.get(1)?,
+                content_hash: row.get(2)?,
+            })
+        })?
+        .collect::<std::result::Result<Vec<_>, _>>()?; // the first row error fails the whole page
+
+    Ok(rows)
+}
+
+/// Count all documents that are new or changed, with no id bound or page limit applied.
+pub fn count_pending_documents(conn: &Connection) -> Result<i64> {
+    let count: i64 = conn.query_row(
+        r#"
+        SELECT COUNT(*)
+        FROM documents d
+        WHERE NOT EXISTS (
+            SELECT 1 FROM embedding_metadata em
+            WHERE em.document_id = d.id AND em.chunk_index = 0
+        )
+        OR EXISTS (
+            SELECT 1 FROM embedding_metadata em
+            WHERE em.document_id = d.id AND em.chunk_index = 0
+              AND em.document_hash != d.content_hash
+        )
+        "#,
+        [], // the query has no bound parameters
+        |row| row.get(0), // column 0 is the COUNT(*) value
+    )?;
+    Ok(count)
+}