feat(embedding): Add Ollama-powered vector embedding pipeline

Implements the embedding module that generates vector representations
of documents using a local Ollama instance with the nomic-embed-text
model. These embeddings enable semantic (vector) search and the hybrid
search mode that fuses lexical and semantic results via RRF.

Key components:

- embedding::ollama: HTTP client for the Ollama /api/embeddings
  endpoint. Handles connection errors with actionable error messages
  (OllamaUnavailable, OllamaModelNotFound) and validates response
  dimensions.

- embedding::chunking: Splits long documents into overlapping
  paragraph-aware chunks for embedding. Uses a configurable max token
  estimate (8192 default for nomic-embed-text) with 10% overlap to
  preserve cross-chunk context.

- embedding::chunk_ids: Encodes chunk identity as
  doc_id * 1000 + chunk_index for the embeddings table rowid. This
  allows vector search to map results back to documents and
  deduplicate by doc_id efficiently.

- embedding::change_detector: Compares document content_hash against
  stored embedding hashes to skip re-embedding unchanged documents,
  making incremental embedding runs fast.

- embedding::pipeline: Orchestrates the full embedding flow: detect
  changed documents, chunk them, call Ollama in configurable
  concurrency (default 4), store results. Supports --retry-failed
  to re-attempt previously failed embeddings.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-01-30 15:46:30 -05:00
parent 20edff4ab1
commit 723703bed9
6 changed files with 810 additions and 0 deletions

View File

@@ -0,0 +1,79 @@
//! Detect documents needing (re-)embedding based on content hash changes.
use rusqlite::Connection;
use crate::core::error::Result;
/// A document that needs embedding or re-embedding.
#[derive(Debug)]
pub struct PendingDocument {
pub document_id: i64,
pub content_text: String,
pub content_hash: String,
}
/// Find documents that need embedding: new (no metadata) or changed (hash mismatch).
///
/// Uses keyset pagination (WHERE d.id > last_id) and returns up to `page_size` results.
pub fn find_pending_documents(
conn: &Connection,
page_size: usize,
last_id: i64,
) -> Result<Vec<PendingDocument>> {
// Documents that either:
// 1. Have no embedding_metadata at all (new)
// 2. Have metadata where document_hash != content_hash (changed)
let sql = r#"
SELECT d.id, d.content_text, d.content_hash
FROM documents d
WHERE d.id > ?1
AND (
NOT EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
)
)
ORDER BY d.id
LIMIT ?2
"#;
let mut stmt = conn.prepare(sql)?;
let rows = stmt
.query_map(rusqlite::params![last_id, page_size as i64], |row| {
Ok(PendingDocument {
document_id: row.get(0)?,
content_text: row.get(1)?,
content_hash: row.get(2)?,
})
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
Ok(rows)
}
/// Count total documents that need embedding.
pub fn count_pending_documents(conn: &Connection) -> Result<i64> {
let count: i64 = conn.query_row(
r#"
SELECT COUNT(*)
FROM documents d
WHERE NOT EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
)
"#,
[],
|row| row.get(0),
)?;
Ok(count)
}