feat(embedding): Add Ollama-powered vector embedding pipeline
Implements the embedding module that generates vector representations of documents using a local Ollama instance with the nomic-embed-text model. These embeddings enable semantic (vector) search and the hybrid search mode that fuses lexical and semantic results via RRF. Key components: - embedding::ollama: HTTP client for the Ollama /api/embeddings endpoint. Handles connection errors with actionable error messages (OllamaUnavailable, OllamaModelNotFound) and validates response dimensions. - embedding::chunking: Splits long documents into overlapping paragraph-aware chunks for embedding. Uses a configurable max token estimate (8192 default for nomic-embed-text) with 10% overlap to preserve cross-chunk context. - embedding::chunk_ids: Encodes chunk identity as doc_id * 1000 + chunk_index for the embeddings table rowid. This allows vector search to map results back to documents and deduplicate by doc_id efficiently. - embedding::change_detector: Compares document content_hash against stored embedding hashes to skip re-embedding unchanged documents, making incremental embedding runs fast. - embedding::pipeline: Orchestrates the full embedding flow: detect changed documents, chunk them, call Ollama in configurable concurrency (default 4), store results. Supports --retry-failed to re-attempt previously failed embeddings. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
79
src/embedding/change_detector.rs
Normal file
79
src/embedding/change_detector.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
//! Detect documents needing (re-)embedding based on content hash changes.
|
||||
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
|
||||
/// A document that needs embedding or re-embedding.
#[derive(Debug)]
pub struct PendingDocument {
    // Rowid of the row in the `documents` table (selected as `d.id`).
    pub document_id: i64,
    // Full document text (`d.content_text`); the pipeline chunks and embeds this.
    pub content_text: String,
    // Current hash of the content (`d.content_hash`); compared against
    // `embedding_metadata.document_hash` to decide whether re-embedding is needed.
    pub content_hash: String,
}
|
||||
|
||||
/// Find documents that need embedding: new (no metadata) or changed (hash mismatch).
|
||||
///
|
||||
/// Uses keyset pagination (WHERE d.id > last_id) and returns up to `page_size` results.
|
||||
pub fn find_pending_documents(
|
||||
conn: &Connection,
|
||||
page_size: usize,
|
||||
last_id: i64,
|
||||
) -> Result<Vec<PendingDocument>> {
|
||||
// Documents that either:
|
||||
// 1. Have no embedding_metadata at all (new)
|
||||
// 2. Have metadata where document_hash != content_hash (changed)
|
||||
let sql = r#"
|
||||
SELECT d.id, d.content_text, d.content_hash
|
||||
FROM documents d
|
||||
WHERE d.id > ?1
|
||||
AND (
|
||||
NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
)
|
||||
ORDER BY d.id
|
||||
LIMIT ?2
|
||||
"#;
|
||||
|
||||
let mut stmt = conn.prepare(sql)?;
|
||||
let rows = stmt
|
||||
.query_map(rusqlite::params![last_id, page_size as i64], |row| {
|
||||
Ok(PendingDocument {
|
||||
document_id: row.get(0)?,
|
||||
content_text: row.get(1)?,
|
||||
content_hash: row.get(2)?,
|
||||
})
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
/// Count total documents that need embedding.
|
||||
pub fn count_pending_documents(conn: &Connection) -> Result<i64> {
|
||||
let count: i64 = conn.query_row(
|
||||
r#"
|
||||
SELECT COUNT(*)
|
||||
FROM documents d
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
"#,
|
||||
[],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
Ok(count)
|
||||
}
|
||||
Reference in New Issue
Block a user