feat(embedding): Add Ollama-powered vector embedding pipeline
Implements the embedding module that generates vector representations of documents using a local Ollama instance with the nomic-embed-text model. These embeddings enable semantic (vector) search and the hybrid search mode that fuses lexical and semantic results via RRF. Key components: - embedding::ollama: HTTP client for the Ollama /api/embeddings endpoint. Handles connection errors with actionable error messages (OllamaUnavailable, OllamaModelNotFound) and validates response dimensions. - embedding::chunking: Splits long documents into overlapping paragraph-aware chunks for embedding. Uses a configurable max token estimate (8192 default for nomic-embed-text) with 10% overlap to preserve cross-chunk context. - embedding::chunk_ids: Encodes chunk identity as doc_id * 1000 + chunk_index for the embeddings table rowid. This allows vector search to map results back to documents and deduplicate by doc_id efficiently. - embedding::change_detector: Compares document content_hash against stored embedding hashes to skip re-embedding unchanged documents, making incremental embedding runs fast. - embedding::pipeline: Orchestrates the full embedding flow: detect changed documents, chunk them, call Ollama with configurable concurrency (default 4), store results. Supports --retry-failed to re-attempt previously failed embeddings. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
63
src/embedding/chunk_ids.rs
Normal file
63
src/embedding/chunk_ids.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
/// Multiplier for encoding (document_id, chunk_index) into a single rowid.
///
/// Supports up to 1000 chunks per document (32M chars at 32k/chunk).
/// Every valid `chunk_index` must lie in `0..CHUNK_ROWID_MULTIPLIER`;
/// a larger index would collide with the next document's rowid range.
pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000;

/// Encode (document_id, chunk_index) into a sqlite-vec rowid.
///
/// rowid = document_id * CHUNK_ROWID_MULTIPLIER + chunk_index
///
/// # Panics
///
/// In debug builds, panics when `chunk_index` is outside
/// `0..CHUNK_ROWID_MULTIPLIER` or `document_id` is negative. Such inputs
/// would produce a rowid that decodes to a *different* document (e.g.
/// `encode_rowid(1, 1500)` would decode as document 2, chunk 500),
/// silently corrupting the rowid -> document mapping used by vector search.
pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
    debug_assert!(
        (0..CHUNK_ROWID_MULTIPLIER).contains(&chunk_index),
        "chunk_index {chunk_index} out of range 0..{CHUNK_ROWID_MULTIPLIER}"
    );
    debug_assert!(
        document_id >= 0,
        "document_id must be non-negative, got {document_id}"
    );
    document_id * CHUNK_ROWID_MULTIPLIER + chunk_index
}

/// Decode a sqlite-vec rowid back into (document_id, chunk_index).
///
/// Inverse of [`encode_rowid`] for rowids produced by it. Only meaningful
/// for non-negative rowids: Rust's `/` and `%` truncate toward zero, so a
/// negative rowid (which `encode_rowid` never produces) would not decode
/// to a valid (document_id, chunk_index) pair.
pub fn decode_rowid(rowid: i64) -> (i64, i64) {
    let document_id = rowid / CHUNK_ROWID_MULTIPLIER;
    let chunk_index = rowid % CHUNK_ROWID_MULTIPLIER;
    (document_id, chunk_index)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Known (document_id, chunk_index) -> rowid pairs.
    #[test]
    fn encode_produces_expected_rowids() {
        assert_eq!(encode_rowid(1, 0), 1000);
        assert_eq!(encode_rowid(1, 5), 1005);
        assert_eq!(encode_rowid(42, 0), 42000);
        assert_eq!(encode_rowid(42, 5), 42005);
    }

    /// Decoding a chunk-zero rowid recovers both components.
    #[test]
    fn decode_recovers_components() {
        assert_eq!(decode_rowid(42000), (42, 0));
    }

    /// decode(encode(..)) is the identity across a spread of ids,
    /// including the boundary chunk index 999.
    #[test]
    fn encode_decode_roundtrip() {
        for doc_id in [0, 1, 42, 100, 999, 10000] {
            for chunk_idx in [0, 1, 5, 99, 999] {
                assert_eq!(
                    decode_rowid(encode_rowid(doc_id, chunk_idx)),
                    (doc_id, chunk_idx),
                    "Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}"
                );
            }
        }
    }

    /// Pin the multiplier: changing it would invalidate stored rowids.
    #[test]
    fn multiplier_is_one_thousand() {
        assert_eq!(CHUNK_ROWID_MULTIPLIER, 1000);
    }
}
|
||||
Reference in New Issue
Block a user