refactor(structure): reorganize codebase into domain-focused modules
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
||||
use crate::embedding::chunks::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PendingDocument {
|
||||
|
||||
177
src/embedding/chunks.rs
Normal file
177
src/embedding/chunks.rs
Normal file
@@ -0,0 +1,177 @@
|
||||
pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000;
|
||||
|
||||
pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
|
||||
assert!(
|
||||
(0..CHUNK_ROWID_MULTIPLIER).contains(&chunk_index),
|
||||
"chunk_index {chunk_index} out of range [0, {CHUNK_ROWID_MULTIPLIER})"
|
||||
);
|
||||
document_id
|
||||
.checked_mul(CHUNK_ROWID_MULTIPLIER)
|
||||
.and_then(|v| v.checked_add(chunk_index))
|
||||
.unwrap_or_else(|| {
|
||||
panic!("encode_rowid overflow: document_id={document_id}, chunk_index={chunk_index}")
|
||||
})
|
||||
}
|
||||
|
||||
pub fn decode_rowid(rowid: i64) -> (i64, i64) {
|
||||
assert!(
|
||||
rowid >= 0,
|
||||
"decode_rowid called with negative rowid: {rowid}"
|
||||
);
|
||||
let document_id = rowid / CHUNK_ROWID_MULTIPLIER;
|
||||
let chunk_index = rowid % CHUNK_ROWID_MULTIPLIER;
|
||||
(document_id, chunk_index)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod chunk_ids_tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_encode_single_chunk() {
|
||||
assert_eq!(encode_rowid(1, 0), 1000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_multi_chunk() {
|
||||
assert_eq!(encode_rowid(1, 5), 1005);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_specific_values() {
|
||||
assert_eq!(encode_rowid(42, 0), 42000);
|
||||
assert_eq!(encode_rowid(42, 5), 42005);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_zero_chunk() {
|
||||
assert_eq!(decode_rowid(42000), (42, 0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_roundtrip() {
|
||||
for doc_id in [0, 1, 42, 100, 999, 10000] {
|
||||
for chunk_idx in [0, 1, 5, 99, 999] {
|
||||
let rowid = encode_rowid(doc_id, chunk_idx);
|
||||
let (decoded_doc, decoded_chunk) = decode_rowid(rowid);
|
||||
assert_eq!(
|
||||
(decoded_doc, decoded_chunk),
|
||||
(doc_id, chunk_idx),
|
||||
"Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multiplier_value() {
|
||||
assert_eq!(CHUNK_ROWID_MULTIPLIER, 1000);
|
||||
}
|
||||
}
|
||||
pub const CHUNK_MAX_BYTES: usize = 1_500;
|
||||
|
||||
pub const EXPECTED_DIMS: usize = 768;
|
||||
|
||||
pub const CHUNK_OVERLAP_CHARS: usize = 200;
|
||||
|
||||
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
|
||||
if content.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
if content.len() <= CHUNK_MAX_BYTES {
|
||||
return vec![(0, content.to_string())];
|
||||
}
|
||||
|
||||
let mut chunks: Vec<(usize, String)> = Vec::new();
|
||||
let mut start = 0;
|
||||
let mut chunk_index = 0;
|
||||
|
||||
while start < content.len() {
|
||||
let remaining = &content[start..];
|
||||
if remaining.len() <= CHUNK_MAX_BYTES {
|
||||
chunks.push((chunk_index, remaining.to_string()));
|
||||
break;
|
||||
}
|
||||
|
||||
let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
|
||||
let window = &content[start..end];
|
||||
|
||||
let split_at = find_paragraph_break(window)
|
||||
.or_else(|| find_sentence_break(window))
|
||||
.or_else(|| find_word_break(window))
|
||||
.unwrap_or(window.len());
|
||||
|
||||
let chunk_text = &content[start..start + split_at];
|
||||
chunks.push((chunk_index, chunk_text.to_string()));
|
||||
|
||||
let advance = if split_at > CHUNK_OVERLAP_CHARS {
|
||||
split_at - CHUNK_OVERLAP_CHARS
|
||||
} else {
|
||||
split_at
|
||||
}
|
||||
.max(1);
|
||||
let old_start = start;
|
||||
start += advance;
|
||||
// Ensure start lands on a char boundary after overlap subtraction
|
||||
start = floor_char_boundary(content, start);
|
||||
// Guarantee forward progress: multi-byte chars can cause
|
||||
// floor_char_boundary to round back to old_start
|
||||
if start <= old_start {
|
||||
start = old_start
|
||||
+ content[old_start..]
|
||||
.chars()
|
||||
.next()
|
||||
.map_or(1, |c| c.len_utf8());
|
||||
}
|
||||
chunk_index += 1;
|
||||
}
|
||||
|
||||
chunks
|
||||
}
|
||||
|
||||
fn find_paragraph_break(window: &str) -> Option<usize> {
|
||||
let search_start = floor_char_boundary(window, window.len() * 2 / 3);
|
||||
window[search_start..]
|
||||
.rfind("\n\n")
|
||||
.map(|pos| search_start + pos + 2)
|
||||
.or_else(|| window[..search_start].rfind("\n\n").map(|pos| pos + 2))
|
||||
}
|
||||
|
||||
fn find_sentence_break(window: &str) -> Option<usize> {
|
||||
let search_start = floor_char_boundary(window, window.len() / 2);
|
||||
for pat in &[". ", "? ", "! "] {
|
||||
if let Some(pos) = window[search_start..].rfind(pat) {
|
||||
return Some(search_start + pos + pat.len());
|
||||
}
|
||||
}
|
||||
for pat in &[". ", "? ", "! "] {
|
||||
if let Some(pos) = window[..search_start].rfind(pat) {
|
||||
return Some(pos + pat.len());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn find_word_break(window: &str) -> Option<usize> {
|
||||
let search_start = floor_char_boundary(window, window.len() / 2);
|
||||
window[search_start..]
|
||||
.rfind(' ')
|
||||
.map(|pos| search_start + pos + 1)
|
||||
.or_else(|| window[..search_start].rfind(' ').map(|pos| pos + 1))
|
||||
}
|
||||
|
||||
fn floor_char_boundary(s: &str, idx: usize) -> usize {
|
||||
if idx >= s.len() {
|
||||
return s.len();
|
||||
}
|
||||
let mut i = idx;
|
||||
while i > 0 && !s.is_char_boundary(i) {
|
||||
i -= 1;
|
||||
}
|
||||
i
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "chunking_tests.rs"]
|
||||
mod chunking_tests;
|
||||
@@ -1,11 +1,10 @@
|
||||
pub mod change_detector;
|
||||
pub mod chunk_ids;
|
||||
pub mod chunking;
|
||||
pub mod chunks;
|
||||
pub mod ollama;
|
||||
pub mod pipeline;
|
||||
pub mod similarity;
|
||||
|
||||
pub use change_detector::{PendingDocument, count_pending_documents, find_pending_documents};
|
||||
pub use chunking::{CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS, split_into_chunks};
|
||||
pub use chunks::{CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS, split_into_chunks};
|
||||
pub use pipeline::{EmbedForIdsResult, EmbedResult, embed_documents, embed_documents_by_ids};
|
||||
pub use similarity::cosine_similarity;
|
||||
|
||||
@@ -9,8 +9,9 @@ use tracing::{debug, info, instrument, warn};
|
||||
use crate::core::error::Result;
|
||||
use crate::core::shutdown::ShutdownSignal;
|
||||
use crate::embedding::change_detector::{count_pending_documents, find_pending_documents};
|
||||
use crate::embedding::chunk_ids::{CHUNK_ROWID_MULTIPLIER, encode_rowid};
|
||||
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS, split_into_chunks};
|
||||
use crate::embedding::chunks::{
|
||||
CHUNK_MAX_BYTES, CHUNK_ROWID_MULTIPLIER, EXPECTED_DIMS, encode_rowid, split_into_chunks,
|
||||
};
|
||||
use crate::embedding::ollama::OllamaClient;
|
||||
|
||||
const BATCH_SIZE: usize = 32;
|
||||
@@ -685,7 +686,7 @@ fn find_documents_by_ids(
|
||||
document_ids: &[i64],
|
||||
model_name: &str,
|
||||
) -> Result<Vec<crate::embedding::change_detector::PendingDocument>> {
|
||||
use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
||||
use crate::embedding::chunks::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
|
||||
|
||||
if document_ids.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
|
||||
@@ -6,7 +6,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::core::db::{create_connection, run_migrations};
|
||||
use crate::core::shutdown::ShutdownSignal;
|
||||
use crate::embedding::chunking::EXPECTED_DIMS;
|
||||
use crate::embedding::chunks::EXPECTED_DIMS;
|
||||
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
|
||||
use crate::embedding::pipeline::embed_documents_by_ids;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user