refactor(structure): reorganize codebase into domain-focused modules

2026-03-06 15:22:42 -05:00
parent 4d41d74ea7
commit bf977eca1a
78 changed files with 8704 additions and 6973 deletions
--- a/src/embedding/change_detector.rs
+++ b/src/embedding/change_detector.rs
@@ -1,7 +1,7 @@
 use rusqlite::Connection;

 use crate::core::error::Result;
-use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
+use crate::embedding::chunks::{CHUNK_MAX_BYTES, EXPECTED_DIMS};

 #[derive(Debug)]
 pub struct PendingDocument {
--- a/src/embedding/chunks.rs
+++ b/src/embedding/chunks.rs
@@ -0,0 +1,177 @@
+/// Factor used to pack a document id and a chunk index into a single rowid:
+/// `rowid = document_id * CHUNK_ROWID_MULTIPLIER + chunk_index`.
+///
+/// Chunk indices must therefore lie in `[0, CHUNK_ROWID_MULTIPLIER)`.
+pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000;
+
+/// Packs a document id and a chunk index into one rowid.
+///
+/// The result is `document_id * CHUNK_ROWID_MULTIPLIER + chunk_index`.
+///
+/// # Panics
+///
+/// Panics if `chunk_index` is outside `[0, CHUNK_ROWID_MULTIPLIER)` or if the
+/// multiplication/addition overflows `i64`.
+pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
+    let index_in_range = chunk_index >= 0 && chunk_index < CHUNK_ROWID_MULTIPLIER;
+    assert!(
+        index_in_range,
+        "chunk_index {chunk_index} out of range [0, {CHUNK_ROWID_MULTIPLIER})"
+    );
+
+    // Checked arithmetic so an oversized document id fails loudly instead of wrapping.
+    let scaled = document_id.checked_mul(CHUNK_ROWID_MULTIPLIER);
+    match scaled.and_then(|base| base.checked_add(chunk_index)) {
+        Some(rowid) => rowid,
+        None => {
+            panic!("encode_rowid overflow: document_id={document_id}, chunk_index={chunk_index}")
+        }
+    }
+}
+
+/// Splits a rowid back into `(document_id, chunk_index)`.
+///
+/// The document id is the quotient and the chunk index the remainder of
+/// dividing by `CHUNK_ROWID_MULTIPLIER`.
+///
+/// # Panics
+///
+/// Panics if `rowid` is negative.
+pub fn decode_rowid(rowid: i64) -> (i64, i64) {
+    if rowid < 0 {
+        panic!("decode_rowid called with negative rowid: {rowid}");
+    }
+    (
+        rowid / CHUNK_ROWID_MULTIPLIER,
+        rowid % CHUNK_ROWID_MULTIPLIER,
+    )
+}
+
+#[cfg(test)]
+mod chunk_ids_tests {
+    use super::*;
+
+    // Chunk 0 of a document maps to exactly document_id * multiplier.
+    #[test]
+    fn test_encode_single_chunk() {
+        let rowid = encode_rowid(1, 0);
+        assert_eq!(rowid, 1000);
+    }
+
+    // A non-zero chunk index is added on top of the scaled document id.
+    #[test]
+    fn test_encode_multi_chunk() {
+        let rowid = encode_rowid(1, 5);
+        assert_eq!(rowid, 1005);
+    }
+
+    #[test]
+    fn test_encode_specific_values() {
+        let first = encode_rowid(42, 0);
+        let sixth = encode_rowid(42, 5);
+        assert_eq!(first, 42000);
+        assert_eq!(sixth, 42005);
+    }
+
+    #[test]
+    fn test_decode_zero_chunk() {
+        let decoded = decode_rowid(42000);
+        assert_eq!(decoded, (42, 0));
+    }
+
+    // Every (document, chunk) pair must survive an encode/decode cycle.
+    #[test]
+    fn test_decode_roundtrip() {
+        let doc_ids = [0, 1, 42, 100, 999, 10000];
+        let chunk_indices = [0, 1, 5, 99, 999];
+        for &doc_id in &doc_ids {
+            for &chunk_idx in &chunk_indices {
+                let decoded = decode_rowid(encode_rowid(doc_id, chunk_idx));
+                assert_eq!(
+                    decoded,
+                    (doc_id, chunk_idx),
+                    "Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}"
+                );
+            }
+        }
+    }
+
+    // Pins the multiplier value that the encoding tests above are written against.
+    #[test]
+    fn test_multiplier_value() {
+        let expected: i64 = 1000;
+        assert_eq!(CHUNK_ROWID_MULTIPLIER, expected);
+    }
+}
+/// Maximum size, in bytes, of a single chunk produced by `split_into_chunks`.
+pub const CHUNK_MAX_BYTES: usize = 1_500;
+
+/// NOTE(review): presumably the expected embedding vector dimensionality; it is
+/// not used by the code visible in this file — confirm against the pipeline.
+pub const EXPECTED_DIMS: usize = 768;
+
+/// Overlap between consecutive chunks produced by `split_into_chunks`.
+///
+/// Despite the name, `split_into_chunks` subtracts this value from a byte
+/// offset, so it is effectively measured in bytes rather than characters.
+pub const CHUNK_OVERLAP_CHARS: usize = 200;
+
+/// Splits `content` into overlapping chunks of at most `CHUNK_MAX_BYTES` bytes.
+///
+/// Returns `(chunk_index, chunk_text)` pairs with indices counting up from 0.
+/// Empty input yields an empty vector; input that already fits in
+/// `CHUNK_MAX_BYTES` is returned as a single chunk with index 0.
+///
+/// For longer input, each chunk ends at the best break found inside the current
+/// window, tried in this order: paragraph break, sentence break, word break,
+/// and finally the end of the window itself. The next chunk starts
+/// `CHUNK_OVERLAP_CHARS` bytes before the previous split point (when the chunk
+/// is longer than that), so neighbouring chunks share trailing/leading text.
+pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
+    if content.is_empty() {
+        return Vec::new();
+    }
+
+    // Fast path: the whole document fits in one chunk.
+    if content.len() <= CHUNK_MAX_BYTES {
+        return vec![(0, content.to_string())];
+    }
+
+    let mut chunks: Vec<(usize, String)> = Vec::new();
+    // Byte offset into `content` where the next chunk begins.
+    let mut start = 0;
+    let mut chunk_index = 0;
+
+    while start < content.len() {
+        let remaining = &content[start..];
+        // The tail fits in one chunk: emit it and stop.
+        if remaining.len() <= CHUNK_MAX_BYTES {
+            chunks.push((chunk_index, remaining.to_string()));
+            break;
+        }
+
+        // Clamp the window end to a char boundary so slicing cannot panic.
+        let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
+        let window = &content[start..end];
+
+        // Offset (relative to `start`) at which this chunk ends; prefers the
+        // most natural break and falls back to the full window.
+        let split_at = find_paragraph_break(window)
+            .or_else(|| find_sentence_break(window))
+            .or_else(|| find_word_break(window))
+            .unwrap_or(window.len());
+
+        let chunk_text = &content[start..start + split_at];
+        chunks.push((chunk_index, chunk_text.to_string()));
+
+        // Step back by the overlap only when the chunk is longer than the
+        // overlap; otherwise advance by the whole chunk. Always move >= 1 byte.
+        let advance = if split_at > CHUNK_OVERLAP_CHARS {
+            split_at - CHUNK_OVERLAP_CHARS
+        } else {
+            split_at
+        }
+        .max(1);
+        let old_start = start;
+        start += advance;
+        // Ensure start lands on a char boundary after overlap subtraction
+        start = floor_char_boundary(content, start);
+        // Guarantee forward progress: multi-byte chars can cause
+        // floor_char_boundary to round back to old_start
+        if start <= old_start {
+            start = old_start
+                + content[old_start..]
+                    .chars()
+                    .next()
+                    .map_or(1, |c| c.len_utf8());
+        }
+        chunk_index += 1;
+    }
+
+    chunks
+}
+
+/// Finds the byte offset just past the last blank-line separator (`"\n\n"`) in `window`.
+///
+/// The final third of the window is searched first so the chunk stays as large
+/// as possible; only if nothing is found there is the leading part searched.
+/// Returns `None` when neither part contains a separator.
+fn find_paragraph_break(window: &str) -> Option<usize> {
+    const SEPARATOR: &str = "\n\n";
+
+    let pivot = floor_char_boundary(window, window.len() * 2 / 3);
+    let (head, tail) = window.split_at(pivot);
+
+    if let Some(offset) = tail.rfind(SEPARATOR) {
+        return Some(pivot + offset + 2);
+    }
+    head.rfind(SEPARATOR).map(|offset| offset + 2)
+}
+
+/// Finds the byte offset just past a sentence terminator (`". "`, `"? "`, `"! "`) in `window`.
+///
+/// The second half of the window is searched first, then the first half.
+/// Within each half the terminators are tried in the listed order and the
+/// first one that occurs wins (using its last occurrence in that half).
+/// Returns `None` when no terminator is present in either half.
+fn find_sentence_break(window: &str) -> Option<usize> {
+    const TERMINATORS: [&str; 3] = [". ", "? ", "! "];
+
+    let pivot = floor_char_boundary(window, window.len() / 2);
+    let (head, tail) = window.split_at(pivot);
+
+    let in_tail = TERMINATORS
+        .iter()
+        .find_map(|term| tail.rfind(*term).map(|offset| pivot + offset + term.len()));
+    if in_tail.is_some() {
+        return in_tail;
+    }
+
+    TERMINATORS
+        .iter()
+        .find_map(|term| head.rfind(*term).map(|offset| offset + term.len()))
+}
+
+/// Finds the byte offset just past the last space in `window`.
+///
+/// The second half of the window is preferred; the first half is only
+/// consulted when the second half contains no space. Returns `None` when the
+/// window has no space in either half.
+fn find_word_break(window: &str) -> Option<usize> {
+    let pivot = floor_char_boundary(window, window.len() / 2);
+    let (head, tail) = window.split_at(pivot);
+
+    match tail.rfind(' ') {
+        Some(offset) => Some(pivot + offset + 1),
+        None => head.rfind(' ').map(|offset| offset + 1),
+    }
+}
+
+/// Returns the largest char boundary in `s` that is less than or equal to `idx`.
+///
+/// Indices at or beyond the end of the string are clamped to `s.len()`.
+fn floor_char_boundary(s: &str, idx: usize) -> usize {
+    let clamped = idx.min(s.len());
+    // Offset 0 is always a char boundary, so the search cannot come up empty.
+    (0..=clamped)
+        .rev()
+        .find(|&pos| s.is_char_boundary(pos))
+        .unwrap_or(0)
+}
+
+// Test-only module whose source is loaded from the sibling file
+// `chunking_tests.rs` via the explicit `#[path]` attribute.
+#[cfg(test)]
+#[path = "chunking_tests.rs"]
+mod chunking_tests;
--- a/src/embedding/mod.rs
+++ b/src/embedding/mod.rs
@@ -1,11 +1,10 @@
 pub mod change_detector;
-pub mod chunk_ids;
-pub mod chunking;
+pub mod chunks;
 pub mod ollama;
 pub mod pipeline;
 pub mod similarity;

 pub use change_detector::{PendingDocument, count_pending_documents, find_pending_documents};
-pub use chunking::{CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS, split_into_chunks};
+pub use chunks::{CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS, split_into_chunks};
 pub use pipeline::{EmbedForIdsResult, EmbedResult, embed_documents, embed_documents_by_ids};
 pub use similarity::cosine_similarity;
--- a/src/embedding/pipeline.rs
+++ b/src/embedding/pipeline.rs
@@ -9,8 +9,9 @@ use tracing::{debug, info, instrument, warn};
 use crate::core::error::Result;
 use crate::core::shutdown::ShutdownSignal;
 use crate::embedding::change_detector::{count_pending_documents, find_pending_documents};
-use crate::embedding::chunk_ids::{CHUNK_ROWID_MULTIPLIER, encode_rowid};
-use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS, split_into_chunks};
+use crate::embedding::chunks::{
+    CHUNK_MAX_BYTES, CHUNK_ROWID_MULTIPLIER, EXPECTED_DIMS, encode_rowid, split_into_chunks,
+};
 use crate::embedding::ollama::OllamaClient;

 const BATCH_SIZE: usize = 32;
@@ -685,7 +686,7 @@ fn find_documents_by_ids(
    document_ids: &[i64],
    model_name: &str,
 ) -> Result<Vec<crate::embedding::change_detector::PendingDocument>> {
-    use crate::embedding::chunking::{CHUNK_MAX_BYTES, EXPECTED_DIMS};
+    use crate::embedding::chunks::{CHUNK_MAX_BYTES, EXPECTED_DIMS};

    if document_ids.is_empty() {
        return Ok(Vec::new());
--- a/src/embedding/pipeline_tests.rs
+++ b/src/embedding/pipeline_tests.rs
@@ -6,7 +6,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};

 use crate::core::db::{create_connection, run_migrations};
 use crate::core::shutdown::ShutdownSignal;
-use crate::embedding::chunking::EXPECTED_DIMS;
+use crate::embedding::chunks::EXPECTED_DIMS;
 use crate::embedding::ollama::{OllamaClient, OllamaConfig};
 use crate::embedding::pipeline::embed_documents_by_ids;