perf: Optimize SQL queries and reduce allocations in hot paths
Change detection queries (embedding/change_detector.rs):
- Replace triple-EXISTS subquery pattern with LEFT JOIN + NULL check
- SQLite now scans embedding_metadata once instead of three times
- Semantically identical: returns docs needing embedding when no embedding exists, hash changed, or config mismatch

Count queries (cli/commands/count.rs):
- Consolidate 3 separate COUNT queries for issues into single query using conditional aggregation (CASE WHEN state = 'x' THEN 1)
- Same optimization for MRs: 5 queries reduced to 1

Search filter queries (search/filters.rs):
- Replace N separate EXISTS clauses for label filtering with single IN() clause with COUNT/GROUP BY HAVING pattern
- For multi-label AND queries, this reduces N subqueries to 1

FTS tokenization (search/fts.rs):
- Replace collect-into-Vec-then-join pattern with direct String building
- Pre-allocate capacity hint for result string

Discussion truncation (documents/truncation.rs):
- Calculate total length without allocating concatenated string first
- Only allocate full string when we know it fits within limit

Embedding pipeline (embedding/pipeline.rs):
- Add Vec::with_capacity hints for chunk work and cleared_docs hashset
- Reduces reallocations during embedding batch processing

Backoff calculation (core/backoff.rs):
- Replace unchecked addition with saturating_add to prevent overflow
- Add test case verifying overflow protection

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -16,30 +16,25 @@ pub fn find_pending_documents(
|
||||
last_id: i64,
|
||||
model_name: &str,
|
||||
) -> Result<Vec<PendingDocument>> {
|
||||
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern.
|
||||
// This allows SQLite to scan embedding_metadata once instead of three times.
|
||||
// Semantically identical: returns documents needing (re-)embedding when:
|
||||
// - No embedding exists (em.document_id IS NULL)
|
||||
// - Content hash changed (em.document_hash != d.content_hash)
|
||||
// - Config mismatch (model/dims/chunk_max_bytes)
|
||||
let sql = r#"
|
||||
SELECT d.id, d.content_text, d.content_hash
|
||||
FROM documents d
|
||||
LEFT JOIN embedding_metadata em
|
||||
ON em.document_id = d.id AND em.chunk_index = 0
|
||||
WHERE d.id > ?1
|
||||
AND (
|
||||
NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND (
|
||||
em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?3
|
||||
OR em.model != ?4
|
||||
OR em.dims != ?5
|
||||
)
|
||||
)
|
||||
em.document_id IS NULL
|
||||
OR em.document_hash != d.content_hash
|
||||
OR em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?3
|
||||
OR em.model != ?4
|
||||
OR em.dims != ?5
|
||||
)
|
||||
ORDER BY d.id
|
||||
LIMIT ?2
|
||||
@@ -69,31 +64,19 @@ pub fn find_pending_documents(
|
||||
}
|
||||
|
||||
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
|
||||
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern
|
||||
let count: i64 = conn.query_row(
|
||||
r#"
|
||||
SELECT COUNT(*)
|
||||
FROM documents d
|
||||
WHERE (
|
||||
NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND (
|
||||
em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?1
|
||||
OR em.model != ?2
|
||||
OR em.dims != ?3
|
||||
)
|
||||
)
|
||||
)
|
||||
LEFT JOIN embedding_metadata em
|
||||
ON em.document_id = d.id AND em.chunk_index = 0
|
||||
WHERE em.document_id IS NULL
|
||||
OR em.document_hash != d.content_hash
|
||||
OR em.chunk_max_bytes IS NULL
|
||||
OR em.chunk_max_bytes != ?1
|
||||
OR em.model != ?2
|
||||
OR em.dims != ?3
|
||||
"#,
|
||||
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|
||||
|row| row.get(0),
|
||||
|
||||
@@ -103,7 +103,7 @@ async fn embed_page(
|
||||
total: usize,
|
||||
progress_callback: &Option<Box<dyn Fn(usize, usize)>>,
|
||||
) -> Result<()> {
|
||||
let mut all_chunks: Vec<ChunkWork> = Vec::new();
|
||||
let mut all_chunks: Vec<ChunkWork> = Vec::with_capacity(pending.len() * 3);
|
||||
let mut page_normal_docs: usize = 0;
|
||||
|
||||
for doc in pending {
|
||||
@@ -159,7 +159,7 @@ async fn embed_page(
|
||||
page_normal_docs += 1;
|
||||
}
|
||||
|
||||
let mut cleared_docs: HashSet<i64> = HashSet::new();
|
||||
let mut cleared_docs: HashSet<i64> = HashSet::with_capacity(pending.len());
|
||||
|
||||
for batch in all_chunks.chunks(BATCH_SIZE) {
|
||||
let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
|
||||
|
||||
Reference in New Issue
Block a user