perf: Optimize SQL queries and reduce allocations in hot paths

Change detection queries (embedding/change_detector.rs):
- Replace triple-EXISTS subquery pattern with LEFT JOIN + NULL check
- SQLite now scans embedding_metadata once instead of three times
- Semantically identical: returns docs needing embedding when no
  embedding exists, hash changed, or config mismatch

Count queries (cli/commands/count.rs):
- Consolidate 3 separate COUNT queries for issues into single query
  using conditional aggregation (CASE WHEN state = 'x' THEN 1)
- Same optimization for MRs: 5 queries reduced to 1

Search filter queries (search/filters.rs):
- Replace N separate EXISTS clauses for label filtering with single
  IN() clause with COUNT/GROUP BY HAVING pattern
- For multi-label AND queries, this reduces N subqueries to 1

FTS tokenization (search/fts.rs):
- Replace collect-into-Vec-then-join pattern with direct String building
- Pre-allocate capacity hint for result string

Discussion truncation (documents/truncation.rs):
- Calculate total length without allocating concatenated string first
- Only allocate full string when we know it fits within limit

Embedding pipeline (embedding/pipeline.rs):
- Add Vec::with_capacity hints for chunk work and cleared_docs hashset
- Reduces reallocations during embedding batch processing

Backoff calculation (core/backoff.rs):
- Replace unchecked addition with saturating_add to prevent overflow
- Add test case verifying overflow protection

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-05 11:21:28 -05:00
parent 9c04b7fb1b
commit 72f1cafdcf
7 changed files with 96 additions and 101 deletions

View File

@@ -16,30 +16,25 @@ pub fn find_pending_documents(
last_id: i64,
model_name: &str,
) -> Result<Vec<PendingDocument>> {
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern.
// This allows SQLite to scan embedding_metadata once instead of three times.
// Semantically identical: returns documents needing (re-)embedding when:
// - No embedding exists (em.document_id IS NULL)
// - Content hash changed (em.document_hash != d.content_hash)
// - Config mismatch (model/dims/chunk_max_bytes)
let sql = r#"
SELECT d.id, d.content_text, d.content_hash
FROM documents d
LEFT JOIN embedding_metadata em
ON em.document_id = d.id AND em.chunk_index = 0
WHERE d.id > ?1
AND (
NOT EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND (
em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?3
OR em.model != ?4
OR em.dims != ?5
)
)
em.document_id IS NULL
OR em.document_hash != d.content_hash
OR em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?3
OR em.model != ?4
OR em.dims != ?5
)
ORDER BY d.id
LIMIT ?2
@@ -69,31 +64,19 @@ pub fn find_pending_documents(
}
pub fn count_pending_documents(conn: &Connection, model_name: &str) -> Result<i64> {
// Optimized query: LEFT JOIN + NULL check replaces triple-EXISTS pattern
let count: i64 = conn.query_row(
r#"
SELECT COUNT(*)
FROM documents d
WHERE (
NOT EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND em.document_hash != d.content_hash
)
OR EXISTS (
SELECT 1 FROM embedding_metadata em
WHERE em.document_id = d.id AND em.chunk_index = 0
AND (
em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?1
OR em.model != ?2
OR em.dims != ?3
)
)
)
LEFT JOIN embedding_metadata em
ON em.document_id = d.id AND em.chunk_index = 0
WHERE em.document_id IS NULL
OR em.document_hash != d.content_hash
OR em.chunk_max_bytes IS NULL
OR em.chunk_max_bytes != ?1
OR em.model != ?2
OR em.dims != ?3
"#,
rusqlite::params![CHUNK_MAX_BYTES as i64, model_name, EXPECTED_DIMS as i64],
|row| row.get(0),