From b168a58134d3a0548b9ee63ac2ef3cf80ac3f476 Mon Sep 17 00:00:00 2001 From: Taylor Eernisse Date: Sun, 8 Feb 2026 14:34:05 -0500 Subject: [PATCH] fix(search): cap vector search k-value and add rowid assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vector search multiplier could grow unbounded on documents with many chunks, producing enormous k values that cause SQLite to scan far more rows than necessary. Clamp the multiplier to [8, 200] and cap k at 10,000 to prevent degenerate performance on large corpora. Also adds a debug_assert in decode_rowid to catch negative rowids early — these indicate a bug in the encoding pipeline and should fail fast rather than silently produce garbage document IDs. Co-Authored-By: Claude Opus 4.6 --- src/embedding/chunk_ids.rs | 4 ++++ src/search/vector.rs | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/embedding/chunk_ids.rs b/src/embedding/chunk_ids.rs index 4214b03..629a2a4 100644 --- a/src/embedding/chunk_ids.rs +++ b/src/embedding/chunk_ids.rs @@ -14,6 +14,10 @@ pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 { } pub fn decode_rowid(rowid: i64) -> (i64, i64) { + debug_assert!( + rowid >= 0, + "decode_rowid called with negative rowid: {rowid}" + ); let document_id = rowid / CHUNK_ROWID_MULTIPLIER; let chunk_index = rowid % CHUNK_ROWID_MULTIPLIER; (document_id, chunk_index) diff --git a/src/search/vector.rs b/src/search/vector.rs index bb09ec8..b7de7e9 100644 --- a/src/search/vector.rs +++ b/src/search/vector.rs @@ -51,8 +51,8 @@ pub fn search_vector( .collect(); let max_chunks = max_chunks_per_document(conn).max(1); - let multiplier = ((max_chunks.unsigned_abs() as usize * 3 / 2) + 1).max(8); - let k = limit * multiplier; + let multiplier = ((max_chunks.unsigned_abs() as usize * 3 / 2) + 1).clamp(8, 200); + let k = (limit * multiplier).min(10_000); let mut stmt = conn.prepare( "SELECT rowid, distance