fix(search): cap vector search k-value and add rowid assertion

The vector search multiplier could grow unbounded on documents with
many chunks, producing enormous k values that cause SQLite to scan
far more rows than necessary. Clamp the multiplier to [8, 200] and
cap k at 10,000 to prevent degenerate performance on large corpora.

Also adds a debug_assert in decode_rowid to catch negative rowids
early — these indicate a bug in the encoding pipeline and should
fail fast rather than silently produce garbage document IDs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-08 14:34:05 -05:00
parent b704e33188
commit b168a58134
2 changed files with 6 additions and 2 deletions

View File

@@ -14,6 +14,10 @@ pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
}

pub fn decode_rowid(rowid: i64) -> (i64, i64) {
debug_assert!(
rowid >= 0,
"decode_rowid called with negative rowid: {rowid}"
);
let document_id = rowid / CHUNK_ROWID_MULTIPLIER;
let chunk_index = rowid % CHUNK_ROWID_MULTIPLIER;
(document_id, chunk_index)

View File

@@ -51,8 +51,8 @@ pub fn search_vector(
.collect();
let max_chunks = max_chunks_per_document(conn).max(1);
- let multiplier = ((max_chunks.unsigned_abs() as usize * 3 / 2) + 1).max(8);
+ let multiplier = ((max_chunks.unsigned_abs() as usize * 3 / 2) + 1).clamp(8, 200);
- let k = limit * multiplier;
+ let k = (limit * multiplier).min(10_000);
let mut stmt = conn.prepare(
"SELECT rowid, distance