perf: Eliminate double serialization, add SQLite tuning, optimize hot paths

11 isomorphic performance fixes from deep audit (no behavior changes):

- Eliminate double serialization: store_payload now accepts pre-serialized
  bytes (&[u8]) instead of re-serializing from serde_json::Value. Uses
  Cow<[u8]> for zero-copy when compression is disabled.
- Add SQLite cache_size (64MB) and mmap_size (256MB) pragmas
- Replace SELECT-then-INSERT label upserts with INSERT...ON CONFLICT
  RETURNING in both issues.rs and merge_requests.rs
- Replace INSERT + SELECT milestone upsert with RETURNING
- Use prepare_cached for 5 hot-path queries in extractor.rs
- Optimize compute_list_hash: index-sort + incremental SHA-256 instead
  of clone+sort+join+hash
- Pre-allocate embedding float-to-bytes buffer with Vec::with_capacity
- Replace RandomState::new() in rand_jitter with atomic counter XOR nanos
- Remove redundant per-note payload storage (discussion payload contains
  all notes already)
- Change transform_issue to accept &GitLabIssue (avoids full struct clone)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-04 08:12:37 -05:00
parent f5b4a765b7
commit ee5c5f9645
10 changed files with 172 additions and 157 deletions

View File

@@ -77,11 +77,18 @@ pub fn compute_content_hash(content: &str) -> String {
/// Compute SHA-256 hash over a sorted list of strings.
/// Used for labels_hash and paths_hash to detect changes efficiently.
/// Sorts by index reference to avoid cloning, hashes incrementally to avoid join allocation.
pub fn compute_list_hash(items: &[String]) -> String {
let mut sorted = items.to_vec();
sorted.sort();
let joined = sorted.join("\n");
compute_content_hash(&joined)
let mut indices: Vec<usize> = (0..items.len()).collect();
indices.sort_by(|a, b| items[*a].cmp(&items[*b]));
let mut hasher = Sha256::new();
for (i, &idx) in indices.iter().enumerate() {
if i > 0 {
hasher.update(b"\n");
}
hasher.update(items[idx].as_bytes());
}
format!("{:x}", hasher.finalize())
}
/// Extract a searchable document from an issue.
@@ -132,7 +139,7 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
};
// Query labels via junction table
let mut label_stmt = conn.prepare(
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id
WHERE il.issue_id = ?1
@@ -245,7 +252,7 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
};
// Query labels via junction table
let mut label_stmt = conn.prepare(
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id
WHERE ml.merge_request_id = ?1
@@ -373,7 +380,7 @@ pub fn extract_discussion_document(
Err(e) => return Err(e.into()),
};
// Query parent labels
let mut label_stmt = conn.prepare(
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id
WHERE il.issue_id = ?1
@@ -407,7 +414,7 @@ pub fn extract_discussion_document(
Err(e) => return Err(e.into()),
};
// Query parent labels
let mut label_stmt = conn.prepare(
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id
WHERE ml.merge_request_id = ?1
@@ -423,7 +430,7 @@ pub fn extract_discussion_document(
};
// Query non-system notes in thread order
let mut note_stmt = conn.prepare(
let mut note_stmt = conn.prepare_cached(
"SELECT n.author_username, n.body, n.created_at, n.gitlab_id,
n.note_type, n.position_old_path, n.position_new_path
FROM notes n
@@ -657,6 +664,7 @@ mod tests {
updated_at INTEGER NOT NULL,
last_seen_at INTEGER NOT NULL,
discussions_synced_for_updated_at INTEGER,
resource_events_synced_for_updated_at INTEGER,
web_url TEXT,
raw_payload_id INTEGER
);
@@ -899,6 +907,7 @@ mod tests {
discussions_sync_last_attempt_at INTEGER,
discussions_sync_attempts INTEGER DEFAULT 0,
discussions_sync_last_error TEXT,
resource_events_synced_for_updated_at INTEGER,
web_url TEXT,
raw_payload_id INTEGER
);