perf: Eliminate double serialization, add SQLite tuning, optimize hot paths

11 behavior-preserving performance fixes from a deep audit (no functional changes intended):

- Eliminate double serialization: store_payload now accepts pre-serialized
  bytes (&[u8]) instead of re-serializing from serde_json::Value. Uses
  Cow<[u8]> for zero-copy when compression is disabled.
- Add SQLite cache_size (64MB) and mmap_size (256MB) pragmas
- Replace SELECT-then-INSERT label upserts with INSERT...ON CONFLICT
  RETURNING in both issues.rs and merge_requests.rs
- Replace INSERT + SELECT milestone upsert with RETURNING
- Use prepare_cached for 5 hot-path queries in extractor.rs
- Optimize compute_list_hash: index-sort + incremental SHA-256 instead
  of clone+sort+join+hash
- Pre-allocate embedding float-to-bytes buffer with Vec::with_capacity
- Replace RandomState::new() in rand_jitter with atomic counter XOR nanos
- Remove redundant per-note payload storage for issue discussions (the
  discussion payload already contains all notes); MR discussion notes
  still store per-note payloads
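- Change transform_issue to accept &GitLabIssue (avoids full struct clone)
- Also (not a perf change): reset_discussion_watermarks now clears
  resource_events_synced_for_updated_at for MRs on full sync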

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-04 08:12:37 -05:00
parent f5b4a765b7
commit ee5c5f9645
10 changed files with 172 additions and 157 deletions

View File

@@ -111,14 +111,14 @@ async fn ingest_discussions_for_issue(
result.discussions_fetched += 1;
// Store raw payload
let payload_json = serde_json::to_value(&gitlab_discussion)?;
let payload_bytes = serde_json::to_vec(&gitlab_discussion)?;
let payload_id = store_payload(
conn,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "discussion",
gitlab_id: &gitlab_discussion.id,
payload: &payload_json,
json_bytes: &payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?;
@@ -156,25 +156,10 @@ async fn ingest_discussions_for_issue(
)?;
for note in notes {
// Store raw note payload
let note_payload_json = serde_json::to_value(
gitlab_discussion
.notes
.iter()
.find(|n| n.id == note.gitlab_id),
)?;
let note_payload_id = store_payload(
&tx,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "note",
gitlab_id: &note.gitlab_id.to_string(),
payload: &note_payload_json,
compress: config.storage.compress_raw_payloads,
},
)?;
insert_note(&tx, local_discussion_id, &note, note_payload_id)?;
// Note: per-note raw payload storage is skipped because the discussion
// payload (already stored above) contains all notes. The full note
// content is also stored in the notes table itself.
insert_note(&tx, local_discussion_id, &note, None)?;
}
tx.commit()?;
@@ -246,7 +231,7 @@ fn insert_note(
conn: &Connection,
discussion_id: i64,
note: &crate::gitlab::transformers::NormalizedNote,
payload_id: i64,
payload_id: Option<i64>,
) -> Result<()> {
conn.execute(
"INSERT INTO notes (

View File

@@ -196,8 +196,8 @@ fn process_single_issue(
let now = now_ms();
// Transform issue first (outside transaction - no DB access)
let payload_json = serde_json::to_value(issue)?;
let transformed = transform_issue(issue.clone())?;
let payload_bytes = serde_json::to_vec(issue)?;
let transformed = transform_issue(issue)?;
let issue_row = &transformed.issue;
// Wrap all DB operations in a transaction for atomicity
@@ -207,7 +207,7 @@ fn process_single_issue(
config,
project_id,
issue,
&payload_json,
&payload_bytes,
issue_row,
&transformed.label_names,
&transformed.assignee_usernames,
@@ -226,7 +226,7 @@ fn process_issue_in_transaction(
config: &Config,
project_id: i64,
issue: &GitLabIssue,
payload_json: &serde_json::Value,
payload_bytes: &[u8],
issue_row: &crate::gitlab::transformers::IssueRow,
label_names: &[String],
assignee_usernames: &[String],
@@ -242,7 +242,7 @@ fn process_issue_in_transaction(
project_id: Some(project_id),
resource_type: "issue",
gitlab_id: &issue.id.to_string(),
payload: payload_json,
json_bytes: payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?;
@@ -332,33 +332,27 @@ fn process_issue_in_transaction(
}
/// Upsert a label within a transaction, returning its ID.
/// Uses INSERT...ON CONFLICT...RETURNING for a single round-trip.
fn upsert_label_tx(
tx: &Transaction<'_>,
project_id: i64,
name: &str,
created_count: &mut usize,
) -> Result<i64> {
// Try to get existing
let existing: Option<i64> = tx
.query_row(
"SELECT id FROM labels WHERE project_id = ? AND name = ?",
(project_id, name),
|row| row.get(0),
)
.ok();
let id: i64 = tx.query_row(
"INSERT INTO labels (project_id, name) VALUES (?1, ?2)
ON CONFLICT(project_id, name) DO UPDATE SET name = excluded.name
RETURNING id",
(project_id, name),
|row| row.get(0),
)?;
if let Some(id) = existing {
return Ok(id);
// If the rowid matches last_insert_rowid, this was a new insert
if tx.last_insert_rowid() == id {
*created_count += 1;
}
// Insert new
tx.execute(
"INSERT INTO labels (project_id, name) VALUES (?, ?)",
(project_id, name),
)?;
*created_count += 1;
Ok(tx.last_insert_rowid())
Ok(id)
}
/// Link an issue to a label within a transaction.
@@ -371,12 +365,13 @@ fn link_issue_label_tx(tx: &Transaction<'_>, issue_id: i64, label_id: i64) -> Re
}
/// Upsert a milestone within a transaction, returning its local ID.
/// Uses RETURNING to avoid a separate SELECT round-trip.
fn upsert_milestone_tx(
tx: &Transaction<'_>,
project_id: i64,
milestone: &MilestoneRow,
) -> Result<i64> {
tx.execute(
let local_id: i64 = tx.query_row(
"INSERT INTO milestones (gitlab_id, project_id, iid, title, description, state, due_date, web_url)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
ON CONFLICT(project_id, gitlab_id) DO UPDATE SET
@@ -385,7 +380,8 @@ fn upsert_milestone_tx(
description = excluded.description,
state = excluded.state,
due_date = excluded.due_date,
web_url = excluded.web_url",
web_url = excluded.web_url
RETURNING id",
(
milestone.gitlab_id,
project_id,
@@ -396,12 +392,6 @@ fn upsert_milestone_tx(
&milestone.due_date,
&milestone.web_url,
),
)?;
// Get the local ID (whether inserted or updated)
let local_id: i64 = tx.query_row(
"SELECT id FROM milestones WHERE project_id = ? AND gitlab_id = ?",
(project_id, milestone.gitlab_id),
|row| row.get(0),
)?;

View File

@@ -166,14 +166,14 @@ fn process_single_mr(
mr: &GitLabMergeRequest,
) -> Result<ProcessMrResult> {
// Transform MR first (outside transaction - no DB access)
let payload_json = serde_json::to_value(mr)?;
let payload_bytes = serde_json::to_vec(mr)?;
let transformed = transform_merge_request(mr, project_id)
.map_err(|e| LoreError::Other(format!("MR transform failed: {}", e)))?;
// Wrap all DB operations in a transaction for atomicity
let tx = conn.unchecked_transaction()?;
let result =
process_mr_in_transaction(&tx, config, project_id, mr, &payload_json, &transformed)?;
process_mr_in_transaction(&tx, config, project_id, mr, &payload_bytes, &transformed)?;
tx.commit()?;
Ok(result)
@@ -185,7 +185,7 @@ fn process_mr_in_transaction(
config: &Config,
project_id: i64,
mr: &GitLabMergeRequest,
payload_json: &serde_json::Value,
payload_bytes: &[u8],
transformed: &crate::gitlab::transformers::merge_request::MergeRequestWithMetadata,
) -> Result<ProcessMrResult> {
let mut labels_created = 0;
@@ -199,7 +199,7 @@ fn process_mr_in_transaction(
project_id: Some(project_id),
resource_type: "merge_request",
gitlab_id: &mr.id.to_string(),
payload: payload_json,
json_bytes: payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?;
@@ -315,33 +315,28 @@ fn process_mr_in_transaction(
}
/// Upsert a label within a transaction, returning its ID.
/// Upsert a label within a transaction, returning its ID.
/// Uses INSERT...ON CONFLICT...RETURNING for a single round-trip.
fn upsert_label_tx(
tx: &Transaction<'_>,
project_id: i64,
name: &str,
created_count: &mut usize,
) -> Result<i64> {
// Try to get existing
let existing: Option<i64> = tx
.query_row(
"SELECT id FROM labels WHERE project_id = ? AND name = ?",
(project_id, name),
|row| row.get(0),
)
.ok();
let id: i64 = tx.query_row(
"INSERT INTO labels (project_id, name) VALUES (?1, ?2)
ON CONFLICT(project_id, name) DO UPDATE SET name = excluded.name
RETURNING id",
(project_id, name),
|row| row.get(0),
)?;
if let Some(id) = existing {
return Ok(id);
// If the rowid matches last_insert_rowid, this was a new insert
if tx.last_insert_rowid() == id {
*created_count += 1;
}
// Insert new
tx.execute(
"INSERT INTO labels (project_id, name) VALUES (?, ?)",
(project_id, name),
)?;
*created_count += 1;
Ok(tx.last_insert_rowid())
Ok(id)
}
/// Check if an MR passes the cursor filter (not already processed).
@@ -412,13 +407,14 @@ fn reset_sync_cursor(conn: &Connection, project_id: i64) -> Result<()> {
Ok(())
}
/// Reset discussion watermarks for all MRs in project (for full sync).
/// Reset discussion and resource event watermarks for all MRs in project (for full sync).
fn reset_discussion_watermarks(conn: &Connection, project_id: i64) -> Result<()> {
conn.execute(
"UPDATE merge_requests
SET discussions_synced_for_updated_at = NULL,
discussions_sync_attempts = 0,
discussions_sync_last_error = NULL
discussions_sync_last_error = NULL,
resource_events_synced_for_updated_at = NULL
WHERE project_id = ?",
[project_id],
)?;

View File

@@ -172,14 +172,14 @@ pub fn write_prefetched_mr_discussions(
let tx = conn.unchecked_transaction()?;
// Store raw payload
let payload_json = serde_json::to_value(&disc.raw)?;
let payload_bytes = serde_json::to_vec(&disc.raw)?;
let payload_id = Some(store_payload(
&tx,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "discussion",
gitlab_id: &disc.raw.id,
payload: &payload_json,
json_bytes: &payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?);
@@ -206,14 +206,14 @@ pub fn write_prefetched_mr_discussions(
let note_payload_id = if should_store_payload {
let note_data = disc.raw.notes.iter().find(|n| n.id == note.gitlab_id);
if let Some(note_data) = note_data {
let note_payload_json = serde_json::to_value(note_data)?;
let note_payload_bytes = serde_json::to_vec(note_data)?;
Some(store_payload(
&tx,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "note",
gitlab_id: &note.gitlab_id.to_string(),
payload: &note_payload_json,
json_bytes: &note_payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?)
@@ -388,14 +388,14 @@ async fn ingest_discussions_for_mr(
let tx = conn.unchecked_transaction()?;
// Store raw payload
let payload_json = serde_json::to_value(&gitlab_discussion)?;
let payload_bytes = serde_json::to_vec(&gitlab_discussion)?;
let payload_id = Some(store_payload(
&tx,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "discussion",
gitlab_id: &gitlab_discussion.id,
payload: &payload_json,
json_bytes: &payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?);
@@ -429,14 +429,14 @@ async fn ingest_discussions_for_mr(
.iter()
.find(|n| n.id == note.gitlab_id);
if let Some(note_data) = note_data {
let note_payload_json = serde_json::to_value(note_data)?;
let note_payload_bytes = serde_json::to_vec(note_data)?;
Some(store_payload(
&tx,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "note",
gitlab_id: &note.gitlab_id.to_string(),
payload: &note_payload_json,
json_bytes: &note_payload_bytes,
compress: config.storage.compress_raw_payloads,
},
)?)