perf(ingestion): replace per-row INSERT loops with chunked batch INSERTs

The issue and MR ingestion paths previously inserted labels, assignees,
and reviewers one row at a time inside a transaction. For entities with
many labels or assignees, this issued N separate SQLite statements where
a single multi-row INSERT suffices.

Replace the per-row loops with batch INSERT functions that build a
single `INSERT OR IGNORE ... VALUES (?1,?2),(?1,?3),...` statement per
chunk. Chunks are capped at 400 rows (BATCH_LINK_ROWS_MAX) to stay
comfortably below SQLite's default 999 bind-parameter limit.

Affected paths:
- issues.rs: link_issue_labels_batch_tx, insert_issue_assignees_batch_tx
- merge_requests.rs: insert_mr_labels_batch_tx,
  insert_mr_assignees_batch_tx, insert_mr_reviewers_batch_tx

New tests verify deduplication (OR IGNORE), multi-chunk correctness,
and equivalence with the old per-row approach. A perf benchmark
(bench_issue_assignee_insert_individual_vs_batch) demonstrates the
speedup across representative assignee set sizes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-03-06 11:28:08 -05:00
parent 5fb27b1fbb
commit 9107a78b57
4 changed files with 534 additions and 28 deletions

View File

@@ -36,6 +36,9 @@ struct SyncCursor {
tie_breaker_id: Option<i64>,
}
// Keep comfortably below SQLite's default 999 bind-parameter limit.
const BATCH_LINK_ROWS_MAX: usize = 400;
pub async fn ingest_issues(
conn: &Connection,
client: &GitLabClient,
@@ -252,22 +255,19 @@ fn process_issue_in_transaction(
[local_issue_id],
)?;
let mut label_ids = Vec::with_capacity(label_names.len());
for label_name in label_names {
let label_id = upsert_label_tx(tx, project_id, label_name, &mut labels_created)?;
link_issue_label_tx(tx, local_issue_id, label_id)?;
label_ids.push(label_id);
}
link_issue_labels_batch_tx(tx, local_issue_id, &label_ids)?;
tx.execute(
"DELETE FROM issue_assignees WHERE issue_id = ?",
[local_issue_id],
)?;
for username in assignee_usernames {
tx.execute(
"INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES (?, ?)",
(local_issue_id, username),
)?;
}
insert_issue_assignees_batch_tx(tx, local_issue_id, assignee_usernames)?;
Ok(labels_created)
}
@@ -296,11 +296,65 @@ fn upsert_label_tx(
Ok(id)
}
fn link_issue_label_tx(tx: &Transaction<'_>, issue_id: i64, label_id: i64) -> Result<()> {
tx.execute(
"INSERT OR IGNORE INTO issue_labels (issue_id, label_id) VALUES (?, ?)",
(issue_id, label_id),
)?;
/// Links every id in `label_ids` to `issue_id` using multi-row
/// `INSERT OR IGNORE` statements. Work is chunked by BATCH_LINK_ROWS_MAX so
/// each statement stays below SQLite's bind-parameter limit; pairs that
/// already exist are silently skipped by `OR IGNORE`.
fn link_issue_labels_batch_tx(
    tx: &Transaction<'_>,
    issue_id: i64,
    label_ids: &[i64],
) -> Result<()> {
    // Nothing to link — avoid emitting an invalid empty VALUES clause.
    if label_ids.is_empty() {
        return Ok(());
    }
    for chunk in label_ids.chunks(BATCH_LINK_ROWS_MAX) {
        // ?1 is the shared issue_id; each row binds one label id at ?2, ?3, ...
        let rows: Vec<String> = (0..chunk.len())
            .map(|idx| format!("(?1, ?{})", idx + 2))
            .collect();
        let sql = format!(
            "INSERT OR IGNORE INTO issue_labels (issue_id, label_id) VALUES {}",
            rows.join(", ")
        );
        // Bind order mirrors placeholder numbering: issue_id first, then the
        // chunk's label ids in slice order.
        let mut binds: Vec<&dyn rusqlite::types::ToSql> = vec![&issue_id];
        binds.extend(chunk.iter().map(|id| id as &dyn rusqlite::types::ToSql));
        tx.execute(&sql, binds.as_slice())?;
    }
    Ok(())
}
/// Inserts one `(issue_id, username)` row per entry in `usernames` into
/// issue_assignees via chunked multi-row `INSERT OR IGNORE` statements.
/// Chunks are capped at BATCH_LINK_ROWS_MAX rows to stay below SQLite's
/// bind-parameter limit; duplicate pairs are ignored by `OR IGNORE`.
fn insert_issue_assignees_batch_tx(
    tx: &Transaction<'_>,
    issue_id: i64,
    usernames: &[String],
) -> Result<()> {
    if usernames.is_empty() {
        // No assignees: skip rather than build an empty VALUES list.
        return Ok(());
    }
    for chunk in usernames.chunks(BATCH_LINK_ROWS_MAX) {
        // Shared issue_id binds at ?1; usernames occupy ?2 ..= ?(len + 1).
        let mut sql =
            String::from("INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES ");
        for idx in 0..chunk.len() {
            if idx > 0 {
                sql.push_str(", ");
            }
            sql.push_str(&format!("(?1, ?{})", idx + 2));
        }
        // Bind order mirrors placeholder numbering.
        let mut binds: Vec<&dyn rusqlite::types::ToSql> = Vec::with_capacity(chunk.len() + 1);
        binds.push(&issue_id);
        for name in chunk {
            binds.push(name);
        }
        tx.execute(&sql, binds.as_slice())?;
    }
    Ok(())
}