perf(ingestion): replace per-row INSERT loops with chunked batch INSERTs

The issue and MR ingestion paths previously inserted labels, assignees,
and reviewers one row at a time inside a transaction. For entities with
many labels or assignees, this issued N separate SQLite statements where
a single multi-row INSERT suffices.

Replace the per-row loops with batch INSERT functions that build a
single `INSERT OR IGNORE ... VALUES (?1,?2),(?1,?3),...` statement per
chunk. Chunks are capped at 400 rows (BATCH_LINK_ROWS_MAX) to stay
comfortably below SQLite's default 999 bind-parameter limit.

Affected paths:
- issues.rs: link_issue_labels_batch_tx, insert_issue_assignees_batch_tx
- merge_requests.rs: insert_mr_labels_batch_tx,
  insert_mr_assignees_batch_tx, insert_mr_reviewers_batch_tx

New tests verify deduplication (OR IGNORE), multi-chunk correctness,
and equivalence with the old per-row approach. A perf benchmark
(bench_issue_assignee_insert_individual_vs_batch) demonstrates the
speedup across representative assignee set sizes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-03-06 11:28:08 -05:00
parent 5fb27b1fbb
commit 9107a78b57
4 changed files with 534 additions and 28 deletions

View File

@@ -54,6 +54,11 @@ fn setup_db() -> Connection {
label_id INTEGER NOT NULL REFERENCES labels(id),
PRIMARY KEY(issue_id, label_id)
);
CREATE TABLE issue_assignees (
issue_id INTEGER NOT NULL REFERENCES issues(id),
username TEXT NOT NULL,
PRIMARY KEY(issue_id, username)
);
CREATE TABLE documents (
id INTEGER PRIMARY KEY,
@@ -145,6 +150,55 @@ fn insert_labels_batch(conn: &Connection, doc_id: i64, labels: &[&str]) {
}
}
/// Simulate OLD ingestion approach: individual INSERT per assignee.
fn insert_issue_assignees_individual(conn: &Connection, issue_id: i64, usernames: &[&str]) {
    // Replace-all semantics: wipe the issue's current assignees first.
    conn.execute("DELETE FROM issue_assignees WHERE issue_id = ?1", [issue_id])
        .unwrap();
    // One statement round-trip per assignee; the composite primary key plus
    // OR IGNORE silently drops duplicate usernames.
    usernames.iter().for_each(|name| {
        conn.execute(
            "INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES (?1, ?2)",
            rusqlite::params![issue_id, name],
        )
        .unwrap();
    });
}
/// Simulate NEW ingestion approach: chunked batch INSERTs.
fn insert_issue_assignees_batch(conn: &Connection, issue_id: i64, usernames: &[&str]) {
    // Replace-all semantics: wipe the issue's current assignees first.
    conn.execute("DELETE FROM issue_assignees WHERE issue_id = ?1", [issue_id])
        .unwrap();
    if usernames.is_empty() {
        return;
    }
    // Cap rows per statement so the bind count (rows + 1 for the shared
    // issue_id) stays well under SQLite's default 999-parameter limit.
    const BATCH_ROWS_MAX: usize = 400;
    for chunk in usernames.chunks(BATCH_ROWS_MAX) {
        // ?1 is the issue_id shared by every row tuple; each username binds
        // to its own slot in ?2..=?(len + 1).
        let value_tuples: Vec<String> = (2..chunk.len() + 2)
            .map(|slot| format!("(?1, ?{})", slot))
            .collect();
        let sql = format!(
            "INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES {}",
            value_tuples.join(", ")
        );
        let mut binds: Vec<&dyn rusqlite::types::ToSql> = Vec::with_capacity(chunk.len() + 1);
        binds.push(&issue_id);
        binds.extend(chunk.iter().map(|name| name as &dyn rusqlite::types::ToSql));
        conn.execute(&sql, binds.as_slice()).unwrap();
    }
}
/// Simulate OLD string building: format! + push_str
fn build_content_old(
iid: i64,
@@ -197,6 +251,22 @@ const LABEL_SETS: &[&[&str]] = &[
],
];
/// Representative assignee sets cycled through by the benchmark: small
/// through medium team sizes (2-7 names). Two of the sets deliberately
/// contain duplicate usernames so the OR IGNORE dedup path is exercised
/// in both the individual and batch insert approaches.
const ASSIGNEE_SETS: &[&[&str]] = &[
    // "alice" and "bob" appear twice — only 5 distinct rows land.
    &["alice", "bob", "carol", "dave", "eve", "alice", "bob"],
    &[
        "frontend1",
        "frontend2",
        "frontend3",
        "frontend4",
        "frontend5",
    ],
    // "ops1" appears twice — only 3 distinct rows land.
    &["ops1", "ops2", "ops3", "ops1"],
    &["writer1", "writer2"],
    &[
        "oncall1", "oncall2", "oncall3", "oncall4", "oncall5", "oncall6",
    ],
];
#[test]
fn bench_label_insert_individual_vs_batch() {
let conn = setup_db();
@@ -273,6 +343,77 @@ fn bench_label_insert_individual_vs_batch() {
);
}
#[test]
fn bench_issue_assignee_insert_individual_vs_batch() {
    let conn = setup_db();
    // Seed the parent issue row that issue_assignees.issue_id references.
    conn.execute(
        "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at)
        VALUES (2, 20, 1, 43, 'Assignee Issue', 'opened', 1000, 2000, 3000)",
        [],
    )
    .unwrap();
    let iterations = 20_000;
    // Warm up: run both paths once per input set so neither timed loop pays
    // first-touch costs (page cache, allocator warm-up) the other avoided.
    for users in ASSIGNEE_SETS {
        insert_issue_assignees_individual(&conn, 2, users);
        insert_issue_assignees_batch(&conn, 2, users);
    }
    // Benchmark OLD: one INSERT statement per assignee.
    let start = Instant::now();
    for i in 0..iterations {
        let users = ASSIGNEE_SETS[i % ASSIGNEE_SETS.len()];
        insert_issue_assignees_individual(&conn, 2, users);
    }
    let old_elapsed = start.elapsed();
    // Benchmark NEW: one multi-row INSERT per chunk.
    let start = Instant::now();
    for i in 0..iterations {
        let users = ASSIGNEE_SETS[i % ASSIGNEE_SETS.len()];
        insert_issue_assignees_batch(&conn, 2, users);
    }
    let new_elapsed = start.elapsed();
    // Timing is informational only (printed, never asserted), so the test
    // stays deterministic on slow or shared CI machines.
    let speedup = old_elapsed.as_nanos() as f64 / new_elapsed.as_nanos() as f64;
    println!(
        "\n=== Issue Assignee INSERT Benchmark ({} iterations) ===",
        iterations
    );
    println!("Individual INSERTs: {:?}", old_elapsed);
    println!("Batch INSERT: {:?}", new_elapsed);
    println!("Speedup: {:.2}x", speedup);
    println!();
    // Verify correctness: both approaches produce identical rows.
    // The input repeats "alice" to confirm OR IGNORE dedup matches too.
    insert_issue_assignees_individual(&conn, 2, &["alice", "bob", "alice", "carol"]);
    let old_rows: Vec<String> = conn
        .prepare("SELECT username FROM issue_assignees WHERE issue_id = 2 ORDER BY username")
        .unwrap()
        .query_map([], |row| row.get(0))
        .unwrap()
        .collect::<std::result::Result<Vec<_>, _>>()
        .unwrap();
    // The batch path begins with a DELETE, so it fully replaces the rows
    // inserted just above rather than accumulating on top of them.
    insert_issue_assignees_batch(&conn, 2, &["alice", "bob", "alice", "carol"]);
    let new_rows: Vec<String> = conn
        .prepare("SELECT username FROM issue_assignees WHERE issue_id = 2 ORDER BY username")
        .unwrap()
        .query_map([], |row| row.get(0))
        .unwrap()
        .collect::<std::result::Result<Vec<_>, _>>()
        .unwrap();
    assert_eq!(
        old_rows, new_rows,
        "Both approaches must produce identical rows"
    );
}
#[test]
fn bench_string_building_old_vs_new() {
let iterations = 50_000;