perf(ingestion): replace per-row INSERT loops with chunked batch INSERTs
The issue and MR ingestion paths previously inserted labels, assignees, and reviewers one row at a time inside a transaction. For entities with many labels or assignees, this issued N separate SQLite statements where a single multi-row INSERT suffices.

Replace the per-row loops with batch INSERT functions that build a single `INSERT OR IGNORE ... VALUES (?1,?2),(?1,?3),...` statement per chunk. Chunks are capped at 400 rows (BATCH_LINK_ROWS_MAX), i.e. at most 401 bind parameters per statement (one shared id plus one value per row), to stay comfortably below SQLite's default 999 bind-parameter limit.

Affected paths:
- issues.rs: link_issue_labels_batch_tx, insert_issue_assignees_batch_tx
- merge_requests.rs: insert_mr_labels_batch_tx, insert_mr_assignees_batch_tx, insert_mr_reviewers_batch_tx

New tests verify deduplication (OR IGNORE), multi-chunk correctness, and equivalence with the old per-row approach. A perf benchmark (bench_issue_assignee_insert_individual_vs_batch) demonstrates the speedup across representative assignee set sizes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -54,6 +54,11 @@ fn setup_db() -> Connection {
|
||||
label_id INTEGER NOT NULL REFERENCES labels(id),
|
||||
PRIMARY KEY(issue_id, label_id)
|
||||
);
|
||||
CREATE TABLE issue_assignees (
|
||||
issue_id INTEGER NOT NULL REFERENCES issues(id),
|
||||
username TEXT NOT NULL,
|
||||
PRIMARY KEY(issue_id, username)
|
||||
);
|
||||
|
||||
CREATE TABLE documents (
|
||||
id INTEGER PRIMARY KEY,
|
||||
@@ -145,6 +150,55 @@ fn insert_labels_batch(conn: &Connection, doc_id: i64, labels: &[&str]) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Simulate OLD ingestion approach: individual INSERT per assignee.
|
||||
fn insert_issue_assignees_individual(conn: &Connection, issue_id: i64, usernames: &[&str]) {
|
||||
conn.execute(
|
||||
"DELETE FROM issue_assignees WHERE issue_id = ?1",
|
||||
[issue_id],
|
||||
)
|
||||
.unwrap();
|
||||
for username in usernames {
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES (?1, ?2)",
|
||||
rusqlite::params![issue_id, username],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Simulate NEW ingestion approach: chunked batch INSERTs.
|
||||
fn insert_issue_assignees_batch(conn: &Connection, issue_id: i64, usernames: &[&str]) {
|
||||
conn.execute(
|
||||
"DELETE FROM issue_assignees WHERE issue_id = ?1",
|
||||
[issue_id],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
if usernames.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
const BATCH_ROWS_MAX: usize = 400;
|
||||
for chunk in usernames.chunks(BATCH_ROWS_MAX) {
|
||||
let placeholders = (0..chunk.len())
|
||||
.map(|idx| format!("(?1, ?{})", idx + 2))
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ");
|
||||
let sql = format!(
|
||||
"INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES {}",
|
||||
placeholders
|
||||
);
|
||||
|
||||
let mut params: Vec<&dyn rusqlite::types::ToSql> = Vec::with_capacity(chunk.len() + 1);
|
||||
params.push(&issue_id);
|
||||
for username in chunk {
|
||||
params.push(username);
|
||||
}
|
||||
|
||||
conn.execute(&sql, params.as_slice()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Simulate OLD string building: format! + push_str
|
||||
fn build_content_old(
|
||||
iid: i64,
|
||||
@@ -197,6 +251,22 @@ const LABEL_SETS: &[&[&str]] = &[
|
||||
],
|
||||
];
|
||||
|
||||
/// Username lists cycled through by the issue-assignee benchmark.
///
/// The lists hold between two and seven entries, and two of them repeat a
/// name so the `INSERT OR IGNORE` duplicate path is exercised as well.
const ASSIGNEE_SETS: &[&[&str]] = &[
    &["alice", "bob", "carol", "dave", "eve", "alice", "bob"],
    &["frontend1", "frontend2", "frontend3", "frontend4", "frontend5"],
    &["ops1", "ops2", "ops3", "ops1"],
    &["writer1", "writer2"],
    &["oncall1", "oncall2", "oncall3", "oncall4", "oncall5", "oncall6"],
];
|
||||
|
||||
#[test]
|
||||
fn bench_label_insert_individual_vs_batch() {
|
||||
let conn = setup_db();
|
||||
@@ -273,6 +343,77 @@ fn bench_label_insert_individual_vs_batch() {
|
||||
);
|
||||
}
|
||||
|
||||
/// Times per-row assignee inserts against chunked batch inserts on the same
/// issue, prints both durations and their ratio, then checks that the two
/// helpers leave identical rows behind for an input containing a duplicate.
#[test]
fn bench_issue_assignee_insert_individual_vs_batch() {
    let conn = setup_db();

    // Dedicated issue row (id 2) that every assignee row in this test refers to.
    conn.execute(
        "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at)
         VALUES (2, 20, 1, 43, 'Assignee Issue', 'opened', 1000, 2000, 3000)",
        [],
    )
    .unwrap();

    let iterations = 20_000;

    // Warm up: run both code paths once per set before any timing starts.
    for users in ASSIGNEE_SETS {
        insert_issue_assignees_individual(&conn, 2, users);
        insert_issue_assignees_batch(&conn, 2, users);
    }

    // Benchmark OLD: sets are visited round-robin, `iterations` calls in total.
    let old_timer = Instant::now();
    for users in ASSIGNEE_SETS.iter().cycle().take(iterations) {
        insert_issue_assignees_individual(&conn, 2, users);
    }
    let old_elapsed = old_timer.elapsed();

    // Benchmark NEW: same sets in the same order.
    let new_timer = Instant::now();
    for users in ASSIGNEE_SETS.iter().cycle().take(iterations) {
        insert_issue_assignees_batch(&conn, 2, users);
    }
    let new_elapsed = new_timer.elapsed();

    let speedup = old_elapsed.as_nanos() as f64 / new_elapsed.as_nanos() as f64;

    println!(
        "\n=== Issue Assignee INSERT Benchmark ({} iterations) ===",
        iterations
    );
    println!("Individual INSERTs: {:?}", old_elapsed);
    println!("Batch INSERT: {:?}", new_elapsed);
    println!("Speedup: {:.2}x", speedup);
    println!();

    // Reads back the usernames currently stored for issue 2, sorted by name.
    let stored_usernames = || -> Vec<String> {
        let mut stmt = conn
            .prepare("SELECT username FROM issue_assignees WHERE issue_id = 2 ORDER BY username")
            .unwrap();
        // Bind the result to a local so the row iterator is dropped before `stmt`.
        let names: Vec<String> = stmt
            .query_map([], |row| row.get(0))
            .unwrap()
            .collect::<std::result::Result<Vec<_>, _>>()
            .unwrap();
        names
    };

    // Verify correctness: both approaches produce identical rows.
    let sample = ["alice", "bob", "alice", "carol"];

    insert_issue_assignees_individual(&conn, 2, &sample);
    let old_rows = stored_usernames();

    insert_issue_assignees_batch(&conn, 2, &sample);
    let new_rows = stored_usernames();

    assert_eq!(
        old_rows, new_rows,
        "Both approaches must produce identical rows"
    );
}
|
||||
|
||||
#[test]
|
||||
fn bench_string_building_old_vs_new() {
|
||||
let iterations = 50_000;
|
||||
|
||||
Reference in New Issue
Block a user