perf(ingestion): replace per-row INSERT loops with chunked batch INSERTs

The issue and MR ingestion paths previously inserted labels, assignees,
and reviewers one row at a time inside a transaction. For entities with
many labels or assignees, this issued N separate SQLite statements where
a single multi-row INSERT suffices.

Replace the per-row loops with batch INSERT functions that build a
single `INSERT OR IGNORE ... VALUES (?1,?2),(?1,?3),...` statement per
chunk. Chunks are capped at 400 rows (BATCH_LINK_ROWS_MAX) to stay
comfortably below SQLite's default 999 bind-parameter limit.

Affected paths:
- issues.rs: link_issue_labels_batch_tx, insert_issue_assignees_batch_tx
- merge_requests.rs: insert_mr_labels_batch_tx,
  insert_mr_assignees_batch_tx, insert_mr_reviewers_batch_tx

New tests verify deduplication (OR IGNORE), multi-chunk correctness,
and equivalence with the old per-row approach. A perf benchmark
(bench_issue_assignee_insert_individual_vs_batch) demonstrates the
speedup across representative assignee set sizes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-03-06 11:28:08 -05:00
parent 5fb27b1fbb
commit 9107a78b57
4 changed files with 534 additions and 28 deletions

View File

@@ -54,6 +54,11 @@ fn setup_db() -> Connection {
label_id INTEGER NOT NULL REFERENCES labels(id),
PRIMARY KEY(issue_id, label_id)
);
CREATE TABLE issue_assignees (
issue_id INTEGER NOT NULL REFERENCES issues(id),
username TEXT NOT NULL,
PRIMARY KEY(issue_id, username)
);
CREATE TABLE documents (
id INTEGER PRIMARY KEY,
@@ -145,6 +150,55 @@ fn insert_labels_batch(conn: &Connection, doc_id: i64, labels: &[&str]) {
}
}
/// Simulate OLD ingestion approach: individual INSERT per assignee.
fn insert_issue_assignees_individual(conn: &Connection, issue_id: i64, usernames: &[&str]) {
    // Replace-all semantics: wipe the issue's current assignees first.
    conn.execute("DELETE FROM issue_assignees WHERE issue_id = ?1", [issue_id])
        .unwrap();
    // One statement round-trip per assignee; the composite primary key plus
    // OR IGNORE silently drops duplicate usernames.
    usernames.iter().for_each(|name| {
        conn.execute(
            "INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES (?1, ?2)",
            rusqlite::params![issue_id, name],
        )
        .unwrap();
    });
}
/// Simulate NEW ingestion approach: chunked batch INSERTs.
fn insert_issue_assignees_batch(conn: &Connection, issue_id: i64, usernames: &[&str]) {
    // Replace-all semantics: wipe the issue's current assignees first.
    conn.execute("DELETE FROM issue_assignees WHERE issue_id = ?1", [issue_id])
        .unwrap();
    if usernames.is_empty() {
        return;
    }
    // Cap rows per statement so the bind count (rows + 1 for the shared
    // issue_id) stays well under SQLite's default 999-parameter limit.
    const BATCH_ROWS_MAX: usize = 400;
    for chunk in usernames.chunks(BATCH_ROWS_MAX) {
        // ?1 is the issue_id shared by every row tuple; each username binds
        // to its own slot in ?2..=?(len + 1).
        let value_tuples: Vec<String> = (2..chunk.len() + 2)
            .map(|slot| format!("(?1, ?{})", slot))
            .collect();
        let sql = format!(
            "INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES {}",
            value_tuples.join(", ")
        );
        let mut binds: Vec<&dyn rusqlite::types::ToSql> = Vec::with_capacity(chunk.len() + 1);
        binds.push(&issue_id);
        binds.extend(chunk.iter().map(|name| name as &dyn rusqlite::types::ToSql));
        conn.execute(&sql, binds.as_slice()).unwrap();
    }
}
/// Simulate OLD string building: format! + push_str
fn build_content_old(
iid: i64,
@@ -197,6 +251,22 @@ const LABEL_SETS: &[&[&str]] = &[
],
];
/// Representative assignee sets cycled through by the benchmark: small
/// through medium team sizes (2-7 names). Two of the sets deliberately
/// contain duplicate usernames so the OR IGNORE dedup path is exercised
/// in both the individual and batch insert approaches.
const ASSIGNEE_SETS: &[&[&str]] = &[
    // "alice" and "bob" appear twice — only 5 distinct rows land.
    &["alice", "bob", "carol", "dave", "eve", "alice", "bob"],
    &[
        "frontend1",
        "frontend2",
        "frontend3",
        "frontend4",
        "frontend5",
    ],
    // "ops1" appears twice — only 3 distinct rows land.
    &["ops1", "ops2", "ops3", "ops1"],
    &["writer1", "writer2"],
    &[
        "oncall1", "oncall2", "oncall3", "oncall4", "oncall5", "oncall6",
    ],
];
#[test]
fn bench_label_insert_individual_vs_batch() {
let conn = setup_db();
@@ -273,6 +343,77 @@ fn bench_label_insert_individual_vs_batch() {
);
}
#[test]
fn bench_issue_assignee_insert_individual_vs_batch() {
    let conn = setup_db();
    // Seed the parent issue row that issue_assignees.issue_id references.
    conn.execute(
        "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at)
        VALUES (2, 20, 1, 43, 'Assignee Issue', 'opened', 1000, 2000, 3000)",
        [],
    )
    .unwrap();
    let iterations = 20_000;
    // Warm up: run both paths once per input set so neither timed loop pays
    // first-touch costs (page cache, allocator warm-up) the other avoided.
    for users in ASSIGNEE_SETS {
        insert_issue_assignees_individual(&conn, 2, users);
        insert_issue_assignees_batch(&conn, 2, users);
    }
    // Benchmark OLD: one INSERT statement per assignee.
    let start = Instant::now();
    for i in 0..iterations {
        let users = ASSIGNEE_SETS[i % ASSIGNEE_SETS.len()];
        insert_issue_assignees_individual(&conn, 2, users);
    }
    let old_elapsed = start.elapsed();
    // Benchmark NEW: one multi-row INSERT per chunk.
    let start = Instant::now();
    for i in 0..iterations {
        let users = ASSIGNEE_SETS[i % ASSIGNEE_SETS.len()];
        insert_issue_assignees_batch(&conn, 2, users);
    }
    let new_elapsed = start.elapsed();
    // Timing is informational only (printed, never asserted), so the test
    // stays deterministic on slow or shared CI machines.
    let speedup = old_elapsed.as_nanos() as f64 / new_elapsed.as_nanos() as f64;
    println!(
        "\n=== Issue Assignee INSERT Benchmark ({} iterations) ===",
        iterations
    );
    println!("Individual INSERTs: {:?}", old_elapsed);
    println!("Batch INSERT: {:?}", new_elapsed);
    println!("Speedup: {:.2}x", speedup);
    println!();
    // Verify correctness: both approaches produce identical rows.
    // The input repeats "alice" to confirm OR IGNORE dedup matches too.
    insert_issue_assignees_individual(&conn, 2, &["alice", "bob", "alice", "carol"]);
    let old_rows: Vec<String> = conn
        .prepare("SELECT username FROM issue_assignees WHERE issue_id = 2 ORDER BY username")
        .unwrap()
        .query_map([], |row| row.get(0))
        .unwrap()
        .collect::<std::result::Result<Vec<_>, _>>()
        .unwrap();
    // The batch path begins with a DELETE, so it fully replaces the rows
    // inserted just above rather than accumulating on top of them.
    insert_issue_assignees_batch(&conn, 2, &["alice", "bob", "alice", "carol"]);
    let new_rows: Vec<String> = conn
        .prepare("SELECT username FROM issue_assignees WHERE issue_id = 2 ORDER BY username")
        .unwrap()
        .query_map([], |row| row.get(0))
        .unwrap()
        .collect::<std::result::Result<Vec<_>, _>>()
        .unwrap();
    assert_eq!(
        old_rows, new_rows,
        "Both approaches must produce identical rows"
    );
}
#[test]
fn bench_string_building_old_vs_new() {
let iterations = 50_000;