perf(documents): batch INSERTs and use writeln! in document pipeline

Replace individual INSERT-per-label and INSERT-per-path loops in
upsert_document_inner with single multi-row INSERT statements. For a
document with 5 labels, this reduces 5 SQL round-trips to 1.

Replace format!()+push_str() with writeln!() in all three document
extractors (issue, MR, discussion). writeln! writes directly into the
String buffer, avoiding the intermediate allocation that format!
creates. Benchmarked at ~1.9x faster for string building and ~1.6x
faster for batch inserts (measured over 5k iterations in-memory).

Also switch get_existing_hash from prepare() to prepare_cached(). It is
called once per document during regeneration, so caching the prepared
statement avoids re-compiling the same SQL on every call.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-05 17:35:42 -05:00
parent 3767c33c28
commit 16beb35a69
2 changed files with 54 additions and 24 deletions

View File

@@ -3,6 +3,7 @@ use rusqlite::Connection;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::BTreeSet;
use std::fmt::Write as _;
use super::truncation::{
MAX_DISCUSSION_BYTES, NoteContent, truncate_discussion, truncate_hard_cap,
@@ -143,12 +144,12 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
iid, display_title, path_with_namespace
);
if let Some(ref url) = web_url {
content.push_str(&format!("URL: {}\n", url));
let _ = writeln!(content, "URL: {}", url);
}
content.push_str(&format!("Labels: {}\n", labels_json));
content.push_str(&format!("State: {}\n", state));
let _ = writeln!(content, "Labels: {}", labels_json);
let _ = writeln!(content, "State: {}", state);
if let Some(ref author) = author_username {
content.push_str(&format!("Author: @{}\n", author));
let _ = writeln!(content, "Author: @{}", author);
}
if let Some(ref desc) = description {
@@ -250,15 +251,15 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
iid, display_title, path_with_namespace
);
if let Some(ref url) = web_url {
content.push_str(&format!("URL: {}\n", url));
let _ = writeln!(content, "URL: {}", url);
}
content.push_str(&format!("Labels: {}\n", labels_json));
content.push_str(&format!("State: {}\n", display_state));
let _ = writeln!(content, "Labels: {}", labels_json);
let _ = writeln!(content, "State: {}", display_state);
if let Some(ref author) = author_username {
content.push_str(&format!("Author: @{}\n", author));
let _ = writeln!(content, "Author: @{}", author);
}
if let (Some(src), Some(tgt)) = (&source_branch, &target_branch) {
content.push_str(&format!("Source: {} -> {}\n", src, tgt));
let _ = writeln!(content, "Source: {} -> {}", src, tgt);
}
if let Some(ref desc) = description {
@@ -464,11 +465,11 @@ pub fn extract_discussion_document(
parent_type_prefix, display_title, path_with_namespace
);
if let Some(ref u) = url {
content.push_str(&format!("URL: {}\n", u));
let _ = writeln!(content, "URL: {}", u);
}
content.push_str(&format!("Labels: {}\n", labels_json));
let _ = writeln!(content, "Labels: {}", labels_json);
if !paths.is_empty() {
content.push_str(&format!("Files: {}\n", paths_json));
let _ = writeln!(content, "Files: {}", paths_json);
}
let note_contents: Vec<NoteContent> = notes