//! Performance benchmarks for optimization verification. //! Run with: cargo test --test perf_benchmark -- --nocapture use rusqlite::Connection; use std::time::Instant; fn setup_db() -> Connection { let conn = Connection::open_in_memory().unwrap(); conn.execute_batch( " PRAGMA journal_mode = WAL; PRAGMA synchronous = NORMAL; CREATE TABLE projects ( id INTEGER PRIMARY KEY, gitlab_project_id INTEGER UNIQUE NOT NULL, path_with_namespace TEXT NOT NULL, default_branch TEXT, web_url TEXT, created_at INTEGER, updated_at INTEGER, raw_payload_id INTEGER ); INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project'); CREATE TABLE issues ( id INTEGER PRIMARY KEY, gitlab_id INTEGER UNIQUE NOT NULL, project_id INTEGER NOT NULL REFERENCES projects(id), iid INTEGER NOT NULL, title TEXT, description TEXT, state TEXT NOT NULL, author_username TEXT, created_at INTEGER NOT NULL, updated_at INTEGER NOT NULL, last_seen_at INTEGER NOT NULL, discussions_synced_for_updated_at INTEGER, resource_events_synced_for_updated_at INTEGER, web_url TEXT, raw_payload_id INTEGER ); CREATE TABLE labels ( id INTEGER PRIMARY KEY, gitlab_id INTEGER, project_id INTEGER NOT NULL REFERENCES projects(id), name TEXT NOT NULL, color TEXT, description TEXT ); CREATE TABLE issue_labels ( issue_id INTEGER NOT NULL REFERENCES issues(id), label_id INTEGER NOT NULL REFERENCES labels(id), PRIMARY KEY(issue_id, label_id) ); CREATE TABLE documents ( id INTEGER PRIMARY KEY, source_type TEXT NOT NULL, source_id INTEGER NOT NULL, project_id INTEGER NOT NULL, author_username TEXT, label_names TEXT, created_at INTEGER, updated_at INTEGER, url TEXT, title TEXT, content_text TEXT NOT NULL, content_hash TEXT NOT NULL, labels_hash TEXT NOT NULL DEFAULT '', paths_hash TEXT NOT NULL DEFAULT '', is_truncated INTEGER NOT NULL DEFAULT 0, truncated_reason TEXT, UNIQUE(source_type, source_id) ); CREATE TABLE document_labels ( document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE 
CASCADE, label_name TEXT NOT NULL, PRIMARY KEY(document_id, label_name) ); CREATE TABLE document_paths ( document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE, path TEXT NOT NULL, PRIMARY KEY(document_id, path) ); CREATE TABLE dirty_sources ( source_type TEXT NOT NULL, source_id INTEGER NOT NULL, queued_at INTEGER NOT NULL, attempt_count INTEGER NOT NULL DEFAULT 0, last_attempt_at INTEGER, last_error TEXT, next_attempt_at INTEGER, PRIMARY KEY(source_type, source_id) ); CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at); ", ) .unwrap(); conn } /// Simulate the OLD approach: individual INSERT per label fn insert_labels_individual(conn: &Connection, doc_id: i64, labels: &[&str]) { conn.execute( "DELETE FROM document_labels WHERE document_id = ?1", [doc_id], ) .unwrap(); for label in labels { conn.execute( "INSERT INTO document_labels (document_id, label_name) VALUES (?1, ?2)", rusqlite::params![doc_id, label], ) .unwrap(); } } /// Simulate the NEW approach: batch INSERT fn insert_labels_batch(conn: &Connection, doc_id: i64, labels: &[&str]) { conn.execute( "DELETE FROM document_labels WHERE document_id = ?1", [doc_id], ) .unwrap(); if !labels.is_empty() { let placeholders: String = labels .iter() .enumerate() .map(|(i, _)| format!("(?1, ?{})", i + 2)) .collect::>() .join(", "); let sql = format!( "INSERT INTO document_labels (document_id, label_name) VALUES {}", placeholders ); let mut params: Vec> = vec![Box::new(doc_id)]; for label in labels { params.push(Box::new(*label)); } let param_refs: Vec<&dyn rusqlite::types::ToSql> = params.iter().map(|p| p.as_ref()).collect(); conn.execute(&sql, param_refs.as_slice()).unwrap(); } } /// Simulate OLD string building: format! 
+ push_str fn build_content_old( iid: i64, title: &str, project: &str, labels: &str, state: &str, author: &str, url: &str, ) -> String { let mut content = format!("[[Issue]] #{}: {}\nProject: {}\n", iid, title, project); content.push_str(&format!("URL: {}\n", url)); content.push_str(&format!("Labels: {}\n", labels)); content.push_str(&format!("State: {}\n", state)); content.push_str(&format!("Author: @{}\n", author)); content } /// Simulate NEW string building: writeln! directly fn build_content_new( iid: i64, title: &str, project: &str, labels: &str, state: &str, author: &str, url: &str, ) -> String { use std::fmt::Write as _; let mut content = format!("[[Issue]] #{}: {}\nProject: {}\n", iid, title, project); let _ = writeln!(content, "URL: {}", url); let _ = writeln!(content, "Labels: {}", labels); let _ = writeln!(content, "State: {}", state); let _ = writeln!(content, "Author: @{}", author); content } const LABEL_SETS: &[&[&str]] = &[ &["bug", "critical", "backend", "needs-review", "p1"], &["feature", "frontend", "design", "ux"], &["bug", "database", "performance"], &["docs", "api"], &[ "infrastructure", "ci-cd", "devops", "monitoring", "alerting", "sre", ], ]; #[test] fn bench_label_insert_individual_vs_batch() { let conn = setup_db(); // Create a document to attach labels to conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test Issue', 'opened', 1000, 2000, 3000)", [], ).unwrap(); conn.execute( "INSERT INTO documents (id, source_type, source_id, project_id, content_text, content_hash, labels_hash, paths_hash) VALUES (1, 'issue', 1, 1, 'test content', 'hash1', 'lhash1', 'phash1')", [], ).unwrap(); let iterations = 5000; // Warm up for labels in LABEL_SETS { insert_labels_individual(&conn, 1, labels); insert_labels_batch(&conn, 1, labels); } // Benchmark INDIVIDUAL inserts let start = Instant::now(); for i in 0..iterations { let labels = LABEL_SETS[i % 
LABEL_SETS.len()]; insert_labels_individual(&conn, 1, labels); } let individual_elapsed = start.elapsed(); // Benchmark BATCH inserts let start = Instant::now(); for i in 0..iterations { let labels = LABEL_SETS[i % LABEL_SETS.len()]; insert_labels_batch(&conn, 1, labels); } let batch_elapsed = start.elapsed(); let speedup = individual_elapsed.as_nanos() as f64 / batch_elapsed.as_nanos() as f64; println!( "\n=== Label INSERT Benchmark ({} iterations) ===", iterations ); println!("Individual INSERTs: {:?}", individual_elapsed); println!("Batch INSERT: {:?}", batch_elapsed); println!("Speedup: {:.2}x", speedup); println!(); // Verify correctness: both approaches produce same result insert_labels_individual(&conn, 1, &["a", "b", "c"]); let individual_labels: Vec = conn .prepare("SELECT label_name FROM document_labels WHERE document_id = 1 ORDER BY label_name") .unwrap() .query_map([], |row| row.get(0)) .unwrap() .collect::, _>>() .unwrap(); insert_labels_batch(&conn, 1, &["a", "b", "c"]); let batch_labels: Vec = conn .prepare("SELECT label_name FROM document_labels WHERE document_id = 1 ORDER BY label_name") .unwrap() .query_map([], |row| row.get(0)) .unwrap() .collect::, _>>() .unwrap(); assert_eq!( individual_labels, batch_labels, "Both approaches must produce identical results" ); } #[test] fn bench_string_building_old_vs_new() { let iterations = 50_000; // Warm up for _ in 0..100 { let _ = build_content_old( 42, "Fix authentication bug in login flow", "mygroup/myproject", "[\"bug\",\"auth\",\"critical\"]", "opened", "alice", "https://gitlab.example.com/mygroup/myproject/-/issues/42", ); let _ = build_content_new( 42, "Fix authentication bug in login flow", "mygroup/myproject", "[\"bug\",\"auth\",\"critical\"]", "opened", "alice", "https://gitlab.example.com/mygroup/myproject/-/issues/42", ); } // Benchmark OLD let start = Instant::now(); for i in 0..iterations { let s = build_content_old( i as i64, "Fix authentication bug in login flow with extended description", 
"mygroup/myproject", "[\"bug\",\"auth\",\"critical\",\"backend\",\"needs-review\"]", "opened", "alice", "https://gitlab.example.com/mygroup/myproject/-/issues/42", ); std::hint::black_box(s); } let old_elapsed = start.elapsed(); // Benchmark NEW let start = Instant::now(); for i in 0..iterations { let s = build_content_new( i as i64, "Fix authentication bug in login flow with extended description", "mygroup/myproject", "[\"bug\",\"auth\",\"critical\",\"backend\",\"needs-review\"]", "opened", "alice", "https://gitlab.example.com/mygroup/myproject/-/issues/42", ); std::hint::black_box(s); } let new_elapsed = start.elapsed(); let speedup = old_elapsed.as_nanos() as f64 / new_elapsed.as_nanos() as f64; println!( "\n=== String Building Benchmark ({} iterations) ===", iterations ); println!("format!+push_str: {:?}", old_elapsed); println!("writeln!: {:?}", new_elapsed); println!("Speedup: {:.2}x", speedup); println!(); // Verify correctness: both produce identical output let old = build_content_old( 42, "Test", "group/proj", "[\"bug\"]", "opened", "alice", "https://example.com", ); let new = build_content_new( 42, "Test", "group/proj", "[\"bug\"]", "opened", "alice", "https://example.com", ); assert_eq!(old, new, "Both approaches must produce identical strings"); } #[test] fn bench_prepare_vs_prepare_cached() { let conn = setup_db(); // Seed some documents for i in 1..=100 { conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (?1, ?2, 1, ?1, 'Test', 'opened', 1000, 2000, 3000)", rusqlite::params![i, i * 10], ).unwrap(); conn.execute( "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash, labels_hash, paths_hash) VALUES ('issue', ?1, 1, 'content', ?2, 'lh', 'ph')", rusqlite::params![i, format!("hash_{}", i)], ).unwrap(); } let iterations = 10_000; let sql = "SELECT content_hash FROM documents WHERE source_type = ?1 AND source_id = ?2"; // Benchmark prepare (uncached) 
let start = Instant::now(); for i in 0..iterations { let source_id = (i % 100) + 1; let mut stmt = conn.prepare(sql).unwrap(); let _hash: Option = stmt .query_row(rusqlite::params!["issue", source_id as i64], |row| { row.get(0) }) .ok(); } let uncached_elapsed = start.elapsed(); // Benchmark prepare_cached let start = Instant::now(); for i in 0..iterations { let source_id = (i % 100) + 1; let mut stmt = conn.prepare_cached(sql).unwrap(); let _hash: Option = stmt .query_row(rusqlite::params!["issue", source_id as i64], |row| { row.get(0) }) .ok(); } let cached_elapsed = start.elapsed(); let speedup = uncached_elapsed.as_nanos() as f64 / cached_elapsed.as_nanos() as f64; println!( "\n=== prepare vs prepare_cached Benchmark ({} iterations) ===", iterations ); println!("prepare(): {:?}", uncached_elapsed); println!("prepare_cached(): {:?}", cached_elapsed); println!("Speedup: {:.2}x", speedup); println!(); } /// Benchmark: redundant hash query elimination in document regeneration. /// OLD: get_existing_hash (1 query) + upsert_document_inner (1 query) = 2 queries per doc /// NEW: upsert_document_inner only (1 query) = 1 query per doc #[test] fn bench_redundant_hash_query_elimination() { let conn = setup_db(); // Seed documents for i in 1..=100 { conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (?1, ?2, 1, ?1, 'Test', 'opened', 1000, 2000, 3000)", rusqlite::params![i, i * 10], ).unwrap(); conn.execute( "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash, labels_hash, paths_hash) VALUES ('issue', ?1, 1, 'content', ?2, 'lh', 'ph')", rusqlite::params![i, format!("hash_{}", i)], ).unwrap(); } let iterations = 10_000; let hash_sql = "SELECT content_hash FROM documents WHERE source_type = ?1 AND source_id = ?2"; let full_sql = "SELECT id, content_hash, labels_hash, paths_hash FROM documents WHERE source_type = ?1 AND source_id = ?2"; // OLD: 2 queries per document 
(get_existing_hash + upsert_document_inner) let start = Instant::now(); for i in 0..iterations { let source_id = (i % 100) + 1; // Query 1: get_existing_hash let mut stmt1 = conn.prepare_cached(hash_sql).unwrap(); let _hash: Option = stmt1 .query_row(rusqlite::params!["issue", source_id as i64], |row| { row.get(0) }) .ok(); // Query 2: upsert_document_inner let mut stmt2 = conn.prepare_cached(full_sql).unwrap(); let _existing: Option<(i64, String, String, String)> = stmt2 .query_row(rusqlite::params!["issue", source_id as i64], |row| { Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) }) .ok(); std::hint::black_box((_hash, _existing)); } let old_elapsed = start.elapsed(); // NEW: 1 query per document (upsert_document_inner returns change info) let start = Instant::now(); for i in 0..iterations { let source_id = (i % 100) + 1; // Single query that provides both change detection and upsert data let mut stmt = conn.prepare_cached(full_sql).unwrap(); let existing: Option<(i64, String, String, String)> = stmt .query_row(rusqlite::params!["issue", source_id as i64], |row| { Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) }) .ok(); let _changed = match &existing { Some((_, old_hash, _, _)) => old_hash != &format!("hash_{}", source_id), None => true, }; std::hint::black_box((existing, _changed)); } let new_elapsed = start.elapsed(); let speedup = old_elapsed.as_nanos() as f64 / new_elapsed.as_nanos() as f64; println!( "\n=== Redundant Hash Query Elimination ({} iterations) ===", iterations ); println!("OLD (2 queries): {:?}", old_elapsed); println!("NEW (1 query): {:?}", new_elapsed); println!("Speedup: {:.2}x", speedup); println!(); } // NOTE: SHA256 hex formatting (format!("{:x}") vs LUT) was benchmarked at 1.01x. // The SHA256 computation dominates; hex encoding is negligible. Optimization reverted. // NOTE: compute_list_hash indirect index sort vs direct &str sort was benchmarked at 1.02x. // SHA256 dominates here too; the sort strategy is negligible. 
// Optimization reverted.

/// Serializes `embedding` as consecutive little-endian `f32` bytes into a
/// freshly allocated vector (4 bytes per element).
fn to_bytes_alloc(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * 4);
    for f in embedding {
        bytes.extend_from_slice(&f.to_le_bytes());
    }
    bytes
}

/// Serializes `embedding` as consecutive little-endian `f32` bytes into `buf`.
///
/// `buf` is cleared first, so its previous contents are discarded while its
/// existing capacity is kept for reuse.
fn to_bytes_reuse(embedding: &[f32], buf: &mut Vec<u8>) {
    buf.clear();
    buf.reserve(embedding.len() * 4);
    for f in embedding {
        buf.extend_from_slice(&f.to_le_bytes());
    }
}

/// Benchmark: f32-to-bytes conversion - allocate-per-call vs reusable buffer.
/// The embedding pipeline converts 768 f32s to 3072 bytes per chunk stored.
#[test]
fn bench_embedding_bytes_alloc_vs_reuse() {
    // Simulate 768-dim embeddings (nomic-embed-text)
    let dims = 768;
    let embeddings: Vec<Vec<f32>> = (0..100)
        .map(|i| (0..dims).map(|j| (i * dims + j) as f32 * 0.001).collect())
        .collect();
    let iterations = 50_000;

    // Warm up
    let mut buf = Vec::with_capacity(dims * 4);
    for emb in &embeddings {
        let _ = to_bytes_alloc(emb);
        to_bytes_reuse(emb, &mut buf);
    }

    // Benchmark OLD: allocate per call
    let start = Instant::now();
    for i in 0..iterations {
        let emb = &embeddings[i % embeddings.len()];
        let bytes = to_bytes_alloc(emb);
        std::hint::black_box(&bytes);
    }
    let old_elapsed = start.elapsed();

    // Benchmark NEW: reusable buffer
    let start = Instant::now();
    // A fresh buffer is created inside the timed region so the one allocation
    // the reuse strategy needs is part of the measurement.
    let mut buf = Vec::with_capacity(dims * 4);
    for i in 0..iterations {
        let emb = &embeddings[i % embeddings.len()];
        to_bytes_reuse(emb, &mut buf);
        std::hint::black_box(&buf);
    }
    let new_elapsed = start.elapsed();

    let speedup = old_elapsed.as_secs_f64() / new_elapsed.as_secs_f64();

    println!(
        "\n=== Embedding Bytes Conversion Benchmark ({} iterations, {} dims) ===",
        iterations, dims
    );
    println!("Alloc per call: {:?}", old_elapsed);
    println!("Reusable buffer: {:?}", new_elapsed);
    println!("Speedup: {:.2}x", speedup);
    println!();

    // Verify correctness
    let test_emb: Vec<f32> = (0..dims).map(|i| i as f32 * 0.1).collect();
    let alloc_result = to_bytes_alloc(&test_emb);
    to_bytes_reuse(&test_emb, &mut buf);
    assert_eq!(
        alloc_result, buf,
        "Both approaches must produce identical bytes"
    );
}