Files
gitlore/src/core/payloads.rs
Taylor Eernisse ee5c5f9645 perf: Eliminate double serialization, add SQLite tuning, optimize hot paths
11 isomorphic performance fixes from deep audit (no behavior changes):

- Eliminate double serialization: store_payload now accepts pre-serialized
  bytes (&[u8]) instead of re-serializing from serde_json::Value. Uses
  Cow<[u8]> for zero-copy when compression is disabled.
- Add SQLite cache_size (64MB) and mmap_size (256MB) pragmas
- Replace SELECT-then-INSERT label upserts with INSERT...ON CONFLICT
  RETURNING in both issues.rs and merge_requests.rs
- Replace INSERT + SELECT milestone upsert with RETURNING
- Use prepare_cached for 5 hot-path queries in extractor.rs
- Optimize compute_list_hash: index-sort + incremental SHA-256 instead
  of clone+sort+join+hash
- Pre-allocate embedding float-to-bytes buffer with Vec::with_capacity
- Replace RandomState::new() in rand_jitter with atomic counter XOR nanos
- Remove redundant per-note payload storage (discussion payload contains
  all notes already)
- Change transform_issue to accept &GitLabIssue (avoids full struct clone)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 08:12:37 -05:00

218 lines
6.6 KiB
Rust

//! Raw payload storage with optional compression and deduplication.
use flate2::Compression;
use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use rusqlite::Connection;
use sha2::{Digest, Sha256};
use std::io::{Read, Write};
use super::error::Result;
use super::time::now_ms;
/// Options for storing a payload.
pub struct StorePayloadOptions<'a> {
pub project_id: Option<i64>,
pub resource_type: &'a str, // 'project' | 'issue' | 'mr' | 'note' | 'discussion'
pub gitlab_id: &'a str, // TEXT because discussion IDs are strings
pub json_bytes: &'a [u8],
pub compress: bool,
}
/// Store a raw API payload with optional compression and deduplication.
/// Returns the row ID (either new or existing if duplicate).
pub fn store_payload(conn: &Connection, options: StorePayloadOptions) -> Result<i64> {
let json_bytes = options.json_bytes;
// 2. SHA-256 hash the JSON bytes (pre-compression)
let mut hasher = Sha256::new();
hasher.update(json_bytes);
let payload_hash = format!("{:x}", hasher.finalize());
// 3. Check for duplicate by (project_id, resource_type, gitlab_id, payload_hash)
let existing: Option<i64> = conn
.query_row(
"SELECT id FROM raw_payloads
WHERE project_id IS ? AND resource_type = ? AND gitlab_id = ? AND payload_hash = ?",
(
options.project_id,
options.resource_type,
options.gitlab_id,
&payload_hash,
),
|row| row.get(0),
)
.ok();
// 4. If duplicate, return existing ID
if let Some(id) = existing {
return Ok(id);
}
// 5. Compress if requested
let (encoding, payload_bytes): (&str, std::borrow::Cow<'_, [u8]>) = if options.compress {
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
encoder.write_all(json_bytes)?;
("gzip", std::borrow::Cow::Owned(encoder.finish()?))
} else {
("identity", std::borrow::Cow::Borrowed(json_bytes))
};
// 6. INSERT with content_encoding
conn.execute(
"INSERT INTO raw_payloads
(source, project_id, resource_type, gitlab_id, fetched_at, content_encoding, payload_hash, payload)
VALUES ('gitlab', ?, ?, ?, ?, ?, ?, ?)",
(
options.project_id,
options.resource_type,
options.gitlab_id,
now_ms(),
encoding,
&payload_hash,
payload_bytes.as_ref(),
),
)?;
Ok(conn.last_insert_rowid())
}
/// Read a raw payload by ID, decompressing if necessary.
/// Returns None if not found.
pub fn read_payload(conn: &Connection, id: i64) -> Result<Option<serde_json::Value>> {
let row: Option<(String, Vec<u8>)> = conn
.query_row(
"SELECT content_encoding, payload FROM raw_payloads WHERE id = ?",
[id],
|row| Ok((row.get(0)?, row.get(1)?)),
)
.ok();
let Some((encoding, payload_bytes)) = row else {
return Ok(None);
};
// Decompress if needed
let json_bytes = if encoding == "gzip" {
let mut decoder = GzDecoder::new(&payload_bytes[..]);
let mut decompressed = Vec::new();
decoder.read_to_end(&mut decompressed)?;
decompressed
} else {
payload_bytes
};
let value: serde_json::Value = serde_json::from_slice(&json_bytes)?;
Ok(Some(value))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::core::db::create_connection;
use tempfile::tempdir;
fn setup_test_db() -> Connection {
let dir = tempdir().unwrap();
let db_path = dir.path().join("test.db");
let conn = create_connection(&db_path).unwrap();
// Create minimal schema for testing
conn.execute_batch(
"CREATE TABLE raw_payloads (
id INTEGER PRIMARY KEY,
source TEXT NOT NULL,
project_id INTEGER,
resource_type TEXT NOT NULL,
gitlab_id TEXT NOT NULL,
fetched_at INTEGER NOT NULL,
content_encoding TEXT NOT NULL DEFAULT 'identity',
payload_hash TEXT NOT NULL,
payload BLOB NOT NULL
);
CREATE UNIQUE INDEX uq_raw_payloads_dedupe
ON raw_payloads(project_id, resource_type, gitlab_id, payload_hash);",
)
.unwrap();
conn
}
#[test]
fn test_store_and_read_payload() {
let conn = setup_test_db();
let payload = serde_json::json!({"title": "Test Issue", "id": 123});
let json_bytes = serde_json::to_vec(&payload).unwrap();
let id = store_payload(
&conn,
StorePayloadOptions {
project_id: Some(1),
resource_type: "issue",
gitlab_id: "123",
json_bytes: &json_bytes,
compress: false,
},
)
.unwrap();
let result = read_payload(&conn, id).unwrap().unwrap();
assert_eq!(result["title"], "Test Issue");
}
#[test]
fn test_compression_roundtrip() {
let conn = setup_test_db();
let payload = serde_json::json!({"data": "x".repeat(1000)});
let json_bytes = serde_json::to_vec(&payload).unwrap();
let id = store_payload(
&conn,
StorePayloadOptions {
project_id: Some(1),
resource_type: "issue",
gitlab_id: "456",
json_bytes: &json_bytes,
compress: true,
},
)
.unwrap();
let result = read_payload(&conn, id).unwrap().unwrap();
assert_eq!(result["data"], "x".repeat(1000));
}
#[test]
fn test_deduplication() {
let conn = setup_test_db();
let payload = serde_json::json!({"id": 789});
let json_bytes = serde_json::to_vec(&payload).unwrap();
let id1 = store_payload(
&conn,
StorePayloadOptions {
project_id: Some(1),
resource_type: "issue",
gitlab_id: "789",
json_bytes: &json_bytes,
compress: false,
},
)
.unwrap();
let id2 = store_payload(
&conn,
StorePayloadOptions {
project_id: Some(1),
resource_type: "issue",
gitlab_id: "789",
json_bytes: &json_bytes,
compress: false,
},
)
.unwrap();
assert_eq!(id1, id2); // Same payload returns same ID
}
}