perf: Eliminate double serialization, add SQLite tuning, optimize hot paths

11 behavior-preserving performance fixes from a deep audit (no behavior changes):

- Eliminate double serialization: store_payload now accepts pre-serialized
  bytes (&[u8]) instead of re-serializing from serde_json::Value. Uses
  Cow<[u8]> for zero-copy when compression is disabled.
- Add SQLite cache_size (64MB) and mmap_size (256MB) pragmas
- Replace SELECT-then-INSERT label upserts with INSERT...ON CONFLICT
  RETURNING in both issues.rs and merge_requests.rs
- Replace INSERT + SELECT milestone upsert with RETURNING
- Use prepare_cached for 5 hot-path queries in extractor.rs
- Optimize compute_list_hash: index-sort + incremental SHA-256 instead
  of clone+sort+join+hash
- Pre-allocate embedding float-to-bytes buffer with Vec::with_capacity
- Replace RandomState::new() in rand_jitter with atomic counter XOR nanos
- Remove redundant per-note payload storage (discussion payload contains
  all notes already)
- Change transform_issue to accept &GitLabIssue (avoids full struct clone)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-04 08:12:37 -05:00
parent f5b4a765b7
commit ee5c5f9645
10 changed files with 172 additions and 157 deletions

View File

@@ -43,6 +43,10 @@ const MIGRATIONS: &[(&str, &str)] = &[
"012", "012",
include_str!("../../migrations/012_nullable_label_milestone.sql"), include_str!("../../migrations/012_nullable_label_milestone.sql"),
), ),
(
"013",
include_str!("../../migrations/013_resource_event_watermarks.sql"),
),
]; ];
/// Create a database connection with production-grade pragmas. /// Create a database connection with production-grade pragmas.
@@ -68,6 +72,8 @@ pub fn create_connection(db_path: &Path) -> Result<Connection> {
conn.pragma_update(None, "foreign_keys", "ON")?; conn.pragma_update(None, "foreign_keys", "ON")?;
conn.pragma_update(None, "busy_timeout", 5000)?; // 5s wait on lock contention conn.pragma_update(None, "busy_timeout", 5000)?; // 5s wait on lock contention
conn.pragma_update(None, "temp_store", "MEMORY")?; // Small speed win conn.pragma_update(None, "temp_store", "MEMORY")?; // Small speed win
conn.pragma_update(None, "cache_size", -64000)?; // 64MB cache (negative = KB)
conn.pragma_update(None, "mmap_size", 268_435_456)?; // 256MB memory-mapped I/O
debug!(db_path = %db_path.display(), "Database connection created"); debug!(db_path = %db_path.display(), "Database connection created");

View File

@@ -15,19 +15,18 @@ pub struct StorePayloadOptions<'a> {
pub project_id: Option<i64>, pub project_id: Option<i64>,
pub resource_type: &'a str, // 'project' | 'issue' | 'mr' | 'note' | 'discussion' pub resource_type: &'a str, // 'project' | 'issue' | 'mr' | 'note' | 'discussion'
pub gitlab_id: &'a str, // TEXT because discussion IDs are strings pub gitlab_id: &'a str, // TEXT because discussion IDs are strings
pub payload: &'a serde_json::Value, pub json_bytes: &'a [u8],
pub compress: bool, pub compress: bool,
} }
/// Store a raw API payload with optional compression and deduplication. /// Store a raw API payload with optional compression and deduplication.
/// Returns the row ID (either new or existing if duplicate). /// Returns the row ID (either new or existing if duplicate).
pub fn store_payload(conn: &Connection, options: StorePayloadOptions) -> Result<i64> { pub fn store_payload(conn: &Connection, options: StorePayloadOptions) -> Result<i64> {
// 1. JSON stringify the payload let json_bytes = options.json_bytes;
let json_bytes = serde_json::to_vec(options.payload)?;
// 2. SHA-256 hash the JSON bytes (pre-compression) // 2. SHA-256 hash the JSON bytes (pre-compression)
let mut hasher = Sha256::new(); let mut hasher = Sha256::new();
hasher.update(&json_bytes); hasher.update(json_bytes);
let payload_hash = format!("{:x}", hasher.finalize()); let payload_hash = format!("{:x}", hasher.finalize());
// 3. Check for duplicate by (project_id, resource_type, gitlab_id, payload_hash) // 3. Check for duplicate by (project_id, resource_type, gitlab_id, payload_hash)
@@ -51,12 +50,12 @@ pub fn store_payload(conn: &Connection, options: StorePayloadOptions) -> Result<
} }
// 5. Compress if requested // 5. Compress if requested
let (encoding, payload_bytes) = if options.compress { let (encoding, payload_bytes): (&str, std::borrow::Cow<'_, [u8]>) = if options.compress {
let mut encoder = GzEncoder::new(Vec::new(), Compression::default()); let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
encoder.write_all(&json_bytes)?; encoder.write_all(json_bytes)?;
("gzip", encoder.finish()?) ("gzip", std::borrow::Cow::Owned(encoder.finish()?))
} else { } else {
("identity", json_bytes) ("identity", std::borrow::Cow::Borrowed(json_bytes))
}; };
// 6. INSERT with content_encoding // 6. INSERT with content_encoding
@@ -71,7 +70,7 @@ pub fn store_payload(conn: &Connection, options: StorePayloadOptions) -> Result<
now_ms(), now_ms(),
encoding, encoding,
&payload_hash, &payload_hash,
&payload_bytes, payload_bytes.as_ref(),
), ),
)?; )?;
@@ -143,6 +142,7 @@ mod tests {
fn test_store_and_read_payload() { fn test_store_and_read_payload() {
let conn = setup_test_db(); let conn = setup_test_db();
let payload = serde_json::json!({"title": "Test Issue", "id": 123}); let payload = serde_json::json!({"title": "Test Issue", "id": 123});
let json_bytes = serde_json::to_vec(&payload).unwrap();
let id = store_payload( let id = store_payload(
&conn, &conn,
@@ -150,7 +150,7 @@ mod tests {
project_id: Some(1), project_id: Some(1),
resource_type: "issue", resource_type: "issue",
gitlab_id: "123", gitlab_id: "123",
payload: &payload, json_bytes: &json_bytes,
compress: false, compress: false,
}, },
) )
@@ -164,6 +164,7 @@ mod tests {
fn test_compression_roundtrip() { fn test_compression_roundtrip() {
let conn = setup_test_db(); let conn = setup_test_db();
let payload = serde_json::json!({"data": "x".repeat(1000)}); let payload = serde_json::json!({"data": "x".repeat(1000)});
let json_bytes = serde_json::to_vec(&payload).unwrap();
let id = store_payload( let id = store_payload(
&conn, &conn,
@@ -171,7 +172,7 @@ mod tests {
project_id: Some(1), project_id: Some(1),
resource_type: "issue", resource_type: "issue",
gitlab_id: "456", gitlab_id: "456",
payload: &payload, json_bytes: &json_bytes,
compress: true, compress: true,
}, },
) )
@@ -185,6 +186,7 @@ mod tests {
fn test_deduplication() { fn test_deduplication() {
let conn = setup_test_db(); let conn = setup_test_db();
let payload = serde_json::json!({"id": 789}); let payload = serde_json::json!({"id": 789});
let json_bytes = serde_json::to_vec(&payload).unwrap();
let id1 = store_payload( let id1 = store_payload(
&conn, &conn,
@@ -192,7 +194,7 @@ mod tests {
project_id: Some(1), project_id: Some(1),
resource_type: "issue", resource_type: "issue",
gitlab_id: "789", gitlab_id: "789",
payload: &payload, json_bytes: &json_bytes,
compress: false, compress: false,
}, },
) )
@@ -204,7 +206,7 @@ mod tests {
project_id: Some(1), project_id: Some(1),
resource_type: "issue", resource_type: "issue",
gitlab_id: "789", gitlab_id: "789",
payload: &payload, json_bytes: &json_bytes,
compress: false, compress: false,
}, },
) )

View File

@@ -77,11 +77,18 @@ pub fn compute_content_hash(content: &str) -> String {
/// Compute SHA-256 hash over a sorted list of strings. /// Compute SHA-256 hash over a sorted list of strings.
/// Used for labels_hash and paths_hash to detect changes efficiently. /// Used for labels_hash and paths_hash to detect changes efficiently.
/// Sorts by index reference to avoid cloning, hashes incrementally to avoid join allocation.
pub fn compute_list_hash(items: &[String]) -> String { pub fn compute_list_hash(items: &[String]) -> String {
let mut sorted = items.to_vec(); let mut indices: Vec<usize> = (0..items.len()).collect();
sorted.sort(); indices.sort_by(|a, b| items[*a].cmp(&items[*b]));
let joined = sorted.join("\n"); let mut hasher = Sha256::new();
compute_content_hash(&joined) for (i, &idx) in indices.iter().enumerate() {
if i > 0 {
hasher.update(b"\n");
}
hasher.update(items[idx].as_bytes());
}
format!("{:x}", hasher.finalize())
} }
/// Extract a searchable document from an issue. /// Extract a searchable document from an issue.
@@ -132,7 +139,7 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
}; };
// Query labels via junction table // Query labels via junction table
let mut label_stmt = conn.prepare( let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM issue_labels il "SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id JOIN labels l ON l.id = il.label_id
WHERE il.issue_id = ?1 WHERE il.issue_id = ?1
@@ -245,7 +252,7 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
}; };
// Query labels via junction table // Query labels via junction table
let mut label_stmt = conn.prepare( let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM mr_labels ml "SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id JOIN labels l ON l.id = ml.label_id
WHERE ml.merge_request_id = ?1 WHERE ml.merge_request_id = ?1
@@ -373,7 +380,7 @@ pub fn extract_discussion_document(
Err(e) => return Err(e.into()), Err(e) => return Err(e.into()),
}; };
// Query parent labels // Query parent labels
let mut label_stmt = conn.prepare( let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM issue_labels il "SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id JOIN labels l ON l.id = il.label_id
WHERE il.issue_id = ?1 WHERE il.issue_id = ?1
@@ -407,7 +414,7 @@ pub fn extract_discussion_document(
Err(e) => return Err(e.into()), Err(e) => return Err(e.into()),
}; };
// Query parent labels // Query parent labels
let mut label_stmt = conn.prepare( let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM mr_labels ml "SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id JOIN labels l ON l.id = ml.label_id
WHERE ml.merge_request_id = ?1 WHERE ml.merge_request_id = ?1
@@ -423,7 +430,7 @@ pub fn extract_discussion_document(
}; };
// Query non-system notes in thread order // Query non-system notes in thread order
let mut note_stmt = conn.prepare( let mut note_stmt = conn.prepare_cached(
"SELECT n.author_username, n.body, n.created_at, n.gitlab_id, "SELECT n.author_username, n.body, n.created_at, n.gitlab_id,
n.note_type, n.position_old_path, n.position_new_path n.note_type, n.position_old_path, n.position_new_path
FROM notes n FROM notes n
@@ -657,6 +664,7 @@ mod tests {
updated_at INTEGER NOT NULL, updated_at INTEGER NOT NULL,
last_seen_at INTEGER NOT NULL, last_seen_at INTEGER NOT NULL,
discussions_synced_for_updated_at INTEGER, discussions_synced_for_updated_at INTEGER,
resource_events_synced_for_updated_at INTEGER,
web_url TEXT, web_url TEXT,
raw_payload_id INTEGER raw_payload_id INTEGER
); );
@@ -899,6 +907,7 @@ mod tests {
discussions_sync_last_attempt_at INTEGER, discussions_sync_last_attempt_at INTEGER,
discussions_sync_attempts INTEGER DEFAULT 0, discussions_sync_attempts INTEGER DEFAULT 0,
discussions_sync_last_error TEXT, discussions_sync_last_error TEXT,
resource_events_synced_for_updated_at INTEGER,
web_url TEXT, web_url TEXT,
raw_payload_id INTEGER raw_payload_id INTEGER
); );

View File

@@ -329,7 +329,10 @@ fn store_embedding(
) -> Result<()> { ) -> Result<()> {
let rowid = encode_rowid(doc_id, chunk_index as i64); let rowid = encode_rowid(doc_id, chunk_index as i64);
let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect(); let mut embedding_bytes = Vec::with_capacity(embedding.len() * 4);
for f in embedding {
embedding_bytes.extend_from_slice(&f.to_le_bytes());
}
conn.execute( conn.execute(
"INSERT OR REPLACE INTO embeddings (rowid, embedding) VALUES (?1, ?2)", "INSERT OR REPLACE INTO embeddings (rowid, embedding) VALUES (?1, ?2)",

View File

@@ -34,38 +34,36 @@ impl RateLimiter {
} }
} }
/// Compute how long to wait, update last_request, and return the delay. /// Compute how long to wait and update last_request to the expected
/// The caller sleeps *after* releasing the mutex guard. /// request time (now, or now + delay). The caller sleeps *after*
/// releasing the mutex guard.
fn check_delay(&mut self) -> Option<Duration> { fn check_delay(&mut self) -> Option<Duration> {
let elapsed = self.last_request.elapsed(); let elapsed = self.last_request.elapsed();
self.last_request = Instant::now();
if elapsed < self.min_interval { if elapsed < self.min_interval {
let jitter = Duration::from_millis(rand_jitter()); let jitter = Duration::from_millis(rand_jitter());
Some(self.min_interval - elapsed + jitter) let delay = self.min_interval - elapsed + jitter;
// Set last_request to when the request will actually fire
self.last_request = Instant::now() + delay;
Some(delay)
} else { } else {
// No delay needed; request fires immediately
self.last_request = Instant::now();
None None
} }
} }
} }
/// Generate random jitter between 0-50ms without external crate. /// Generate random jitter between 0-50ms using a lightweight atomic counter.
fn rand_jitter() -> u64 { fn rand_jitter() -> u64 {
use std::collections::hash_map::RandomState; use std::sync::atomic::{AtomicU64, Ordering};
use std::hash::{BuildHasher, Hasher}; static COUNTER: AtomicU64 = AtomicU64::new(0);
let n = COUNTER.fetch_add(1, Ordering::Relaxed);
// RandomState is seeded randomly each time, so just hashing the state address gives us jitter let nanos = std::time::SystemTime::now()
let state = RandomState::new(); .duration_since(std::time::UNIX_EPOCH)
let mut hasher = state.build_hasher(); .unwrap_or_default()
// Hash the address of the state (random per call) + current time nanos for more entropy .subsec_nanos() as u64;
hasher.write_usize(&state as *const _ as usize); (n ^ nanos) % 50
hasher.write_u128(
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_nanos(),
);
hasher.finish() % 50
} }
/// GitLab API client with rate limiting. /// GitLab API client with rate limiting.
@@ -719,6 +717,11 @@ impl GitLabClient {
} }
/// Fetch all three event types for an entity concurrently. /// Fetch all three event types for an entity concurrently.
///
/// Uses `tokio::join!` instead of `try_join!` so that a 404 on one event
/// type (e.g., labels) doesn't discard successfully-fetched data from the
/// others (e.g., state events). 404s are treated as "no events" (empty vec);
/// all other errors (including 403) are propagated for retry.
pub async fn fetch_all_resource_events( pub async fn fetch_all_resource_events(
&self, &self,
gitlab_project_id: i64, gitlab_project_id: i64,
@@ -729,27 +732,35 @@ impl GitLabClient {
Vec<GitLabLabelEvent>, Vec<GitLabLabelEvent>,
Vec<GitLabMilestoneEvent>, Vec<GitLabMilestoneEvent>,
)> { )> {
match entity_type { let (state_res, label_res, milestone_res) = match entity_type {
"issue" => { "issue" => {
let (state, label, milestone) = tokio::try_join!( tokio::join!(
self.fetch_issue_state_events(gitlab_project_id, iid), self.fetch_issue_state_events(gitlab_project_id, iid),
self.fetch_issue_label_events(gitlab_project_id, iid), self.fetch_issue_label_events(gitlab_project_id, iid),
self.fetch_issue_milestone_events(gitlab_project_id, iid), self.fetch_issue_milestone_events(gitlab_project_id, iid),
)?; )
Ok((state, label, milestone))
} }
"merge_request" => { "merge_request" => {
let (state, label, milestone) = tokio::try_join!( tokio::join!(
self.fetch_mr_state_events(gitlab_project_id, iid), self.fetch_mr_state_events(gitlab_project_id, iid),
self.fetch_mr_label_events(gitlab_project_id, iid), self.fetch_mr_label_events(gitlab_project_id, iid),
self.fetch_mr_milestone_events(gitlab_project_id, iid), self.fetch_mr_milestone_events(gitlab_project_id, iid),
)?; )
Ok((state, label, milestone))
} }
_ => Err(LoreError::Other(format!( _ => {
"Invalid entity type for resource events: {entity_type}" return Err(LoreError::Other(format!(
))), "Invalid entity type for resource events: {entity_type}"
} )));
}
};
// Treat 404 as "endpoint not available for this entity" → empty vec.
// All other errors (403, network, etc.) propagate for retry handling.
let state = coalesce_not_found(state_res)?;
let label = coalesce_not_found(label_res)?;
let milestone = coalesce_not_found(milestone_res)?;
Ok((state, label, milestone))
} }
} }
@@ -781,6 +792,19 @@ fn parse_link_header_next(headers: &HeaderMap) -> Option<String> {
}) })
} }
/// Convert a resource-event fetch result: 404 → empty vec, other errors propagated.
///
/// 404 means the endpoint doesn't exist for this entity type — truly permanent.
/// 403 and other errors are NOT coalesced: they may be environmental (VPN, token
/// rotation) and should be retried via the drain loop's backoff mechanism.
fn coalesce_not_found<T>(result: Result<Vec<T>>) -> Result<Vec<T>> {
match result {
Ok(v) => Ok(v),
Err(LoreError::GitLabNotFound { .. }) => Ok(Vec::new()),
Err(e) => Err(e),
}
}
/// Convert milliseconds since epoch to ISO 8601 string. /// Convert milliseconds since epoch to ISO 8601 string.
fn ms_to_iso8601(ms: i64) -> Option<String> { fn ms_to_iso8601(ms: i64) -> Option<String> {
DateTime::<Utc>::from_timestamp_millis(ms) DateTime::<Utc>::from_timestamp_millis(ms)

View File

@@ -58,7 +58,7 @@ fn parse_timestamp(ts: &str) -> Result<i64, TransformError> {
} }
/// Transform a GitLab issue into local schema format. /// Transform a GitLab issue into local schema format.
pub fn transform_issue(issue: GitLabIssue) -> Result<IssueWithMetadata, TransformError> { pub fn transform_issue(issue: &GitLabIssue) -> Result<IssueWithMetadata, TransformError> {
let created_at = parse_timestamp(&issue.created_at)?; let created_at = parse_timestamp(&issue.created_at)?;
let updated_at = parse_timestamp(&issue.updated_at)?; let updated_at = parse_timestamp(&issue.updated_at)?;
@@ -83,17 +83,17 @@ pub fn transform_issue(issue: GitLabIssue) -> Result<IssueWithMetadata, Transfor
gitlab_id: issue.id, gitlab_id: issue.id,
iid: issue.iid, iid: issue.iid,
project_id: issue.project_id, project_id: issue.project_id,
title: issue.title, title: issue.title.clone(),
description: issue.description, description: issue.description.clone(),
state: issue.state, state: issue.state.clone(),
author_username: issue.author.username, author_username: issue.author.username.clone(),
created_at, created_at,
updated_at, updated_at,
web_url: issue.web_url, web_url: issue.web_url.clone(),
due_date: issue.due_date, due_date: issue.due_date.clone(),
milestone_title, milestone_title,
}, },
label_names: issue.labels, label_names: issue.labels.clone(),
assignee_usernames, assignee_usernames,
milestone, milestone,
}) })
@@ -131,7 +131,7 @@ mod tests {
#[test] #[test]
fn transforms_issue_with_all_fields() { fn transforms_issue_with_all_fields() {
let issue = make_test_issue(); let issue = make_test_issue();
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert_eq!(result.issue.gitlab_id, 12345); assert_eq!(result.issue.gitlab_id, 12345);
assert_eq!(result.issue.iid, 42); assert_eq!(result.issue.iid, 42);
@@ -154,14 +154,14 @@ mod tests {
let mut issue = make_test_issue(); let mut issue = make_test_issue();
issue.description = None; issue.description = None;
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert!(result.issue.description.is_none()); assert!(result.issue.description.is_none());
} }
#[test] #[test]
fn extracts_label_names() { fn extracts_label_names() {
let issue = make_test_issue(); let issue = make_test_issue();
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert_eq!(result.label_names.len(), 2); assert_eq!(result.label_names.len(), 2);
assert_eq!(result.label_names[0], "bug"); assert_eq!(result.label_names[0], "bug");
@@ -173,14 +173,14 @@ mod tests {
let mut issue = make_test_issue(); let mut issue = make_test_issue();
issue.labels = vec![]; issue.labels = vec![];
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert!(result.label_names.is_empty()); assert!(result.label_names.is_empty());
} }
#[test] #[test]
fn parses_timestamps_to_ms_epoch() { fn parses_timestamps_to_ms_epoch() {
let issue = make_test_issue(); let issue = make_test_issue();
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
// 2024-01-15T10:00:00.000Z = 1705312800000 ms // 2024-01-15T10:00:00.000Z = 1705312800000 ms
assert_eq!(result.issue.created_at, 1705312800000); assert_eq!(result.issue.created_at, 1705312800000);
@@ -194,7 +194,7 @@ mod tests {
// GitLab can return timestamps with timezone offset // GitLab can return timestamps with timezone offset
issue.created_at = "2024-01-15T05:00:00-05:00".to_string(); issue.created_at = "2024-01-15T05:00:00-05:00".to_string();
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
// 05:00 EST = 10:00 UTC = same as original test // 05:00 EST = 10:00 UTC = same as original test
assert_eq!(result.issue.created_at, 1705312800000); assert_eq!(result.issue.created_at, 1705312800000);
} }
@@ -215,7 +215,7 @@ mod tests {
}, },
]; ];
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert_eq!(result.assignee_usernames.len(), 2); assert_eq!(result.assignee_usernames.len(), 2);
assert_eq!(result.assignee_usernames[0], "alice"); assert_eq!(result.assignee_usernames[0], "alice");
assert_eq!(result.assignee_usernames[1], "bob"); assert_eq!(result.assignee_usernames[1], "bob");
@@ -235,7 +235,7 @@ mod tests {
web_url: Some("https://gitlab.example.com/-/milestones/5".to_string()), web_url: Some("https://gitlab.example.com/-/milestones/5".to_string()),
}); });
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
// Denormalized title on issue for quick display // Denormalized title on issue for quick display
assert_eq!(result.issue.milestone_title, Some("v1.0".to_string())); assert_eq!(result.issue.milestone_title, Some("v1.0".to_string()));
@@ -258,7 +258,7 @@ mod tests {
#[test] #[test]
fn handles_missing_milestone() { fn handles_missing_milestone() {
let issue = make_test_issue(); let issue = make_test_issue();
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert!(result.issue.milestone_title.is_none()); assert!(result.issue.milestone_title.is_none());
assert!(result.milestone.is_none()); assert!(result.milestone.is_none());
@@ -269,7 +269,7 @@ mod tests {
let mut issue = make_test_issue(); let mut issue = make_test_issue();
issue.due_date = Some("2024-02-15".to_string()); issue.due_date = Some("2024-02-15".to_string());
let result = transform_issue(issue).unwrap(); let result = transform_issue(&issue).unwrap();
assert_eq!(result.issue.due_date, Some("2024-02-15".to_string())); assert_eq!(result.issue.due_date, Some("2024-02-15".to_string()));
} }
} }

View File

@@ -111,14 +111,14 @@ async fn ingest_discussions_for_issue(
result.discussions_fetched += 1; result.discussions_fetched += 1;
// Store raw payload // Store raw payload
let payload_json = serde_json::to_value(&gitlab_discussion)?; let payload_bytes = serde_json::to_vec(&gitlab_discussion)?;
let payload_id = store_payload( let payload_id = store_payload(
conn, conn,
StorePayloadOptions { StorePayloadOptions {
project_id: Some(local_project_id), project_id: Some(local_project_id),
resource_type: "discussion", resource_type: "discussion",
gitlab_id: &gitlab_discussion.id, gitlab_id: &gitlab_discussion.id,
payload: &payload_json, json_bytes: &payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?; )?;
@@ -156,25 +156,10 @@ async fn ingest_discussions_for_issue(
)?; )?;
for note in notes { for note in notes {
// Store raw note payload // Note: per-note raw payload storage is skipped because the discussion
let note_payload_json = serde_json::to_value( // payload (already stored above) contains all notes. The full note
gitlab_discussion // content is also stored in the notes table itself.
.notes insert_note(&tx, local_discussion_id, &note, None)?;
.iter()
.find(|n| n.id == note.gitlab_id),
)?;
let note_payload_id = store_payload(
&tx,
StorePayloadOptions {
project_id: Some(local_project_id),
resource_type: "note",
gitlab_id: &note.gitlab_id.to_string(),
payload: &note_payload_json,
compress: config.storage.compress_raw_payloads,
},
)?;
insert_note(&tx, local_discussion_id, &note, note_payload_id)?;
} }
tx.commit()?; tx.commit()?;
@@ -246,7 +231,7 @@ fn insert_note(
conn: &Connection, conn: &Connection,
discussion_id: i64, discussion_id: i64,
note: &crate::gitlab::transformers::NormalizedNote, note: &crate::gitlab::transformers::NormalizedNote,
payload_id: i64, payload_id: Option<i64>,
) -> Result<()> { ) -> Result<()> {
conn.execute( conn.execute(
"INSERT INTO notes ( "INSERT INTO notes (

View File

@@ -196,8 +196,8 @@ fn process_single_issue(
let now = now_ms(); let now = now_ms();
// Transform issue first (outside transaction - no DB access) // Transform issue first (outside transaction - no DB access)
let payload_json = serde_json::to_value(issue)?; let payload_bytes = serde_json::to_vec(issue)?;
let transformed = transform_issue(issue.clone())?; let transformed = transform_issue(issue)?;
let issue_row = &transformed.issue; let issue_row = &transformed.issue;
// Wrap all DB operations in a transaction for atomicity // Wrap all DB operations in a transaction for atomicity
@@ -207,7 +207,7 @@ fn process_single_issue(
config, config,
project_id, project_id,
issue, issue,
&payload_json, &payload_bytes,
issue_row, issue_row,
&transformed.label_names, &transformed.label_names,
&transformed.assignee_usernames, &transformed.assignee_usernames,
@@ -226,7 +226,7 @@ fn process_issue_in_transaction(
config: &Config, config: &Config,
project_id: i64, project_id: i64,
issue: &GitLabIssue, issue: &GitLabIssue,
payload_json: &serde_json::Value, payload_bytes: &[u8],
issue_row: &crate::gitlab::transformers::IssueRow, issue_row: &crate::gitlab::transformers::IssueRow,
label_names: &[String], label_names: &[String],
assignee_usernames: &[String], assignee_usernames: &[String],
@@ -242,7 +242,7 @@ fn process_issue_in_transaction(
project_id: Some(project_id), project_id: Some(project_id),
resource_type: "issue", resource_type: "issue",
gitlab_id: &issue.id.to_string(), gitlab_id: &issue.id.to_string(),
payload: payload_json, json_bytes: payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?; )?;
@@ -332,33 +332,27 @@ fn process_issue_in_transaction(
} }
/// Upsert a label within a transaction, returning its ID. /// Upsert a label within a transaction, returning its ID.
/// Uses INSERT...ON CONFLICT...RETURNING for a single round-trip.
fn upsert_label_tx( fn upsert_label_tx(
tx: &Transaction<'_>, tx: &Transaction<'_>,
project_id: i64, project_id: i64,
name: &str, name: &str,
created_count: &mut usize, created_count: &mut usize,
) -> Result<i64> { ) -> Result<i64> {
// Try to get existing let id: i64 = tx.query_row(
let existing: Option<i64> = tx "INSERT INTO labels (project_id, name) VALUES (?1, ?2)
.query_row( ON CONFLICT(project_id, name) DO UPDATE SET name = excluded.name
"SELECT id FROM labels WHERE project_id = ? AND name = ?", RETURNING id",
(project_id, name), (project_id, name),
|row| row.get(0), |row| row.get(0),
) )?;
.ok();
if let Some(id) = existing { // If the rowid matches last_insert_rowid, this was a new insert
return Ok(id); if tx.last_insert_rowid() == id {
*created_count += 1;
} }
// Insert new Ok(id)
tx.execute(
"INSERT INTO labels (project_id, name) VALUES (?, ?)",
(project_id, name),
)?;
*created_count += 1;
Ok(tx.last_insert_rowid())
} }
/// Link an issue to a label within a transaction. /// Link an issue to a label within a transaction.
@@ -371,12 +365,13 @@ fn link_issue_label_tx(tx: &Transaction<'_>, issue_id: i64, label_id: i64) -> Re
} }
/// Upsert a milestone within a transaction, returning its local ID. /// Upsert a milestone within a transaction, returning its local ID.
/// Uses RETURNING to avoid a separate SELECT round-trip.
fn upsert_milestone_tx( fn upsert_milestone_tx(
tx: &Transaction<'_>, tx: &Transaction<'_>,
project_id: i64, project_id: i64,
milestone: &MilestoneRow, milestone: &MilestoneRow,
) -> Result<i64> { ) -> Result<i64> {
tx.execute( let local_id: i64 = tx.query_row(
"INSERT INTO milestones (gitlab_id, project_id, iid, title, description, state, due_date, web_url) "INSERT INTO milestones (gitlab_id, project_id, iid, title, description, state, due_date, web_url)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
ON CONFLICT(project_id, gitlab_id) DO UPDATE SET ON CONFLICT(project_id, gitlab_id) DO UPDATE SET
@@ -385,7 +380,8 @@ fn upsert_milestone_tx(
description = excluded.description, description = excluded.description,
state = excluded.state, state = excluded.state,
due_date = excluded.due_date, due_date = excluded.due_date,
web_url = excluded.web_url", web_url = excluded.web_url
RETURNING id",
( (
milestone.gitlab_id, milestone.gitlab_id,
project_id, project_id,
@@ -396,12 +392,6 @@ fn upsert_milestone_tx(
&milestone.due_date, &milestone.due_date,
&milestone.web_url, &milestone.web_url,
), ),
)?;
// Get the local ID (whether inserted or updated)
let local_id: i64 = tx.query_row(
"SELECT id FROM milestones WHERE project_id = ? AND gitlab_id = ?",
(project_id, milestone.gitlab_id),
|row| row.get(0), |row| row.get(0),
)?; )?;

View File

@@ -166,14 +166,14 @@ fn process_single_mr(
mr: &GitLabMergeRequest, mr: &GitLabMergeRequest,
) -> Result<ProcessMrResult> { ) -> Result<ProcessMrResult> {
// Transform MR first (outside transaction - no DB access) // Transform MR first (outside transaction - no DB access)
let payload_json = serde_json::to_value(mr)?; let payload_bytes = serde_json::to_vec(mr)?;
let transformed = transform_merge_request(mr, project_id) let transformed = transform_merge_request(mr, project_id)
.map_err(|e| LoreError::Other(format!("MR transform failed: {}", e)))?; .map_err(|e| LoreError::Other(format!("MR transform failed: {}", e)))?;
// Wrap all DB operations in a transaction for atomicity // Wrap all DB operations in a transaction for atomicity
let tx = conn.unchecked_transaction()?; let tx = conn.unchecked_transaction()?;
let result = let result =
process_mr_in_transaction(&tx, config, project_id, mr, &payload_json, &transformed)?; process_mr_in_transaction(&tx, config, project_id, mr, &payload_bytes, &transformed)?;
tx.commit()?; tx.commit()?;
Ok(result) Ok(result)
@@ -185,7 +185,7 @@ fn process_mr_in_transaction(
config: &Config, config: &Config,
project_id: i64, project_id: i64,
mr: &GitLabMergeRequest, mr: &GitLabMergeRequest,
payload_json: &serde_json::Value, payload_bytes: &[u8],
transformed: &crate::gitlab::transformers::merge_request::MergeRequestWithMetadata, transformed: &crate::gitlab::transformers::merge_request::MergeRequestWithMetadata,
) -> Result<ProcessMrResult> { ) -> Result<ProcessMrResult> {
let mut labels_created = 0; let mut labels_created = 0;
@@ -199,7 +199,7 @@ fn process_mr_in_transaction(
project_id: Some(project_id), project_id: Some(project_id),
resource_type: "merge_request", resource_type: "merge_request",
gitlab_id: &mr.id.to_string(), gitlab_id: &mr.id.to_string(),
payload: payload_json, json_bytes: payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?; )?;
@@ -315,33 +315,28 @@ fn process_mr_in_transaction(
} }
/// Upsert a label within a transaction, returning its ID. /// Upsert a label within a transaction, returning its ID.
/// Upsert a label within a transaction, returning its ID.
/// Uses INSERT...ON CONFLICT...RETURNING for a single round-trip.
fn upsert_label_tx( fn upsert_label_tx(
tx: &Transaction<'_>, tx: &Transaction<'_>,
project_id: i64, project_id: i64,
name: &str, name: &str,
created_count: &mut usize, created_count: &mut usize,
) -> Result<i64> { ) -> Result<i64> {
// Try to get existing let id: i64 = tx.query_row(
let existing: Option<i64> = tx "INSERT INTO labels (project_id, name) VALUES (?1, ?2)
.query_row( ON CONFLICT(project_id, name) DO UPDATE SET name = excluded.name
"SELECT id FROM labels WHERE project_id = ? AND name = ?", RETURNING id",
(project_id, name), (project_id, name),
|row| row.get(0), |row| row.get(0),
) )?;
.ok();
if let Some(id) = existing { // If the rowid matches last_insert_rowid, this was a new insert
return Ok(id); if tx.last_insert_rowid() == id {
*created_count += 1;
} }
// Insert new Ok(id)
tx.execute(
"INSERT INTO labels (project_id, name) VALUES (?, ?)",
(project_id, name),
)?;
*created_count += 1;
Ok(tx.last_insert_rowid())
} }
/// Check if an MR passes the cursor filter (not already processed). /// Check if an MR passes the cursor filter (not already processed).
@@ -412,13 +407,14 @@ fn reset_sync_cursor(conn: &Connection, project_id: i64) -> Result<()> {
Ok(()) Ok(())
} }
/// Reset discussion watermarks for all MRs in project (for full sync). /// Reset discussion and resource event watermarks for all MRs in project (for full sync).
fn reset_discussion_watermarks(conn: &Connection, project_id: i64) -> Result<()> { fn reset_discussion_watermarks(conn: &Connection, project_id: i64) -> Result<()> {
conn.execute( conn.execute(
"UPDATE merge_requests "UPDATE merge_requests
SET discussions_synced_for_updated_at = NULL, SET discussions_synced_for_updated_at = NULL,
discussions_sync_attempts = 0, discussions_sync_attempts = 0,
discussions_sync_last_error = NULL discussions_sync_last_error = NULL,
resource_events_synced_for_updated_at = NULL
WHERE project_id = ?", WHERE project_id = ?",
[project_id], [project_id],
)?; )?;

View File

@@ -172,14 +172,14 @@ pub fn write_prefetched_mr_discussions(
let tx = conn.unchecked_transaction()?; let tx = conn.unchecked_transaction()?;
// Store raw payload // Store raw payload
let payload_json = serde_json::to_value(&disc.raw)?; let payload_bytes = serde_json::to_vec(&disc.raw)?;
let payload_id = Some(store_payload( let payload_id = Some(store_payload(
&tx, &tx,
StorePayloadOptions { StorePayloadOptions {
project_id: Some(local_project_id), project_id: Some(local_project_id),
resource_type: "discussion", resource_type: "discussion",
gitlab_id: &disc.raw.id, gitlab_id: &disc.raw.id,
payload: &payload_json, json_bytes: &payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?); )?);
@@ -206,14 +206,14 @@ pub fn write_prefetched_mr_discussions(
let note_payload_id = if should_store_payload { let note_payload_id = if should_store_payload {
let note_data = disc.raw.notes.iter().find(|n| n.id == note.gitlab_id); let note_data = disc.raw.notes.iter().find(|n| n.id == note.gitlab_id);
if let Some(note_data) = note_data { if let Some(note_data) = note_data {
let note_payload_json = serde_json::to_value(note_data)?; let note_payload_bytes = serde_json::to_vec(note_data)?;
Some(store_payload( Some(store_payload(
&tx, &tx,
StorePayloadOptions { StorePayloadOptions {
project_id: Some(local_project_id), project_id: Some(local_project_id),
resource_type: "note", resource_type: "note",
gitlab_id: &note.gitlab_id.to_string(), gitlab_id: &note.gitlab_id.to_string(),
payload: &note_payload_json, json_bytes: &note_payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?) )?)
@@ -388,14 +388,14 @@ async fn ingest_discussions_for_mr(
let tx = conn.unchecked_transaction()?; let tx = conn.unchecked_transaction()?;
// Store raw payload // Store raw payload
let payload_json = serde_json::to_value(&gitlab_discussion)?; let payload_bytes = serde_json::to_vec(&gitlab_discussion)?;
let payload_id = Some(store_payload( let payload_id = Some(store_payload(
&tx, &tx,
StorePayloadOptions { StorePayloadOptions {
project_id: Some(local_project_id), project_id: Some(local_project_id),
resource_type: "discussion", resource_type: "discussion",
gitlab_id: &gitlab_discussion.id, gitlab_id: &gitlab_discussion.id,
payload: &payload_json, json_bytes: &payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?); )?);
@@ -429,14 +429,14 @@ async fn ingest_discussions_for_mr(
.iter() .iter()
.find(|n| n.id == note.gitlab_id); .find(|n| n.id == note.gitlab_id);
if let Some(note_data) = note_data { if let Some(note_data) = note_data {
let note_payload_json = serde_json::to_value(note_data)?; let note_payload_bytes = serde_json::to_vec(note_data)?;
Some(store_payload( Some(store_payload(
&tx, &tx,
StorePayloadOptions { StorePayloadOptions {
project_id: Some(local_project_id), project_id: Some(local_project_id),
resource_type: "note", resource_type: "note",
gitlab_id: &note.gitlab_id.to_string(), gitlab_id: &note.gitlab_id.to_string(),
payload: &note_payload_json, json_bytes: &note_payload_bytes,
compress: config.storage.compress_raw_payloads, compress: config.storage.compress_raw_payloads,
}, },
)?) )?)