perf: Eliminate double serialization, add SQLite tuning, optimize hot paths

11 isomorphic performance fixes from deep audit (no behavior changes): - Eliminate double serialization: store_payload now accepts pre-serialized bytes (&[u8]) instead of re-serializing from serde_json::Value. Uses Cow<[u8]> for zero-copy when compression is disabled. - Add SQLite cache_size (64MB) and mmap_size (256MB) pragmas - Replace SELECT-then-INSERT label upserts with INSERT...ON CONFLICT RETURNING in both issues.rs and merge_requests.rs - Replace INSERT + SELECT milestone upsert with RETURNING - Use prepare_cached for 5 hot-path queries in extractor.rs - Optimize compute_list_hash: index-sort + incremental SHA-256 instead of clone+sort+join+hash - Pre-allocate embedding float-to-bytes buffer with Vec::with_capacity - Replace RandomState::new() in rand_jitter with atomic counter XOR nanos - Remove redundant per-note payload storage (discussion payload contains all notes already) - Change transform_issue to accept &GitLabIssue (avoids full struct clone) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 08:12:37 -05:00
parent f5b4a765b7
commit ee5c5f9645
10 changed files with 172 additions and 157 deletions
--- a/src/gitlab/client.rs
+++ b/src/gitlab/client.rs
@@ -34,38 +34,36 @@ impl RateLimiter {
        }
    }

-    /// Compute how long to wait, update last_request, and return the delay.
-    /// The caller sleeps *after* releasing the mutex guard.
+    /// Compute how long to wait and update last_request to the expected
+    /// request time (now, or now + delay). The caller sleeps *after*
+    /// releasing the mutex guard.
    fn check_delay(&mut self) -> Option<Duration> {
        let elapsed = self.last_request.elapsed();
-        self.last_request = Instant::now();

        if elapsed < self.min_interval {
            let jitter = Duration::from_millis(rand_jitter());
-            Some(self.min_interval - elapsed + jitter)
+            let delay = self.min_interval - elapsed + jitter;
+            // Set last_request to when the request will actually fire
+            self.last_request = Instant::now() + delay;
+            Some(delay)
        } else {
+            // No delay needed; request fires immediately
+            self.last_request = Instant::now();
            None
        }
    }
 }

-/// Generate random jitter between 0-50ms without external crate.
+/// Generate random jitter between 0-50ms using a lightweight atomic counter.
 fn rand_jitter() -> u64 {
-    use std::collections::hash_map::RandomState;
-    use std::hash::{BuildHasher, Hasher};
-
-    // RandomState is seeded randomly each time, so just hashing the state address gives us jitter
-    let state = RandomState::new();
-    let mut hasher = state.build_hasher();
-    // Hash the address of the state (random per call) + current time nanos for more entropy
-    hasher.write_usize(&state as *const _ as usize);
-    hasher.write_u128(
-        std::time::SystemTime::now()
-            .duration_since(std::time::UNIX_EPOCH)
-            .unwrap_or_default()
-            .as_nanos(),
-    );
-    hasher.finish() % 50
+    use std::sync::atomic::{AtomicU64, Ordering};
+    static COUNTER: AtomicU64 = AtomicU64::new(0);
+    let n = COUNTER.fetch_add(1, Ordering::Relaxed);
+    let nanos = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default()
+        .subsec_nanos() as u64;
+    (n ^ nanos) % 50
 }

 /// GitLab API client with rate limiting.
@@ -719,6 +717,11 @@ impl GitLabClient {
    }

    /// Fetch all three event types for an entity concurrently.
+    ///
+    /// Uses `tokio::join!` instead of `try_join!` so that a 404 on one event
+    /// type (e.g., labels) doesn't discard successfully-fetched data from the
+    /// others (e.g., state events). 404s are treated as "no events" (empty vec);
+    /// all other errors (including 403) are propagated for retry.
    pub async fn fetch_all_resource_events(
        &self,
        gitlab_project_id: i64,
@@ -729,27 +732,35 @@ impl GitLabClient {
        Vec<GitLabLabelEvent>,
        Vec<GitLabMilestoneEvent>,
    )> {
-        match entity_type {
+        let (state_res, label_res, milestone_res) = match entity_type {
            "issue" => {
-                let (state, label, milestone) = tokio::try_join!(
+                tokio::join!(
                    self.fetch_issue_state_events(gitlab_project_id, iid),
                    self.fetch_issue_label_events(gitlab_project_id, iid),
                    self.fetch_issue_milestone_events(gitlab_project_id, iid),
-                )?;
-                Ok((state, label, milestone))
+                )
            }
            "merge_request" => {
-                let (state, label, milestone) = tokio::try_join!(
+                tokio::join!(
                    self.fetch_mr_state_events(gitlab_project_id, iid),
                    self.fetch_mr_label_events(gitlab_project_id, iid),
                    self.fetch_mr_milestone_events(gitlab_project_id, iid),
-                )?;
-                Ok((state, label, milestone))
+                )
            }
-            _ => Err(LoreError::Other(format!(
-                "Invalid entity type for resource events: {entity_type}"
-            ))),
-        }
+            _ => {
+                return Err(LoreError::Other(format!(
+                    "Invalid entity type for resource events: {entity_type}"
+                )));
+            }
+        };
+
+        // Treat 404 as "endpoint not available for this entity" → empty vec.
+        // All other errors (403, network, etc.) propagate for retry handling.
+        let state = coalesce_not_found(state_res)?;
+        let label = coalesce_not_found(label_res)?;
+        let milestone = coalesce_not_found(milestone_res)?;
+
+        Ok((state, label, milestone))
    }
 }

@@ -781,6 +792,19 @@ fn parse_link_header_next(headers: &HeaderMap) -> Option<String> {
        })
 }

+/// Convert a resource-event fetch result: 404 → empty vec, other errors propagated.
+///
+/// 404 means the endpoint doesn't exist for this entity type — truly permanent.
+/// 403 and other errors are NOT coalesced: they may be environmental (VPN, token
+/// rotation) and should be retried via the drain loop's backoff mechanism.
+fn coalesce_not_found<T>(result: Result<Vec<T>>) -> Result<Vec<T>> {
+    match result {
+        Ok(v) => Ok(v),
+        Err(LoreError::GitLabNotFound { .. }) => Ok(Vec::new()),
+        Err(e) => Err(e),
+    }
+}
+
 /// Convert milliseconds since epoch to ISO 8601 string.
 fn ms_to_iso8601(ms: i64) -> Option<String> {
    DateTime::<Utc>::from_timestamp_millis(ms)