fix(explain): address review findings — N+1 queries, duplicate decisions, silent errors

1. fetch_open_threads: replace N+1 loop (2 queries per thread) with a
   single query using correlated subqueries for note_count and started_by.
2. extract_key_decisions: track consumed notes so the same note is not
   matched to multiple events, preventing duplicate decision entries.
3. build_timeline_excerpt_from_pipeline: log tracing::warn on seed/collect
   failures instead of silently returning empty timeline.
This commit is contained in:
teernisse
2026-03-10 16:43:06 -04:00
parent 08bda08934
commit 06889ec85a
10 changed files with 92 additions and 248 deletions

View File

@@ -7,6 +7,10 @@ struct FallbackErrorOutput {
/// Minimal error body for the fallback JSON envelope emitted when the
/// normal structured-error path cannot be used.
struct FallbackError {
    // Machine-readable error code, e.g. "INTERNAL_ERROR".
    code: String,
    // Human-readable description of the failure.
    message: String,
    // Optional remediation hint; omitted from the JSON output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    suggestion: Option<String>,
    // Suggested follow-up actions; omitted from the JSON output when empty.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    actions: Vec<String>,
}
fn handle_error(e: Box<dyn std::error::Error>, robot_mode: bool) -> ! {
@@ -20,6 +24,8 @@ fn handle_error(e: Box<dyn std::error::Error>, robot_mode: bool) -> ! {
error: FallbackError {
code: "INTERNAL_ERROR".to_string(),
message: gi_error.to_string(),
suggestion: None,
actions: Vec::new(),
},
};
serde_json::to_string(&fallback)
@@ -59,6 +65,8 @@ fn handle_error(e: Box<dyn std::error::Error>, robot_mode: bool) -> ! {
error: FallbackError {
code: "INTERNAL_ERROR".to_string(),
message: e.to_string(),
suggestion: None,
actions: Vec::new(),
},
};
eprintln!(

View File

@@ -735,7 +735,7 @@ async fn handle_init(
}
let project_paths: Vec<String> = projects_flag
.unwrap()
.expect("validated: checked for None at lines 714-721")
.split(',')
.map(|p| p.trim().to_string())
.filter(|p| !p.is_empty())
@@ -743,8 +743,10 @@ async fn handle_init(
let result = run_init(
InitInputs {
gitlab_url: gitlab_url_flag.unwrap(),
token_env_var: token_env_var_flag.unwrap(),
gitlab_url: gitlab_url_flag
.expect("validated: checked for None at lines 714-721"),
token_env_var: token_env_var_flag
.expect("validated: checked for None at lines 714-721"),
project_paths,
default_project: default_project_flag.clone(),
},

View File

@@ -460,7 +460,8 @@ fn handle_robot_docs(robot_mode: bool, brief: bool) -> Result<(), Box<dyn std::e
"17": "Not found",
"18": "Ambiguous match",
"19": "Health check failed",
"20": "Config not found"
"20": "Config not found",
"21": "Embeddings not built"
});
let workflows = serde_json::json!({

View File

@@ -75,7 +75,8 @@ pub struct ActivitySummary {
#[derive(Debug, Serialize)]
pub struct OpenThread {
pub discussion_id: String,
pub started_by: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub started_by: Option<String>,
pub started_at: String,
pub note_count: usize,
pub last_note_at: String,
@@ -626,18 +627,22 @@ pub fn extract_key_decisions(
let notes = query_non_system_notes(conn, entity_type, entity_id, since)?;
let mut decisions = Vec::new();
let mut used_notes: Vec<bool> = vec![false; notes.len()];
for event in &events {
if decisions.len() >= max_decisions {
break;
}
// Find the FIRST non-system note by the SAME actor within 60 minutes AFTER the event
let matching_note = notes.iter().find(|n| {
n.author == event.actor
// Find the FIRST unconsumed non-system note by the SAME actor within 60 minutes
// AFTER the event. Each note is used at most once to avoid duplicate decisions.
let matching = notes.iter().enumerate().find(|(i, n)| {
!used_notes[*i]
&& n.author == event.actor
&& n.created_at >= event.created_at
&& n.created_at <= event.created_at + DECISION_WINDOW_MS
});
if let Some(note) = matching_note {
if let Some((idx, note)) = matching {
used_notes[idx] = true;
decisions.push(KeyDecision {
timestamp: ms_to_iso(event.created_at),
actor: event.actor.clone(),
@@ -728,8 +733,14 @@ fn fetch_open_threads(
) -> Result<Vec<OpenThread>> {
let id_col = id_column_for(entity_type);
// Single query with scalar subqueries — avoids N+1.
let sql = format!(
"SELECT d.id, d.gitlab_discussion_id, d.first_note_at, d.last_note_at \
"SELECT d.gitlab_discussion_id, d.first_note_at, d.last_note_at, \
(SELECT COUNT(*) FROM notes n2 \
WHERE n2.discussion_id = d.id AND n2.is_system = 0) AS note_count, \
(SELECT n3.author_username FROM notes n3 \
WHERE n3.discussion_id = d.id \
ORDER BY n3.created_at ASC LIMIT 1) AS started_by \
FROM discussions d \
WHERE d.{id_col} = ?1 \
AND d.resolvable = 1 \
@@ -738,42 +749,19 @@ fn fetch_open_threads(
);
let mut stmt = conn.prepare(&sql)?;
let rows: Vec<(i64, String, i64, i64)> = stmt
let threads = stmt
.query_map([entity_id], |row| {
Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
let count: i64 = row.get(3)?;
Ok(OpenThread {
discussion_id: row.get(0)?,
started_at: ms_to_iso(row.get::<_, i64>(1)?),
last_note_at: ms_to_iso(row.get::<_, i64>(2)?),
note_count: count as usize,
started_by: row.get(4)?,
})
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
let mut threads = Vec::with_capacity(rows.len());
for (local_id, gitlab_discussion_id, first_note_at, last_note_at) in rows {
let started_by: String = conn
.query_row(
"SELECT author_username FROM notes \
WHERE discussion_id = ?1 \
ORDER BY created_at ASC LIMIT 1",
[local_id],
|row| row.get(0),
)
.unwrap_or_else(|_| "unknown".to_owned());
let note_count_i64: i64 = conn.query_row(
"SELECT COUNT(*) FROM notes \
WHERE discussion_id = ?1 AND is_system = 0",
[local_id],
|row| row.get(0),
)?;
let note_count = note_count_i64 as usize;
threads.push(OpenThread {
discussion_id: gitlab_discussion_id,
started_by,
started_at: ms_to_iso(first_note_at),
note_count,
last_note_at: ms_to_iso(last_note_at),
});
}
Ok(threads)
}
@@ -908,7 +896,10 @@ fn build_timeline_excerpt_from_pipeline(
let seed_result = match seed_timeline_direct(conn, timeline_entity_type, params.iid, project_id)
{
Ok(result) => result,
Err(_) => return Some(vec![]),
Err(e) => {
tracing::warn!("explain: timeline seed failed: {e}");
return Some(vec![]);
}
};
let (mut events, _total) = match collect_events(
@@ -921,7 +912,10 @@ fn build_timeline_excerpt_from_pipeline(
MAX_TIMELINE_EVENTS,
) {
Ok(result) => result,
Err(_) => return Some(vec![]),
Err(e) => {
tracing::warn!("explain: timeline collect failed: {e}");
return Some(vec![]);
}
};
events.truncate(MAX_TIMELINE_EVENTS);
@@ -1135,7 +1129,10 @@ pub fn print_explain(result: &ExplainResult) {
for t in threads {
println!(
" {} by {} ({} notes, last: {})",
t.discussion_id, t.started_by, t.note_count, t.last_note_at
t.discussion_id,
t.started_by.as_deref().unwrap_or("unknown"),
t.note_count,
t.last_note_at
);
}
}
@@ -1809,7 +1806,7 @@ mod tests {
assert_eq!(threads.len(), 1, "Only unresolved thread should appear");
assert_eq!(threads[0].discussion_id, "disc-unresolved");
assert_eq!(threads[0].started_by, "alice");
assert_eq!(threads[0].started_by.as_deref(), Some("alice"));
assert_eq!(threads[0].note_count, 2);
}

View File

@@ -28,8 +28,11 @@ pub enum ErrorCode {
OllamaUnavailable,
OllamaModelNotFound,
EmbeddingFailed,
EmbeddingsNotBuilt,
NotFound,
Ambiguous,
HealthCheckFailed,
UsageError,
SurgicalPreflightFailed,
}
@@ -52,8 +55,11 @@ impl std::fmt::Display for ErrorCode {
Self::OllamaUnavailable => "OLLAMA_UNAVAILABLE",
Self::OllamaModelNotFound => "OLLAMA_MODEL_NOT_FOUND",
Self::EmbeddingFailed => "EMBEDDING_FAILED",
Self::EmbeddingsNotBuilt => "EMBEDDINGS_NOT_BUILT",
Self::NotFound => "NOT_FOUND",
Self::Ambiguous => "AMBIGUOUS",
Self::HealthCheckFailed => "HEALTH_CHECK_FAILED",
Self::UsageError => "USAGE_ERROR",
Self::SurgicalPreflightFailed => "SURGICAL_PREFLIGHT_FAILED",
};
write!(f, "{code}")
@@ -79,8 +85,11 @@ impl ErrorCode {
Self::OllamaUnavailable => 14,
Self::OllamaModelNotFound => 15,
Self::EmbeddingFailed => 16,
Self::EmbeddingsNotBuilt => 21,
Self::NotFound => 17,
Self::Ambiguous => 18,
Self::HealthCheckFailed => 19,
Self::UsageError => 2,
// Shares exit code 6 with GitLabNotFound — same semantic category (resource not found).
// Robot consumers distinguish via ErrorCode string, not exit code.
Self::SurgicalPreflightFailed => 6,
@@ -201,7 +210,7 @@ impl LoreError {
Self::OllamaUnavailable { .. } => ErrorCode::OllamaUnavailable,
Self::OllamaModelNotFound { .. } => ErrorCode::OllamaModelNotFound,
Self::EmbeddingFailed { .. } => ErrorCode::EmbeddingFailed,
Self::EmbeddingsNotBuilt => ErrorCode::EmbeddingFailed,
Self::EmbeddingsNotBuilt => ErrorCode::EmbeddingsNotBuilt,
Self::SurgicalPreflightFailed { .. } => ErrorCode::SurgicalPreflightFailed,
}
}

View File

@@ -1,70 +0,0 @@
/// Factor used to pack a chunk index into the low decimal digits of a
/// composite rowid: `rowid = document_id * 1000 + chunk_index`.
pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000;

/// Encode `(document_id, chunk_index)` into a single composite rowid.
///
/// # Panics
/// Panics when `chunk_index` falls outside `[0, CHUNK_ROWID_MULTIPLIER)`,
/// or when the packing arithmetic overflows `i64`.
pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
    assert!(
        chunk_index >= 0 && chunk_index < CHUNK_ROWID_MULTIPLIER,
        "chunk_index {chunk_index} out of range [0, {CHUNK_ROWID_MULTIPLIER})"
    );
    let packed = document_id
        .checked_mul(CHUNK_ROWID_MULTIPLIER)
        .and_then(|v| v.checked_add(chunk_index));
    match packed {
        Some(rowid) => rowid,
        None => {
            panic!("encode_rowid overflow: document_id={document_id}, chunk_index={chunk_index}")
        }
    }
}
/// Decode a composite rowid back into `(document_id, chunk_index)`.
///
/// # Panics
/// Panics when `rowid` is negative, since negative values cannot have
/// been produced by `encode_rowid`.
pub fn decode_rowid(rowid: i64) -> (i64, i64) {
    assert!(
        rowid >= 0,
        "decode_rowid called with negative rowid: {rowid}"
    );
    (
        rowid / CHUNK_ROWID_MULTIPLIER,
        rowid % CHUNK_ROWID_MULTIPLIER,
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_encode_single_chunk() {
        assert_eq!(encode_rowid(1, 0), 1000);
    }

    #[test]
    fn test_encode_multi_chunk() {
        assert_eq!(encode_rowid(1, 5), 1005);
    }

    #[test]
    fn test_encode_specific_values() {
        assert_eq!(encode_rowid(42, 0), 42000);
        assert_eq!(encode_rowid(42, 5), 42005);
    }

    #[test]
    fn test_decode_zero_chunk() {
        assert_eq!(decode_rowid(42000), (42, 0));
    }

    #[test]
    fn test_decode_roundtrip() {
        // Every encode must decode back to the original pair.
        let doc_ids = [0, 1, 42, 100, 999, 10000];
        let chunk_idxs = [0, 1, 5, 99, 999];
        for &doc_id in &doc_ids {
            for &chunk_idx in &chunk_idxs {
                let roundtripped = decode_rowid(encode_rowid(doc_id, chunk_idx));
                assert_eq!(
                    roundtripped,
                    (doc_id, chunk_idx),
                    "Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}"
                );
            }
        }
    }

    #[test]
    fn test_multiplier_value() {
        assert_eq!(CHUNK_ROWID_MULTIPLIER, 1000);
    }
}

View File

@@ -1,107 +0,0 @@
/// Maximum size of one chunk, in bytes.
pub const CHUNK_MAX_BYTES: usize = 1_500;
/// Expected vector dimensionality — presumably the embedding model's
/// output size; TODO confirm against the embedding backend.
pub const EXPECTED_DIMS: usize = 768;
/// Characters of trailing context repeated at the start of the next chunk.
pub const CHUNK_OVERLAP_CHARS: usize = 200;

/// Split `content` into `(chunk_index, text)` pieces of at most
/// `CHUNK_MAX_BYTES` bytes, preferring to cut at a paragraph break,
/// then a sentence break, then a word break. Consecutive chunks overlap
/// by roughly `CHUNK_OVERLAP_CHARS` characters. Empty input yields no
/// chunks; input that already fits yields a single chunk at index 0.
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
    if content.is_empty() {
        return Vec::new();
    }
    if content.len() <= CHUNK_MAX_BYTES {
        return vec![(0, content.to_string())];
    }
    let mut chunks: Vec<(usize, String)> = Vec::new();
    let mut start = 0;
    let mut chunk_index = 0;
    while start < content.len() {
        let remaining = &content[start..];
        // Tail fits entirely — emit it and stop.
        if remaining.len() <= CHUNK_MAX_BYTES {
            chunks.push((chunk_index, remaining.to_string()));
            break;
        }
        // Clamp the window end to a UTF-8 boundary so slicing cannot panic.
        let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
        let window = &content[start..end];
        // Best available natural break; fall back to the full window.
        let split_at = find_paragraph_break(window)
            .or_else(|| find_sentence_break(window))
            .or_else(|| find_word_break(window))
            .unwrap_or(window.len());
        let chunk_text = &content[start..start + split_at];
        chunks.push((chunk_index, chunk_text.to_string()));
        // Step forward by the chunk length minus the overlap, but always
        // by at least one byte.
        let advance = if split_at > CHUNK_OVERLAP_CHARS {
            split_at - CHUNK_OVERLAP_CHARS
        } else {
            split_at
        }
        .max(1);
        let old_start = start;
        start += advance;
        // Ensure start lands on a char boundary after overlap subtraction
        start = floor_char_boundary(content, start);
        // Guarantee forward progress: multi-byte chars can cause
        // floor_char_boundary to round back to old_start
        if start <= old_start {
            start = old_start
                + content[old_start..]
                    .chars()
                    .next()
                    .map_or(1, |c| c.len_utf8());
        }
        chunk_index += 1;
    }
    chunks
}
/// Find the position just after a blank line ("\n\n") to split at,
/// preferring one in the trailing third of the window, then falling
/// back to any earlier one. Returns `None` when no blank line exists.
fn find_paragraph_break(window: &str) -> Option<usize> {
    let cut = floor_char_boundary(window, window.len() * 2 / 3);
    if let Some(pos) = window[cut..].rfind("\n\n") {
        return Some(cut + pos + 2);
    }
    window[..cut].rfind("\n\n").map(|pos| pos + 2)
}
/// Find the position just after a sentence terminator (". ", "? ", "! "),
/// searching the back half of the window first, then the front half.
/// Terminator kinds are tried in order, not by rightmost position.
fn find_sentence_break(window: &str) -> Option<usize> {
    const PATTERNS: [&str; 3] = [". ", "? ", "! "];
    let mid = floor_char_boundary(window, window.len() / 2);
    PATTERNS
        .iter()
        .find_map(|pat| window[mid..].rfind(pat).map(|pos| mid + pos + pat.len()))
        .or_else(|| {
            PATTERNS
                .iter()
                .find_map(|pat| window[..mid].rfind(pat).map(|pos| pos + pat.len()))
        })
}
/// Find the position just after a space to split at, preferring one in
/// the back half of the window, then falling back to the front half.
fn find_word_break(window: &str) -> Option<usize> {
    let mid = floor_char_boundary(window, window.len() / 2);
    if let Some(pos) = window[mid..].rfind(' ') {
        return Some(mid + pos + 1);
    }
    window[..mid].rfind(' ').map(|pos| pos + 1)
}
/// Round `idx` down to the nearest UTF-8 character boundary in `s`.
/// Indices at or past the end of the string clamp to `s.len()`.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        s.len()
    } else {
        (0..=idx)
            .rev()
            .find(|&i| s.is_char_boundary(i))
            .expect("index 0 is always a char boundary")
    }
}
#[cfg(test)]
#[path = "chunking_tests.rs"]
mod tests;

View File

@@ -53,14 +53,8 @@ pub struct NormalizedNote {
pub position_head_sha: Option<String>,
}
fn parse_timestamp(ts: &str) -> i64 {
match iso_to_ms(ts) {
Some(ms) => ms,
None => {
warn!(timestamp = ts, "Invalid timestamp, defaulting to epoch 0");
0
}
}
fn parse_timestamp(ts: &str) -> Result<i64, String> {
iso_to_ms_strict(ts)
}
pub fn transform_discussion(
@@ -133,7 +127,15 @@ pub fn transform_notes(
.notes
.iter()
.enumerate()
.map(|(idx, note)| transform_single_note(note, local_project_id, idx as i32, now))
.filter_map(|(idx, note)| {
match transform_single_note(note, local_project_id, idx as i32, now) {
Ok(n) => Some(n),
Err(e) => {
warn!(note_id = note.id, error = %e, "Skipping note with invalid timestamp");
None
}
}
})
.collect()
}
@@ -142,7 +144,10 @@ fn transform_single_note(
local_project_id: i64,
position: i32,
now: i64,
) -> NormalizedNote {
) -> Result<NormalizedNote, String> {
let created_at = parse_timestamp(&note.created_at)?;
let updated_at = parse_timestamp(&note.updated_at)?;
let (
position_old_path,
position_new_path,
@@ -156,7 +161,7 @@ fn transform_single_note(
position_head_sha,
) = extract_position_fields(&note.position);
NormalizedNote {
Ok(NormalizedNote {
gitlab_id: note.id,
project_id: local_project_id,
note_type: note.note_type.clone(),
@@ -164,8 +169,8 @@ fn transform_single_note(
author_id: Some(note.author.id),
author_username: note.author.username.clone(),
body: note.body.clone(),
created_at: parse_timestamp(&note.created_at),
updated_at: parse_timestamp(&note.updated_at),
created_at,
updated_at,
last_seen_at: now,
position,
resolvable: note.resolvable,
@@ -182,7 +187,7 @@ fn transform_single_note(
position_base_sha,
position_start_sha,
position_head_sha,
}
})
}
#[allow(clippy::type_complexity)]

View File

@@ -119,15 +119,12 @@ pub fn search_fts(
}
pub fn generate_fallback_snippet(content_text: &str, max_chars: usize) -> String {
if content_text.chars().count() <= max_chars {
return content_text.to_string();
}
let byte_end = content_text
.char_indices()
.nth(max_chars)
.map(|(i, _)| i)
.unwrap_or(content_text.len());
// Use char_indices to find the boundary at max_chars in a single pass,
// short-circuiting early for large strings instead of counting all chars.
let byte_end = match content_text.char_indices().nth(max_chars) {
Some((i, _)) => i,
None => return content_text.to_string(), // content fits within max_chars
};
let truncated = &content_text[..byte_end];
if let Some(last_space) = truncated.rfind(' ') {

View File

@@ -411,7 +411,9 @@ fn round_robin_select_by_discussion(
let mut made_progress = false;
for (disc_idx, &discussion_id) in discussion_order.iter().enumerate() {
let notes = by_discussion.get(&discussion_id).unwrap();
let notes = by_discussion
.get(&discussion_id)
.expect("key present: inserted into by_discussion via discussion_order");
let note_idx = indices[disc_idx];
if note_idx < notes.len() {