From 06889ec85addfb8b80c304367f74c9c475cc6c71 Mon Sep 17 00:00:00 2001 From: teernisse Date: Tue, 10 Mar 2026 16:43:06 -0400 Subject: [PATCH] =?UTF-8?q?fix(explain):=20address=20review=20findings=20?= =?UTF-8?q?=E2=80=94=20N+1=20queries,=20duplicate=20decisions,=20silent=20?= =?UTF-8?q?errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. fetch_open_threads: replace N+1 loop (2 queries per thread) with a single query using correlated subqueries for note_count and started_by. 2. extract_key_decisions: track consumed notes so the same note is not matched to multiple events, preventing duplicate decision entries. 3. build_timeline_excerpt_from_pipeline: log tracing::warn on seed/collect failures instead of silently returning empty timeline. --- src/app/errors.rs | 8 ++ src/app/handlers.rs | 8 +- src/app/robot_docs.rs | 3 +- src/cli/commands/explain.rs | 81 ++++++++++--------- src/core/error.rs | 11 ++- src/embedding/chunk_ids.rs | 70 ----------------- src/embedding/chunking.rs | 107 -------------------------- src/gitlab/transformers/discussion.rs | 33 ++++---- src/search/fts.rs | 15 ++-- src/timeline/seed.rs | 4 +- 10 files changed, 92 insertions(+), 248 deletions(-) delete mode 100644 src/embedding/chunk_ids.rs delete mode 100644 src/embedding/chunking.rs diff --git a/src/app/errors.rs b/src/app/errors.rs index 8859618..57614b5 100644 --- a/src/app/errors.rs +++ b/src/app/errors.rs @@ -7,6 +7,10 @@ struct FallbackErrorOutput { struct FallbackError { code: String, message: String, + #[serde(skip_serializing_if = "Option::is_none")] + suggestion: Option, + #[serde(skip_serializing_if = "Vec::is_empty")] + actions: Vec, } fn handle_error(e: Box, robot_mode: bool) -> ! { @@ -20,6 +24,8 @@ fn handle_error(e: Box, robot_mode: bool) -> ! { error: FallbackError { code: "INTERNAL_ERROR".to_string(), message: gi_error.to_string(), + suggestion: None, + actions: Vec::new(), }, }; serde_json::to_string(&fallback) @@ -59,6 +65,8 @@ fn handle_error(e: Box, robot_mode: bool) -> ! { error: FallbackError { code: "INTERNAL_ERROR".to_string(), message: e.to_string(), + suggestion: None, + actions: Vec::new(), }, }; eprintln!( diff --git a/src/app/handlers.rs b/src/app/handlers.rs index faae0ac..6eb3760 100644 --- a/src/app/handlers.rs +++ b/src/app/handlers.rs @@ -735,7 +735,7 @@ async fn handle_init( } let project_paths: Vec = projects_flag - .unwrap() + .expect("validated: checked for None at lines 714-721") .split(',') .map(|p| p.trim().to_string()) .filter(|p| !p.is_empty()) @@ -743,8 +743,10 @@ async fn handle_init( let result = run_init( InitInputs { - gitlab_url: gitlab_url_flag.unwrap(), - token_env_var: token_env_var_flag.unwrap(), + gitlab_url: gitlab_url_flag + .expect("validated: checked for None at lines 714-721"), + token_env_var: token_env_var_flag + .expect("validated: checked for None at lines 714-721"), project_paths, default_project: default_project_flag.clone(), }, diff --git a/src/app/robot_docs.rs b/src/app/robot_docs.rs index 52788c9..efd6dfe 100644 --- a/src/app/robot_docs.rs +++ b/src/app/robot_docs.rs @@ -460,7 +460,8 @@ fn handle_robot_docs(robot_mode: bool, brief: bool) -> Result<(), Box, pub started_at: String, pub note_count: usize, pub last_note_at: String, @@ -626,18 +627,22 @@ pub fn extract_key_decisions( let notes = query_non_system_notes(conn, entity_type, entity_id, since)?; let mut decisions = Vec::new(); + let mut used_notes: Vec = vec![false; notes.len()]; for event in &events { if decisions.len() >= max_decisions { break; } - // Find the FIRST non-system note by the SAME actor within 60 minutes AFTER the event - let matching_note = notes.iter().find(|n| { - n.author == event.actor + // Find the FIRST unconsumed non-system note by the SAME actor within 60 minutes + // AFTER the event. Each note is used at most once to avoid duplicate decisions. + let matching = notes.iter().enumerate().find(|(i, n)| { + !used_notes[*i] + && n.author == event.actor && n.created_at >= event.created_at && n.created_at <= event.created_at + DECISION_WINDOW_MS }); - if let Some(note) = matching_note { + if let Some((idx, note)) = matching { + used_notes[idx] = true; decisions.push(KeyDecision { timestamp: ms_to_iso(event.created_at), actor: event.actor.clone(), @@ -728,8 +733,14 @@ fn fetch_open_threads( ) -> Result> { let id_col = id_column_for(entity_type); + // Single query with scalar subqueries — avoids N+1. let sql = format!( - "SELECT d.id, d.gitlab_discussion_id, d.first_note_at, d.last_note_at \ + "SELECT d.gitlab_discussion_id, d.first_note_at, d.last_note_at, \ + (SELECT COUNT(*) FROM notes n2 \ + WHERE n2.discussion_id = d.id AND n2.is_system = 0) AS note_count, \ + (SELECT n3.author_username FROM notes n3 \ + WHERE n3.discussion_id = d.id \ + ORDER BY n3.created_at ASC LIMIT 1) AS started_by \ FROM discussions d \ WHERE d.{id_col} = ?1 \ AND d.resolvable = 1 \ @@ -738,42 +749,19 @@ fn fetch_open_threads( ); let mut stmt = conn.prepare(&sql)?; - let rows: Vec<(i64, String, i64, i64)> = stmt + let threads = stmt .query_map([entity_id], |row| { - Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) + let count: i64 = row.get(3)?; + Ok(OpenThread { + discussion_id: row.get(0)?, + started_at: ms_to_iso(row.get::<_, i64>(1)?), + last_note_at: ms_to_iso(row.get::<_, i64>(2)?), + note_count: count as usize, + started_by: row.get(4)?, + }) })? .collect::, _>>()?; - let mut threads = Vec::with_capacity(rows.len()); - - for (local_id, gitlab_discussion_id, first_note_at, last_note_at) in rows { - let started_by: String = conn - .query_row( - "SELECT author_username FROM notes \ - WHERE discussion_id = ?1 \ - ORDER BY created_at ASC LIMIT 1", - [local_id], - |row| row.get(0), - ) - .unwrap_or_else(|_| "unknown".to_owned()); - - let note_count_i64: i64 = conn.query_row( - "SELECT COUNT(*) FROM notes \ - WHERE discussion_id = ?1 AND is_system = 0", - [local_id], - |row| row.get(0), - )?; - let note_count = note_count_i64 as usize; - - threads.push(OpenThread { - discussion_id: gitlab_discussion_id, - started_by, - started_at: ms_to_iso(first_note_at), - note_count, - last_note_at: ms_to_iso(last_note_at), - }); - } - Ok(threads) } @@ -908,7 +896,10 @@ fn build_timeline_excerpt_from_pipeline( let seed_result = match seed_timeline_direct(conn, timeline_entity_type, params.iid, project_id) { Ok(result) => result, - Err(_) => return Some(vec![]), + Err(e) => { + tracing::warn!("explain: timeline seed failed: {e}"); + return Some(vec![]); + } }; let (mut events, _total) = match collect_events( @@ -921,7 +912,10 @@ fn build_timeline_excerpt_from_pipeline( MAX_TIMELINE_EVENTS, ) { Ok(result) => result, - Err(_) => return Some(vec![]), + Err(e) => { + tracing::warn!("explain: timeline collect failed: {e}"); + return Some(vec![]); + } }; events.truncate(MAX_TIMELINE_EVENTS); @@ -1135,7 +1129,10 @@ pub fn print_explain(result: &ExplainResult) { for t in threads { println!( " {} by {} ({} notes, last: {})", - t.discussion_id, t.started_by, t.note_count, t.last_note_at + t.discussion_id, + t.started_by.as_deref().unwrap_or("unknown"), + t.note_count, + t.last_note_at ); } } @@ -1809,7 +1806,7 @@ mod tests { assert_eq!(threads.len(), 1, "Only unresolved thread should appear"); assert_eq!(threads[0].discussion_id, "disc-unresolved"); - assert_eq!(threads[0].started_by, "alice"); + assert_eq!(threads[0].started_by.as_deref(), Some("alice")); assert_eq!(threads[0].note_count, 2); } diff --git a/src/core/error.rs b/src/core/error.rs index d22f53d..872765b 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -28,8 +28,11 @@ pub enum ErrorCode { OllamaUnavailable, OllamaModelNotFound, EmbeddingFailed, + EmbeddingsNotBuilt, NotFound, Ambiguous, + HealthCheckFailed, + UsageError, SurgicalPreflightFailed, } @@ -52,8 +55,11 @@ impl std::fmt::Display for ErrorCode { Self::OllamaUnavailable => "OLLAMA_UNAVAILABLE", Self::OllamaModelNotFound => "OLLAMA_MODEL_NOT_FOUND", Self::EmbeddingFailed => "EMBEDDING_FAILED", + Self::EmbeddingsNotBuilt => "EMBEDDINGS_NOT_BUILT", Self::NotFound => "NOT_FOUND", Self::Ambiguous => "AMBIGUOUS", + Self::HealthCheckFailed => "HEALTH_CHECK_FAILED", + Self::UsageError => "USAGE_ERROR", Self::SurgicalPreflightFailed => "SURGICAL_PREFLIGHT_FAILED", }; write!(f, "{code}") @@ -79,8 +85,11 @@ impl ErrorCode { Self::OllamaUnavailable => 14, Self::OllamaModelNotFound => 15, Self::EmbeddingFailed => 16, + Self::EmbeddingsNotBuilt => 21, Self::NotFound => 17, Self::Ambiguous => 18, + Self::HealthCheckFailed => 19, + Self::UsageError => 2, // Shares exit code 6 with GitLabNotFound — same semantic category (resource not found). // Robot consumers distinguish via ErrorCode string, not exit code. Self::SurgicalPreflightFailed => 6, @@ -201,7 +210,7 @@ impl LoreError { Self::OllamaUnavailable { .. } => ErrorCode::OllamaUnavailable, Self::OllamaModelNotFound { .. } => ErrorCode::OllamaModelNotFound, Self::EmbeddingFailed { .. } => ErrorCode::EmbeddingFailed, - Self::EmbeddingsNotBuilt => ErrorCode::EmbeddingFailed, + Self::EmbeddingsNotBuilt => ErrorCode::EmbeddingsNotBuilt, Self::SurgicalPreflightFailed { .. } => ErrorCode::SurgicalPreflightFailed, } } diff --git a/src/embedding/chunk_ids.rs b/src/embedding/chunk_ids.rs deleted file mode 100644 index d5df5a5..0000000 --- a/src/embedding/chunk_ids.rs +++ /dev/null @@ -1,70 +0,0 @@ -pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000; - -pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 { - assert!( - (0..CHUNK_ROWID_MULTIPLIER).contains(&chunk_index), - "chunk_index {chunk_index} out of range [0, {CHUNK_ROWID_MULTIPLIER})" - ); - document_id - .checked_mul(CHUNK_ROWID_MULTIPLIER) - .and_then(|v| v.checked_add(chunk_index)) - .unwrap_or_else(|| { - panic!("encode_rowid overflow: document_id={document_id}, chunk_index={chunk_index}") - }) -} - -pub fn decode_rowid(rowid: i64) -> (i64, i64) { - assert!( - rowid >= 0, - "decode_rowid called with negative rowid: {rowid}" - ); - let document_id = rowid / CHUNK_ROWID_MULTIPLIER; - let chunk_index = rowid % CHUNK_ROWID_MULTIPLIER; - (document_id, chunk_index) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_encode_single_chunk() { - assert_eq!(encode_rowid(1, 0), 1000); - } - - #[test] - fn test_encode_multi_chunk() { - assert_eq!(encode_rowid(1, 5), 1005); - } - - #[test] - fn test_encode_specific_values() { - assert_eq!(encode_rowid(42, 0), 42000); - assert_eq!(encode_rowid(42, 5), 42005); - } - - #[test] - fn test_decode_zero_chunk() { - assert_eq!(decode_rowid(42000), (42, 0)); - } - - #[test] - fn test_decode_roundtrip() { - for doc_id in [0, 1, 42, 100, 999, 10000] { - for chunk_idx in [0, 1, 5, 99, 999] { - let rowid = encode_rowid(doc_id, chunk_idx); - let (decoded_doc, decoded_chunk) = decode_rowid(rowid); - assert_eq!( - (decoded_doc, decoded_chunk), - (doc_id, chunk_idx), - "Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}" - ); - } - } - } - - #[test] - fn test_multiplier_value() { - assert_eq!(CHUNK_ROWID_MULTIPLIER, 1000); - } -} diff --git a/src/embedding/chunking.rs b/src/embedding/chunking.rs deleted file mode 100644 index f28da03..0000000 --- a/src/embedding/chunking.rs +++ /dev/null @@ -1,107 +0,0 @@ -pub const CHUNK_MAX_BYTES: usize = 1_500; - -pub const EXPECTED_DIMS: usize = 768; - -pub const CHUNK_OVERLAP_CHARS: usize = 200; - -pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> { - if content.is_empty() { - return Vec::new(); - } - - if content.len() <= CHUNK_MAX_BYTES { - return vec![(0, content.to_string())]; - } - - let mut chunks: Vec<(usize, String)> = Vec::new(); - let mut start = 0; - let mut chunk_index = 0; - - while start < content.len() { - let remaining = &content[start..]; - if remaining.len() <= CHUNK_MAX_BYTES { - chunks.push((chunk_index, remaining.to_string())); - break; - } - - let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES); - let window = &content[start..end]; - - let split_at = find_paragraph_break(window) - .or_else(|| find_sentence_break(window)) - .or_else(|| find_word_break(window)) - .unwrap_or(window.len()); - - let chunk_text = &content[start..start + split_at]; - chunks.push((chunk_index, chunk_text.to_string())); - - let advance = if split_at > CHUNK_OVERLAP_CHARS { - split_at - CHUNK_OVERLAP_CHARS - } else { - split_at - } - .max(1); - let old_start = start; - start += advance; - // Ensure start lands on a char boundary after overlap subtraction - start = floor_char_boundary(content, start); - // Guarantee forward progress: multi-byte chars can cause - // floor_char_boundary to round back to old_start - if start <= old_start { - start = old_start - + content[old_start..] - .chars() - .next() - .map_or(1, |c| c.len_utf8()); - } - chunk_index += 1; - } - - chunks -} - -fn find_paragraph_break(window: &str) -> Option { - let search_start = floor_char_boundary(window, window.len() * 2 / 3); - window[search_start..] - .rfind("\n\n") - .map(|pos| search_start + pos + 2) - .or_else(|| window[..search_start].rfind("\n\n").map(|pos| pos + 2)) -} - -fn find_sentence_break(window: &str) -> Option { - let search_start = floor_char_boundary(window, window.len() / 2); - for pat in &[". ", "? ", "! "] { - if let Some(pos) = window[search_start..].rfind(pat) { - return Some(search_start + pos + pat.len()); - } - } - for pat in &[". ", "? ", "! "] { - if let Some(pos) = window[..search_start].rfind(pat) { - return Some(pos + pat.len()); - } - } - None -} - -fn find_word_break(window: &str) -> Option { - let search_start = floor_char_boundary(window, window.len() / 2); - window[search_start..] - .rfind(' ') - .map(|pos| search_start + pos + 1) - .or_else(|| window[..search_start].rfind(' ').map(|pos| pos + 1)) -} - -fn floor_char_boundary(s: &str, idx: usize) -> usize { - if idx >= s.len() { - return s.len(); - } - let mut i = idx; - while i > 0 && !s.is_char_boundary(i) { - i -= 1; - } - i -} - -#[cfg(test)] -#[path = "chunking_tests.rs"] -mod tests; diff --git a/src/gitlab/transformers/discussion.rs b/src/gitlab/transformers/discussion.rs index ecafac4..4d313aa 100644 --- a/src/gitlab/transformers/discussion.rs +++ b/src/gitlab/transformers/discussion.rs @@ -53,14 +53,8 @@ pub struct NormalizedNote { pub position_head_sha: Option, } -fn parse_timestamp(ts: &str) -> i64 { - match iso_to_ms(ts) { - Some(ms) => ms, - None => { - warn!(timestamp = ts, "Invalid timestamp, defaulting to epoch 0"); - 0 - } - } +fn parse_timestamp(ts: &str) -> Result { + iso_to_ms_strict(ts) } pub fn transform_discussion( @@ -133,7 +127,15 @@ pub fn transform_notes( .notes .iter() .enumerate() - .map(|(idx, note)| transform_single_note(note, local_project_id, idx as i32, now)) + .filter_map(|(idx, note)| { + match transform_single_note(note, local_project_id, idx as i32, now) { + Ok(n) => Some(n), + Err(e) => { + warn!(note_id = note.id, error = %e, "Skipping note with invalid timestamp"); + None + } + } + }) .collect() } @@ -142,7 +144,10 @@ fn transform_single_note( local_project_id: i64, position: i32, now: i64, -) -> NormalizedNote { +) -> Result { + let created_at = parse_timestamp(¬e.created_at)?; + let updated_at = parse_timestamp(¬e.updated_at)?; + let ( position_old_path, position_new_path, @@ -156,7 +161,7 @@ fn transform_single_note( position_head_sha, ) = extract_position_fields(¬e.position); - NormalizedNote { + Ok(NormalizedNote { gitlab_id: note.id, project_id: local_project_id, note_type: note.note_type.clone(), @@ -164,8 +169,8 @@ fn transform_single_note( author_id: Some(note.author.id), author_username: note.author.username.clone(), body: note.body.clone(), - created_at: parse_timestamp(¬e.created_at), - updated_at: parse_timestamp(¬e.updated_at), + created_at, + updated_at, last_seen_at: now, position, resolvable: note.resolvable, @@ -182,7 +187,7 @@ fn transform_single_note( position_base_sha, position_start_sha, position_head_sha, - } + }) } #[allow(clippy::type_complexity)] diff --git a/src/search/fts.rs b/src/search/fts.rs index 34159a5..6c179d6 100644 --- a/src/search/fts.rs +++ b/src/search/fts.rs @@ -119,15 +119,12 @@ pub fn search_fts( } pub fn generate_fallback_snippet(content_text: &str, max_chars: usize) -> String { - if content_text.chars().count() <= max_chars { - return content_text.to_string(); - } - - let byte_end = content_text - .char_indices() - .nth(max_chars) - .map(|(i, _)| i) - .unwrap_or(content_text.len()); + // Use char_indices to find the boundary at max_chars in a single pass, + // short-circuiting early for large strings instead of counting all chars. + let byte_end = match content_text.char_indices().nth(max_chars) { + Some((i, _)) => i, + None => return content_text.to_string(), // content fits within max_chars + }; let truncated = &content_text[..byte_end]; if let Some(last_space) = truncated.rfind(' ') { diff --git a/src/timeline/seed.rs b/src/timeline/seed.rs index fd26e31..561ac4e 100644 --- a/src/timeline/seed.rs +++ b/src/timeline/seed.rs @@ -411,7 +411,9 @@ fn round_robin_select_by_discussion( let mut made_progress = false; for (disc_idx, &discussion_id) in discussion_order.iter().enumerate() { - let notes = by_discussion.get(&discussion_id).unwrap(); + let notes = by_discussion + .get(&discussion_id) + .expect("key present: inserted into by_discussion via discussion_order"); let note_idx = indices[disc_idx]; if note_idx < notes.len() {