From 0e6520277875b286e0118f4bc3acfa99c2a32ab6 Mon Sep 17 00:00:00 2001 From: teernisse Date: Fri, 13 Feb 2026 14:08:25 -0500 Subject: [PATCH] feat(timeline): add DiscussionThread types and seed-phase discussion matching Introduces the foundation for full discussion thread support in the timeline pipeline. Adds three new domain types to timeline.rs: - ThreadNote: individual note within a thread (id, author, body, timestamp) - MatchedDiscussion: tracks discussions matched during seeding with their parent entity (issue or MR) for downstream collection - DiscussionThread variant on TimelineEventType: carries a full thread of notes, sorted between NoteEvidence and CrossReferenced Moves truncate_to_chars() from timeline_seed.rs to timeline.rs as pub(crate) for reuse by the collect phase. Adds THREAD_NOTE_MAX_CHARS (2000) and THREAD_MAX_NOTES (50) constants. Upgrades the seed SQL in resolve_documents_to_entities() to resolve note documents to their parent discussion via an additional LEFT JOIN chain (notes -> discussions), using COALESCE to unify the entity resolution path for both discussion and note source types. SeedResult gains a matched_discussions field that captures deduplicated discussion matches. Tests cover: discussion matching from discussion docs, note-to-parent resolution, deduplication of same discussion across multiple docs, and correct parent entity type (issue vs MR). Co-Authored-By: Claude Opus 4.6 --- src/core/timeline.rs | 130 +++++++++++++++++++++++++++++++- src/core/timeline_seed.rs | 81 ++++++++++++-------- src/core/timeline_seed_tests.rs | 117 ++++++++++++++++++++++++---- 3 files changed, 278 insertions(+), 50 deletions(-) diff --git a/src/core/timeline.rs b/src/core/timeline.rs index bf89b26..4be0b50 100644 --- a/src/core/timeline.rs +++ b/src/core/timeline.rs @@ -49,6 +49,21 @@ impl Ord for TimelineEvent { } } +/// Maximum characters per note body in a discussion thread. +pub const THREAD_NOTE_MAX_CHARS: usize = 2000; + +/// Maximum notes per discussion thread before truncation. +pub const THREAD_MAX_NOTES: usize = 50; + +/// A single note within a discussion thread. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize)] +pub struct ThreadNote { + pub note_id: i64, + pub author: Option, + pub body: String, + pub created_at: i64, +} + /// Per spec Section 3.3. Serde tagged enum for JSON output. /// /// Variant declaration order defines the sort order within a timestamp+entity @@ -78,11 +93,39 @@ pub enum TimelineEventType { snippet: String, discussion_id: Option, }, + DiscussionThread { + discussion_id: i64, + notes: Vec, + }, CrossReferenced { target: String, }, } +/// Truncate a string to at most `max_chars` characters on a safe UTF-8 boundary. +pub(crate) fn truncate_to_chars(s: &str, max_chars: usize) -> String { + let char_count = s.chars().count(); + if char_count <= max_chars { + return s.to_owned(); + } + + let byte_end = s + .char_indices() + .nth(max_chars) + .map(|(i, _)| i) + .unwrap_or(s.len()); + s[..byte_end].to_owned() +} + +/// A discussion matched during the seed phase, to be collected as a full thread. +#[derive(Debug, Clone)] +pub struct MatchedDiscussion { + pub discussion_id: i64, + pub entity_type: String, + pub entity_id: i64, + pub project_id: i64, +} + /// Internal entity reference used across pipeline stages. #[derive(Debug, Clone, Serialize)] pub struct EntityRef { @@ -250,7 +293,7 @@ mod tests { #[test] fn test_timeline_event_type_variant_count() { - // Verify all 9 variants serialize without panic + // Verify all 10 variants serialize without panic let variants: Vec = vec![ TimelineEventType::Created, TimelineEventType::StateChanged { @@ -274,13 +317,96 @@ mod tests { snippet: "text".to_owned(), discussion_id: None, }, + TimelineEventType::DiscussionThread { + discussion_id: 1, + notes: vec![ThreadNote { + note_id: 1, + author: Some("alice".to_owned()), + body: "hello".to_owned(), + created_at: 1000, + }], + }, TimelineEventType::CrossReferenced { target: "!567".to_owned(), }, ]; - assert_eq!(variants.len(), 9); + assert_eq!(variants.len(), 10); for v in &variants { serde_json::to_value(v).unwrap(); } } + + #[test] + fn test_discussion_thread_serializes_tagged() { + let event_type = TimelineEventType::DiscussionThread { + discussion_id: 42, + notes: vec![ + ThreadNote { + note_id: 1, + author: Some("alice".to_owned()), + body: "first note".to_owned(), + created_at: 1000, + }, + ThreadNote { + note_id: 2, + author: Some("bob".to_owned()), + body: "second note".to_owned(), + created_at: 2000, + }, + ], + }; + let json = serde_json::to_value(&event_type).unwrap(); + assert_eq!(json["kind"], "discussion_thread"); + assert_eq!(json["discussion_id"], 42); + assert_eq!(json["notes"].as_array().unwrap().len(), 2); + assert_eq!(json["notes"][0]["note_id"], 1); + assert_eq!(json["notes"][0]["author"], "alice"); + assert_eq!(json["notes"][0]["body"], "first note"); + assert_eq!(json["notes"][1]["note_id"], 2); + } + + #[test] + fn test_discussion_thread_sort_order() { + // DiscussionThread should sort after NoteEvidence, before CrossReferenced + let note_ev = TimelineEventType::NoteEvidence { + note_id: 1, + snippet: "a".to_owned(), + discussion_id: None, + }; + let thread = TimelineEventType::DiscussionThread { + discussion_id: 1, + notes: vec![], + }; + let cross_ref = TimelineEventType::CrossReferenced { + target: "!1".to_owned(), + }; + + assert!(note_ev < thread); + assert!(thread < cross_ref); + } + + #[test] + fn test_thread_note_ord() { + let a = ThreadNote { + note_id: 1, + author: Some("alice".to_owned()), + body: "first".to_owned(), + created_at: 1000, + }; + let b = ThreadNote { + note_id: 2, + author: Some("bob".to_owned()), + body: "second".to_owned(), + created_at: 2000, + }; + // ThreadNote derives Ord — note_id is the first field, so ordering is by note_id + assert!(a < b); + } + + #[test] + fn test_truncate_to_chars() { + assert_eq!(truncate_to_chars("hello", 200), "hello"); + let long = "a".repeat(300); + assert_eq!(truncate_to_chars(&long, 200).chars().count(), 200); + } } diff --git a/src/core/timeline_seed.rs b/src/core/timeline_seed.rs index 1610211..b0bcbeb 100644 --- a/src/core/timeline_seed.rs +++ b/src/core/timeline_seed.rs @@ -4,7 +4,10 @@ use rusqlite::Connection; use tracing::debug; use crate::core::error::Result; -use crate::core::timeline::{EntityRef, TimelineEvent, TimelineEventType, resolve_entity_ref}; +use crate::core::timeline::{ + EntityRef, MatchedDiscussion, TimelineEvent, TimelineEventType, resolve_entity_ref, + truncate_to_chars, +}; use crate::embedding::ollama::OllamaClient; use crate::search::{FtsQueryMode, SearchFilters, SearchMode, search_hybrid, to_fts_query}; @@ -12,6 +15,8 @@ use crate::search::{FtsQueryMode, SearchFilters, SearchMode, search_hybrid, to_f pub struct SeedResult { pub seed_entities: Vec, pub evidence_notes: Vec, + /// Discussions matched during seeding, to be collected as full threads. + pub matched_discussions: Vec, /// The search mode actually used (hybrid with fallback info). pub search_mode: String, } @@ -38,6 +43,7 @@ pub async fn seed_timeline( return Ok(SeedResult { seed_entities: Vec::new(), evidence_notes: Vec::new(), + matched_discussions: Vec::new(), search_mode: "lexical".to_owned(), }); } @@ -76,7 +82,7 @@ pub async fn seed_timeline( debug!(warning = %w, "hybrid search warning during timeline seeding"); } - let seed_entities = resolve_documents_to_entities( + let (seed_entities, matched_discussions) = resolve_documents_to_entities( conn, &hybrid_results .iter() @@ -91,19 +97,21 @@ pub async fn seed_timeline( Ok(SeedResult { seed_entities, evidence_notes, + matched_discussions, search_mode, }) } -/// Resolve a list of document IDs to deduplicated entity refs. -/// Discussion documents are resolved to their parent entity (issue or MR). +/// Resolve a list of document IDs to deduplicated entity refs and matched discussions. +/// Discussion and note documents are resolved to their parent entity (issue or MR). +/// Returns (entities, matched_discussions). fn resolve_documents_to_entities( conn: &Connection, document_ids: &[i64], max_entities: usize, -) -> Result> { +) -> Result<(Vec, Vec)> { if document_ids.is_empty() { - return Ok(Vec::new()); + return Ok((Vec::new(), Vec::new())); } let placeholders: String = document_ids @@ -114,9 +122,13 @@ fn resolve_documents_to_entities( let sql = format!( r" SELECT d.source_type, d.source_id, d.project_id, - disc.issue_id, disc.merge_request_id + COALESCE(disc.issue_id, note_disc.issue_id) AS issue_id, + COALESCE(disc.merge_request_id, note_disc.merge_request_id) AS mr_id, + COALESCE(disc.id, note_disc.id) AS discussion_id FROM documents d LEFT JOIN discussions disc ON disc.id = d.source_id AND d.source_type = 'discussion' + LEFT JOIN notes n ON n.id = d.source_id AND d.source_type = 'note' + LEFT JOIN discussions note_disc ON note_disc.id = n.discussion_id AND d.source_type = 'note' WHERE d.id IN ({placeholders}) ORDER BY CASE d.id {order_clause} END ", @@ -135,37 +147,55 @@ fn resolve_documents_to_entities( .collect(); let rows = stmt.query_map(params.as_slice(), |row| { Ok(( - row.get::<_, String>(0)?, - row.get::<_, i64>(1)?, - row.get::<_, i64>(2)?, - row.get::<_, Option>(3)?, - row.get::<_, Option>(4)?, + row.get::<_, String>(0)?, // source_type + row.get::<_, i64>(1)?, // source_id + row.get::<_, i64>(2)?, // project_id + row.get::<_, Option>(3)?, // issue_id (coalesced) + row.get::<_, Option>(4)?, // mr_id (coalesced) + row.get::<_, Option>(5)?, // discussion_id (coalesced) )) })?; - let mut seen = HashSet::new(); + let mut seen_entities = HashSet::new(); + let mut seen_discussions = HashSet::new(); let mut entities = Vec::new(); + let mut matched_discussions = Vec::new(); for row_result in rows { - let (source_type, source_id, proj_id, disc_issue_id, disc_mr_id) = row_result?; + let (source_type, source_id, proj_id, disc_issue_id, disc_mr_id, discussion_id) = + row_result?; let (entity_type, entity_id) = match source_type.as_str() { "issue" => ("issue".to_owned(), source_id), "merge_request" => ("merge_request".to_owned(), source_id), - "discussion" => { + "discussion" | "note" => { if let Some(issue_id) = disc_issue_id { ("issue".to_owned(), issue_id) } else if let Some(mr_id) = disc_mr_id { ("merge_request".to_owned(), mr_id) } else { - continue; // orphaned discussion + continue; // orphaned discussion/note } } _ => continue, }; + // Capture matched discussion (deduplicated) + if let Some(disc_id) = discussion_id + && (source_type == "discussion" || source_type == "note") + && seen_discussions.insert(disc_id) + { + matched_discussions.push(MatchedDiscussion { + discussion_id: disc_id, + entity_type: entity_type.clone(), + entity_id, + project_id: proj_id, + }); + } + + // Entity dedup let key = (entity_type.clone(), entity_id); - if !seen.insert(key) { + if !seen_entities.insert(key) { continue; } @@ -179,7 +209,7 @@ fn resolve_documents_to_entities( } } - Ok(entities) + Ok((entities, matched_discussions)) } /// Find evidence notes: FTS5-matched discussion notes that provide context. @@ -275,21 +305,6 @@ fn find_evidence_notes( Ok(events) } -/// Truncate a string to at most `max_chars` characters on a safe UTF-8 boundary. -fn truncate_to_chars(s: &str, max_chars: usize) -> String { - let char_count = s.chars().count(); - if char_count <= max_chars { - return s.to_owned(); - } - - let byte_end = s - .char_indices() - .nth(max_chars) - .map(|(i, _)| i) - .unwrap_or(s.len()); - s[..byte_end].to_owned() -} - #[cfg(test)] #[path = "timeline_seed_tests.rs"] mod tests; diff --git a/src/core/timeline_seed_tests.rs b/src/core/timeline_seed_tests.rs index 7e4e379..119fb2d 100644 --- a/src/core/timeline_seed_tests.rs +++ b/src/core/timeline_seed_tests.rs @@ -316,23 +316,110 @@ async fn test_seed_respects_project_filter() { assert_eq!(result.seed_entities[0].project_path, "group/project"); } -#[test] -fn test_truncate_to_chars_short() { - assert_eq!(truncate_to_chars("hello", 200), "hello"); +// ─── Matched discussion tests ─────────────────────────────────────────────── + +#[tokio::test] +async fn test_seed_captures_matched_discussions_from_discussion_doc() { + let conn = setup_test_db(); + let project_id = insert_test_project(&conn); + let issue_id = insert_test_issue(&conn, project_id, 1); + let disc_id = insert_discussion(&conn, project_id, Some(issue_id), None); + insert_document( + &conn, + "discussion", + disc_id, + project_id, + "deployment pipeline authentication", + ); + + let result = seed_timeline(&conn, None, "deployment", None, None, 50, 10) + .await + .unwrap(); + assert_eq!(result.matched_discussions.len(), 1); + assert_eq!(result.matched_discussions[0].discussion_id, disc_id); + assert_eq!(result.matched_discussions[0].entity_type, "issue"); + assert_eq!(result.matched_discussions[0].entity_id, issue_id); } -#[test] -fn test_truncate_to_chars_long() { - let long = "a".repeat(300); - let result = truncate_to_chars(&long, 200); - assert_eq!(result.chars().count(), 200); +#[tokio::test] +async fn test_seed_captures_matched_discussions_from_note_doc() { + let conn = setup_test_db(); + let project_id = insert_test_project(&conn); + let issue_id = insert_test_issue(&conn, project_id, 1); + let disc_id = insert_discussion(&conn, project_id, Some(issue_id), None); + let note_id = insert_note(&conn, disc_id, project_id, "note about deployment", false); + insert_document( + &conn, + "note", + note_id, + project_id, + "deployment configuration details", + ); + + let result = seed_timeline(&conn, None, "deployment", None, None, 50, 10) + .await + .unwrap(); + assert_eq!( + result.matched_discussions.len(), + 1, + "Note doc should resolve to parent discussion" + ); + assert_eq!(result.matched_discussions[0].discussion_id, disc_id); + assert_eq!(result.matched_discussions[0].entity_type, "issue"); } -#[test] -fn test_truncate_to_chars_multibyte() { - let s = "\u{1F600}".repeat(300); // emoji - let result = truncate_to_chars(&s, 200); - assert_eq!(result.chars().count(), 200); - // Verify valid UTF-8 - assert!(std::str::from_utf8(result.as_bytes()).is_ok()); +#[tokio::test] +async fn test_seed_deduplicates_matched_discussions() { + let conn = setup_test_db(); + let project_id = insert_test_project(&conn); + let issue_id = insert_test_issue(&conn, project_id, 1); + let disc_id = insert_discussion(&conn, project_id, Some(issue_id), None); + + // Two docs referencing the same discussion + insert_document( + &conn, + "discussion", + disc_id, + project_id, + "deployment pipeline first doc", + ); + let note_id = insert_note(&conn, disc_id, project_id, "deployment note", false); + insert_document( + &conn, + "note", + note_id, + project_id, + "deployment pipeline second doc", + ); + + let result = seed_timeline(&conn, None, "deployment", None, None, 50, 10) + .await + .unwrap(); + assert_eq!( + result.matched_discussions.len(), + 1, + "Same discussion_id from two docs should deduplicate" + ); +} + +#[tokio::test] +async fn test_seed_matched_discussions_have_correct_parent_entity() { + let conn = setup_test_db(); + let project_id = insert_test_project(&conn); + let mr_id = insert_test_mr(&conn, project_id, 99); + let disc_id = insert_discussion(&conn, project_id, None, Some(mr_id)); + insert_document( + &conn, + "discussion", + disc_id, + project_id, + "deployment pipeline for merge request", + ); + + let result = seed_timeline(&conn, None, "deployment", None, None, 50, 10) + .await + .unwrap(); + assert_eq!(result.matched_discussions.len(), 1); + assert_eq!(result.matched_discussions[0].entity_type, "merge_request"); + assert_eq!(result.matched_discussions[0].entity_id, mr_id); }