/// Strip GitLab-generated boilerplate from titles before embedding.
///
/// Common patterns that inflate embedding similarity between unrelated entities:
/// - `Draft: Resolve "Actual Title"` → `Actual Title`
/// - `Resolve "Actual Title"` → `Actual Title`
/// - `Draft: Some Title` → `Some Title`
/// - `WIP: Some Title` → `Some Title`
///
/// Matching is ASCII-case-insensitive, and `Draft:` / `WIP:` markers may be
/// stacked in any order (`Draft: WIP: ...`). Leading whitespace is ignored, so
/// a padded title is normalized the same way as an unpadded one.
///
/// Returns a sub-slice of `title`; nothing is allocated. If stripping would
/// leave an empty string (e.g. `"Draft: "`), the original `title` is returned
/// unchanged so the caller never embeds an empty title.
///
/// The original title is preserved in `DocumentData.title` for display;
/// this function only affects `content_text` (what gets embedded).
fn normalize_title_for_embedding(title: &str) -> &str {
    // Trim once up front so the `Resolve "..."` check below also recognises
    // titles that carry leading whitespace but no Draft/WIP marker.
    let mut s = title.trim_start();

    // Peel leading "Draft:" / "WIP:" markers until none is left.
    loop {
        let rest = strip_prefix_ignore_ascii_case(s, "draft:")
            .or_else(|| strip_prefix_ignore_ascii_case(s, "wip:"));
        match rest {
            Some(rest) => s = rest.trim_start(),
            None => break,
        }
    }

    // Unwrap `Resolve "..."`: keep the text between the opening quote and the
    // LAST quote, so titles that themselves contain quotes stay intact.
    if let Some(after_quote) = strip_prefix_ignore_ascii_case(s, "resolve \"") {
        if let Some(end) = after_quote.rfind('"') {
            // `Resolve ""` has nothing inside the quotes; fall through and
            // keep the wrapper rather than returning an empty title.
            if end > 0 {
                return &after_quote[..end];
            }
        }
    }

    // Guard: if stripping left us with nothing, return the original.
    if s.is_empty() {
        title
    } else {
        s
    }
}

/// Return the remainder of `s` after `prefix`, compared ASCII-case-insensitively.
///
/// Uses `str::get` for the probe: direct `s[..n]` slicing panics when byte `n`
/// falls inside a multi-byte character (titles starting with emoji or accented
/// letters), whereas `get` simply yields `None`.
fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
    let head = s.get(..prefix.len())?;
    if head.eq_ignore_ascii_case(prefix) {
        // `get` succeeded, so `prefix.len()` is a char boundary in `s`.
        Some(&s[prefix.len()..])
    } else {
        None
    }
}
// --- normalize_title_for_embedding tests ---

#[test]
fn test_normalize_title_strips_draft_resolve_quotes() {
    assert_eq!(
        normalize_title_for_embedding("Draft: Resolve \"Analytics Studio: Subformulas\""),
        "Analytics Studio: Subformulas"
    );
}

#[test]
fn test_normalize_title_strips_resolve_quotes() {
    assert_eq!(
        normalize_title_for_embedding("Resolve \"RUL Report: Use param_trends from S3\""),
        "RUL Report: Use param_trends from S3"
    );
}

#[test]
fn test_normalize_title_strips_draft_prefix() {
    assert_eq!(
        normalize_title_for_embedding("Draft: Implement JWT authentication"),
        "Implement JWT authentication"
    );
}

#[test]
fn test_normalize_title_strips_wip_prefix() {
    assert_eq!(
        normalize_title_for_embedding("WIP: Implement JWT authentication"),
        "Implement JWT authentication"
    );
}

#[test]
fn test_normalize_title_strips_draft_wip_combined() {
    // Markers may be stacked; both must be removed.
    assert_eq!(
        normalize_title_for_embedding("Draft: WIP: Fix auth"),
        "Fix auth"
    );
}

#[test]
fn test_normalize_title_no_change_for_normal_title() {
    assert_eq!(
        normalize_title_for_embedding("Implement JWT authentication"),
        "Implement JWT authentication"
    );
}

#[test]
fn test_normalize_title_case_insensitive_draft() {
    assert_eq!(
        normalize_title_for_embedding("draft: Resolve \"Some Issue\""),
        "Some Issue"
    );
}

#[test]
fn test_normalize_title_case_insensitive_wip() {
    assert_eq!(normalize_title_for_embedding("wip: Something"), "Something");
}

#[test]
fn test_normalize_title_untitled_passthrough() {
    // "(untitled)" is the placeholder used when a parent title is missing.
    assert_eq!(normalize_title_for_embedding("(untitled)"), "(untitled)");
}

#[test]
fn test_normalize_title_resolve_without_quotes_unchanged() {
    // "Resolve something" without quotes is not the GitLab pattern
    assert_eq!(
        normalize_title_for_embedding("Resolve the flaky test"),
        "Resolve the flaky test"
    );
}

#[test]
fn test_normalize_title_empty_after_strip_returns_original() {
    // Edge case: "Draft: " with nothing after → return original
    assert_eq!(normalize_title_for_embedding("Draft: "), "Draft: ");
}

#[test]
fn test_normalize_title_resolve_empty_quotes() {
    // Edge case: Resolve "" → return original (empty inner text)
    assert_eq!(
        normalize_title_for_embedding("Resolve \"\""),
        "Resolve \"\""
    );
}

#[test]
fn test_normalize_title_non_ascii_does_not_panic() {
    // The normalizer probes the prefixes ending at byte offsets 4 ("wip:"),
    // 6 ("draft:") and 8 ("resolve "). Each input below puts one of those
    // offsets in the middle of a multi-byte character. None may panic, and
    // every title must come back unchanged.

    // Two 4-byte emoji: offsets 4 and 8 are boundaries, offset 6 is mid-character.
    assert_eq!(
        normalize_title_for_embedding("\u{1F389}\u{1F389} celebration"),
        "\u{1F389}\u{1F389} celebration"
    );
    // Accented characters: the 2-byte "ü" occupies bytes 7..9, so offset 8 is
    // mid-character.
    assert_eq!(
        normalize_title_for_embedding("\u{00DC}berpr\u{00FC}fung der Daten"),
        "\u{00DC}berpr\u{00FC}fung der Daten"
    );
    // Emoji occupying bytes 2..6: offset 4 is mid-character.
    assert_eq!(
        normalize_title_for_embedding("ab\u{1F389} party"),
        "ab\u{1F389} party"
    );
}

// --- MR document uses normalized title in content_text ---

#[test]
fn test_mr_document_normalizes_draft_resolve_title() {
    let conn = setup_mr_test_db();
    insert_mr(
        &conn,
        1,
        4064,
        Some("Draft: Resolve \"Analytics Studio: Subformulas\""),
        Some("Implements subformula support"),
        Some("opened"),
        Some("dev"),
        Some("feature/subformulas"),
        Some("main"),
        None,
    );

    let doc = extract_mr_document(&conn, 1).unwrap().unwrap();
    // content_text should use the normalized title (no boilerplate)
    assert!(
        doc.content_text
            .starts_with("[[MergeRequest]] !4064: Analytics Studio: Subformulas\n")
    );
    // but DocumentData.title preserves the original for display
    assert_eq!(
        doc.title,
        Some("Draft: Resolve \"Analytics Studio: Subformulas\"".to_string())
    );
}

// --- Issue document uses normalized title in content_text ---

#[test]
fn test_issue_document_normalizes_draft_title() {
    let conn = setup_test_db();
    insert_issue(
        &conn,
        1,
        100,
        Some("Draft: WIP: Rethink caching strategy"),
        Some("We should reconsider..."),
        "opened",
        Some("alice"),
        None,
    );

    let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
    // content_text should use the normalized title (both markers stripped)
    assert!(
        doc.content_text
            .starts_with("[[Issue]] #100: Rethink caching strategy\n")
    );
    // Original title preserved for display
    assert_eq!(
        doc.title,
        Some("Draft: WIP: Rethink caching strategy".to_string())
    );
}