diff --git a/src/documents/extractor/common.rs b/src/documents/extractor/common.rs
index 9620581..11f7e4a 100644
--- a/src/documents/extractor/common.rs
+++ b/src/documents/extractor/common.rs
@@ -73,6 +73,59 @@ pub fn compute_list_hash(items: &[String]) -> String {
format!("{:x}", hasher.finalize())
}
+/// Strip GitLab draft markers and the auto-generated Resolve wrapper from a title.
+///
+/// Returns a borrowed sub-slice of `title`; matching is ASCII case-insensitive:
+/// - `Draft: Resolve "Actual Title"` → `Actual Title`
+/// - `Resolve "Actual Title"` → `Actual Title` (cut at the last double quote in the title)
+/// - `Draft: Some Title` / `WIP: Some Title` → `Some Title` (markers may repeat, any order)
+/// - a title that would end up empty (e.g. `Draft: `) is returned unchanged
+///
+/// The original title is preserved in `DocumentData.title` for display;
+/// this function only affects `content_text` (what gets embedded).
+fn normalize_title_for_embedding(title: &str) -> &str {
+ let mut s = title;
+
+ // Peel leading "Draft:" / "WIP:" markers (ASCII case-insensitive) until none remain.
+ // `get(..N)` is None when byte N is mid-character, so non-ASCII titles cannot panic here.
+ // NOTE(review): leading whitespace is trimmed only when a marker matched — confirm intent.
+ loop {
+ let trimmed = s.trim_start();
+ if trimmed
+ .get(..6)
+ .is_some_and(|p| p.eq_ignore_ascii_case("draft:"))
+ {
+ s = trimmed[6..].trim_start();
+ } else if trimmed
+ .get(..4)
+ .is_some_and(|p| p.eq_ignore_ascii_case("wip:"))
+ {
+ s = trimmed[4..].trim_start();
+ } else {
+ break;
+ }
+ }
+
+ // Unwrap the Resolve wrapper (case-insensitive); the length check guards the byte-8 index.
+ if s.len() >= 10
+ && s.get(..8).is_some_and(|p| p.eq_ignore_ascii_case("resolve "))
+ && s.as_bytes()[8] == b'"'
+ && let Some(end) = s[9..].rfind('"')
+ {
+ let inner = &s[9..9 + end];
+ if !inner.is_empty() {
+ return inner;
+ }
+ }
+
+ // Only markers were present (nothing left after peeling): fall back to the untouched title.
+ if s.is_empty() {
+ return title;
+ }
+
+ s
+}
+
fn format_date(ms: i64) -> String {
DateTime::from_timestamp_millis(ms)
.map(|dt| dt.format("%Y-%m-%d").to_string())
diff --git a/src/documents/extractor/discussions.rs b/src/documents/extractor/discussions.rs
index bbcc408..8526c14 100644
--- a/src/documents/extractor/discussions.rs
+++ b/src/documents/extractor/discussions.rs
@@ -156,12 +156,13 @@ pub fn extract_discussion_document(
let author_username = notes[0].author.clone();
let display_title = parent_title.as_deref().unwrap_or("(untitled)");
+ let embed_title = normalize_title_for_embedding(display_title);
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());
let mut content = format!(
"[[Discussion]] {}: {}\nProject: {}\n",
- parent_type_prefix, display_title, path_with_namespace
+ parent_type_prefix, embed_title, path_with_namespace
);
if let Some(ref u) = url {
let _ = writeln!(content, "URL: {}", u);
diff --git a/src/documents/extractor/extractor_tests.rs b/src/documents/extractor/extractor_tests.rs
index e818847..5bae319 100644
--- a/src/documents/extractor/extractor_tests.rs
+++ b/src/documents/extractor/extractor_tests.rs
@@ -1,5 +1,171 @@
use super::*;
+// --- normalize_title_for_embedding tests ---
+
+#[test]
+fn test_normalize_title_strips_draft_resolve_quotes() { // Draft marker and Resolve wrapper both removed; inner colon kept.
+ assert_eq!(
+ normalize_title_for_embedding("Draft: Resolve \"Analytics Studio: Subformulas\""),
+ "Analytics Studio: Subformulas"
+ );
+}
+
+#[test]
+fn test_normalize_title_strips_resolve_quotes() { // Resolve wrapper alone (no draft marker) is unwrapped.
+ assert_eq!(
+ normalize_title_for_embedding("Resolve \"RUL Report: Use param_trends from S3\""),
+ "RUL Report: Use param_trends from S3"
+ );
+}
+
+#[test]
+fn test_normalize_title_strips_draft_prefix() { // A lone Draft marker is dropped; the rest is kept verbatim.
+ assert_eq!(
+ normalize_title_for_embedding("Draft: Implement JWT authentication"),
+ "Implement JWT authentication"
+ );
+}
+
+#[test]
+fn test_normalize_title_strips_wip_prefix() { // A lone WIP marker is dropped, same as Draft.
+ assert_eq!(
+ normalize_title_for_embedding("WIP: Implement JWT authentication"),
+ "Implement JWT authentication"
+ );
+}
+
+#[test]
+fn test_normalize_title_strips_draft_wip_combined() { // Stacked markers (Draft then WIP) are all removed.
+ assert_eq!(
+ normalize_title_for_embedding("Draft: WIP: Fix auth"),
+ "Fix auth"
+ );
+}
+
+#[test]
+fn test_normalize_title_no_change_for_normal_title() { // A title without boilerplate comes back unchanged.
+ assert_eq!(
+ normalize_title_for_embedding("Implement JWT authentication"),
+ "Implement JWT authentication"
+ );
+}
+
+#[test]
+fn test_normalize_title_case_insensitive_draft() { // Lowercase draft marker still matches; the Resolve wrapper is unwrapped too.
+ assert_eq!(
+ normalize_title_for_embedding("draft: Resolve \"Some Issue\""),
+ "Some Issue"
+ );
+}
+
+#[test]
+fn test_normalize_title_case_insensitive_wip() { // Lowercase wip marker still matches.
+ assert_eq!(normalize_title_for_embedding("wip: Something"), "Something");
+}
+
+#[test]
+fn test_normalize_title_untitled_passthrough() { // The (untitled) placeholder passes through untouched.
+ assert_eq!(normalize_title_for_embedding("(untitled)"), "(untitled)");
+}
+
+#[test]
+fn test_normalize_title_resolve_without_quotes_unchanged() {
+ // A title that merely starts with the word Resolve (no quoted payload) is left alone
+ assert_eq!(
+ normalize_title_for_embedding("Resolve the flaky test"),
+ "Resolve the flaky test"
+ );
+}
+
+#[test]
+fn test_normalize_title_empty_after_strip_returns_original() {
+ // Edge case: marker only — stripping would leave an empty string, so the input is returned as-is
+ assert_eq!(normalize_title_for_embedding("Draft: "), "Draft: ");
+}
+
+#[test]
+fn test_normalize_title_resolve_empty_quotes() {
+ // Edge case: the quoted payload is empty, so the Resolve wrapper is left in place
+ assert_eq!(
+ normalize_title_for_embedding("Resolve \"\""),
+ "Resolve \"\""
+ );
+}
+
+#[test]
+fn test_normalize_title_non_ascii_does_not_panic() {
+ // Two 4-byte emoji: offsets 4 and 8 are char boundaries, offset 6 (draft probe) is mid-character.
+ // Must not panic — should return the title unchanged.
+ assert_eq!(
+ normalize_title_for_embedding("\u{1F389}\u{1F389} celebration"),
+ "\u{1F389}\u{1F389} celebration"
+ );
+ // Accented characters: the 2-byte U+00DC / U+00FC put offset 8 (resolve probe) mid-character
+ assert_eq!(
+ normalize_title_for_embedding("\u{00DC}berpr\u{00FC}fung der Daten"),
+ "\u{00DC}berpr\u{00FC}fung der Daten"
+ );
+}
+
+// --- MR document uses normalized title in content_text ---
+
+#[test]
+fn test_mr_document_normalizes_draft_resolve_title() {
+ let conn = setup_mr_test_db();
+ insert_mr(
+ &conn,
+ 1,
+ 4064,
+ Some("Draft: Resolve \"Analytics Studio: Subformulas\""),
+ Some("Implements subformula support"),
+ Some("opened"),
+ Some("dev"),
+ Some("feature/subformulas"),
+ Some("main"),
+ None,
+ );
+
+ let doc = extract_mr_document(&conn, 1).unwrap().unwrap();
+ // content_text must open with the MR header built from the stripped title (no Draft/Resolve boilerplate)
+ assert!(
+ doc.content_text
+ .starts_with("[[MergeRequest]] !4064: Analytics Studio: Subformulas\n")
+ );
+ // ...while doc.title keeps the raw title, boilerplate included, for display
+ assert_eq!(
+ doc.title,
+ Some("Draft: Resolve \"Analytics Studio: Subformulas\"".to_string())
+ );
+}
+
+// --- Issue document uses normalized title in content_text ---
+
+#[test]
+fn test_issue_document_normalizes_draft_title() {
+ let conn = setup_test_db();
+ insert_issue(
+ &conn,
+ 1,
+ 100,
+ Some("Draft: WIP: Rethink caching strategy"),
+ Some("We should reconsider..."),
+ "opened",
+ Some("alice"),
+ None,
+ );
+
+ let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
+ assert!(
+ doc.content_text
+ .starts_with("[[Issue]] #100: Rethink caching strategy\n")
+ );
+ // content_text (above) uses the stripped title; doc.title keeps the raw one for display
+ assert_eq!(
+ doc.title,
+ Some("Draft: WIP: Rethink caching strategy".to_string())
+ );
+}
+
#[test]
fn test_source_type_parse_aliases() {
assert_eq!(SourceType::parse("issue"), Some(SourceType::Issue));
diff --git a/src/documents/extractor/issues.rs b/src/documents/extractor/issues.rs
index 972361b..ba6377e 100644
--- a/src/documents/extractor/issues.rs
+++ b/src/documents/extractor/issues.rs
@@ -55,9 +55,10 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result