feat(embedding): strip GitLab boilerplate from titles before embedding
GitLab auto-generates MR titles like "Draft: Resolve \"Issue Title\"" when creating MRs from issues. This 4-token boilerplate prefix dominated the embedding vectors, causing unrelated MRs with the same title structure to appear as highly similar in "lore related" results (0.667 similarity vs 0.674 for the actual parent issue — a difference of only 0.007).

Add normalize_title_for_embedding(), which deterministically strips:
- "Draft: " prefix (case-insensitive)
- "WIP: " prefix (case-insensitive)
- "Resolve \"...\"" wrapper (extracts the inner title)
- Combinations: "Draft: Resolve \"...\""

The normalization is applied in all four document extractors (issues, MRs, discussions, notes) to the content_text field only. DocumentData.title preserves the original title for human-readable display in CLI output. Since content_text changes, content_hash will differ from stored values, triggering automatic re-embedding on the next "lore embed" run.

Uses str::get() for the byte-offset prefix checks, so titles containing emoji or other multi-byte UTF-8 characters cannot cause a slicing panic.

15 new tests covering: all boilerplate patterns, case insensitivity, edge cases (empty inner text, no-op for normal titles), UTF-8 safety, and end-to-end document extraction with boilerplate titles.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -73,6 +73,59 @@ pub fn compute_list_hash(items: &[String]) -> String {
|
|||||||
format!("{:x}", hasher.finalize())
|
format!("{:x}", hasher.finalize())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Strip GitLab-generated boilerplate from titles before embedding.
///
/// Common patterns that inflate embedding similarity between unrelated entities:
/// - `Draft: Resolve "Actual Title"` → `Actual Title`
/// - `Resolve "Actual Title"` → `Actual Title`
/// - `Draft: Some Title` → `Some Title`
/// - `WIP: Some Title` → `Some Title`
///
/// Matching is ASCII case-insensitive. The `Draft:` / `WIP:` prefixes may repeat
/// in any order, and whitespace before or between them is ignored. For the
/// `Resolve "..."` wrapper, everything between the opening quote and the *last*
/// quote in the title is returned.
///
/// Fallbacks:
/// - If stripping the prefixes leaves nothing (e.g. `"Draft: "`), the original
///   title is returned unchanged.
/// - A wrapper with empty inner text (`Resolve ""`) is left as is.
///
/// The original title is preserved in `DocumentData.title` for display;
/// this function only affects `content_text` (what gets embedded).
fn normalize_title_for_embedding(title: &str) -> &str {
    // Trim once up front so that leading whitespace is handled the same way
    // whether or not a `Draft:` / `WIP:` prefix follows it.
    let mut rest = title.trim_start();

    // Strip leading "Draft:" and/or "WIP:" (case-insensitive, repeatable).
    while let Some(tail) = strip_prefix_ignore_ascii_case(rest, "draft:")
        .or_else(|| strip_prefix_ignore_ascii_case(rest, "wip:"))
    {
        rest = tail.trim_start();
    }

    // Strip the `Resolve "..."` wrapper (case-insensitive). `rfind` picks the
    // last quote so titles that themselves contain quotes stay intact.
    let unwrapped = strip_prefix_ignore_ascii_case(rest, "resolve \"")
        .and_then(|tail| tail.rfind('"').map(|end| &tail[..end]))
        .filter(|inner| !inner.is_empty());
    if let Some(inner) = unwrapped {
        return inner;
    }

    // Guard: if stripping left us with nothing, return the original.
    if rest.is_empty() { title } else { rest }
}

/// Return the remainder of `s` after `prefix` when `s` starts with `prefix`
/// (compared ASCII case-insensitively), or `None` otherwise.
///
/// Uses `str::get`, which yields `None` instead of panicking when the prefix
/// length is past the end of `s` or lands inside a multi-byte character.
fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
    let head = s.get(..prefix.len())?;
    let tail = s.get(prefix.len()..)?;
    head.eq_ignore_ascii_case(prefix).then_some(tail)
}
|
||||||
|
|
||||||
fn format_date(ms: i64) -> String {
|
fn format_date(ms: i64) -> String {
|
||||||
DateTime::from_timestamp_millis(ms)
|
DateTime::from_timestamp_millis(ms)
|
||||||
.map(|dt| dt.format("%Y-%m-%d").to_string())
|
.map(|dt| dt.format("%Y-%m-%d").to_string())
|
||||||
|
|||||||
@@ -156,12 +156,13 @@ pub fn extract_discussion_document(
|
|||||||
let author_username = notes[0].author.clone();
|
let author_username = notes[0].author.clone();
|
||||||
|
|
||||||
let display_title = parent_title.as_deref().unwrap_or("(untitled)");
|
let display_title = parent_title.as_deref().unwrap_or("(untitled)");
|
||||||
|
let embed_title = normalize_title_for_embedding(display_title);
|
||||||
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
|
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
|
||||||
let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());
|
let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());
|
||||||
|
|
||||||
let mut content = format!(
|
let mut content = format!(
|
||||||
"[[Discussion]] {}: {}\nProject: {}\n",
|
"[[Discussion]] {}: {}\nProject: {}\n",
|
||||||
parent_type_prefix, display_title, path_with_namespace
|
parent_type_prefix, embed_title, path_with_namespace
|
||||||
);
|
);
|
||||||
if let Some(ref u) = url {
|
if let Some(ref u) = url {
|
||||||
let _ = writeln!(content, "URL: {}", u);
|
let _ = writeln!(content, "URL: {}", u);
|
||||||
|
|||||||
@@ -1,5 +1,171 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
// --- normalize_title_for_embedding tests ---
|
||||||
|
|
||||||
|
/// `Draft: Resolve "..."` collapses to the quoted inner title.
#[test]
fn test_normalize_title_strips_draft_resolve_quotes() {
    let raw = "Draft: Resolve \"Analytics Studio: Subformulas\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Analytics Studio: Subformulas");
}
|
||||||
|
|
||||||
|
/// A bare `Resolve "..."` wrapper is unwrapped to the inner title.
#[test]
fn test_normalize_title_strips_resolve_quotes() {
    let raw = "Resolve \"RUL Report: Use param_trends from S3\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "RUL Report: Use param_trends from S3");
}
|
||||||
|
|
||||||
|
/// A leading `Draft: ` prefix is removed.
#[test]
fn test_normalize_title_strips_draft_prefix() {
    let raw = "Draft: Implement JWT authentication";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Implement JWT authentication");
}
|
||||||
|
|
||||||
|
/// A leading `WIP: ` prefix is removed.
#[test]
fn test_normalize_title_strips_wip_prefix() {
    let raw = "WIP: Implement JWT authentication";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Implement JWT authentication");
}
|
||||||
|
|
||||||
|
/// Stacked `Draft: WIP: ` prefixes are all removed.
#[test]
fn test_normalize_title_strips_draft_wip_combined() {
    let raw = "Draft: WIP: Fix auth";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Fix auth");
}
|
||||||
|
|
||||||
|
/// A title without any boilerplate comes back unchanged.
#[test]
fn test_normalize_title_no_change_for_normal_title() {
    let raw = "Implement JWT authentication";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Implement JWT authentication");
}
|
||||||
|
|
||||||
|
/// The `draft:` prefix is matched regardless of letter case.
#[test]
fn test_normalize_title_case_insensitive_draft() {
    let raw = "draft: Resolve \"Some Issue\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Some Issue");
}
|
||||||
|
|
||||||
|
/// The `wip:` prefix is matched regardless of letter case.
#[test]
fn test_normalize_title_case_insensitive_wip() {
    let raw = "wip: Something";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Something");
}
|
||||||
|
|
||||||
|
/// The `(untitled)` placeholder passes through untouched.
#[test]
fn test_normalize_title_untitled_passthrough() {
    let placeholder = "(untitled)";
    let normalized = normalize_title_for_embedding(placeholder);
    assert_eq!(normalized, "(untitled)");
}
|
||||||
|
|
||||||
|
/// "Resolve something" without a quoted wrapper is not the GitLab pattern,
/// so the title must be left alone.
#[test]
fn test_normalize_title_resolve_without_quotes_unchanged() {
    let raw = "Resolve the flaky test";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Resolve the flaky test");
}
|
||||||
|
|
||||||
|
/// Edge case: `Draft: ` with nothing after it falls back to the original title
/// rather than producing an empty string.
#[test]
fn test_normalize_title_empty_after_strip_returns_original() {
    let raw = "Draft: ";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Draft: ");
}
|
||||||
|
|
||||||
|
/// Edge case: `Resolve ""` has empty inner text, so the title is returned as is.
#[test]
fn test_normalize_title_resolve_empty_quotes() {
    let raw = "Resolve \"\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Resolve \"\"");
}
|
||||||
|
|
||||||
|
/// Titles whose leading bytes are multi-byte UTF-8 must pass through unchanged
/// and must not panic.
#[test]
fn test_normalize_title_non_ascii_does_not_panic() {
    // Two 4-byte emoji: byte offset 6 (the length of "draft:") falls in the
    // middle of the second emoji, while offsets 4 and 8 are character boundaries.
    let emoji_title = "\u{1F389}\u{1F389} celebration";
    assert_eq!(
        normalize_title_for_embedding(emoji_title),
        "\u{1F389}\u{1F389} celebration"
    );

    // Accented characters: `Ü` and `ü` are 2 bytes each, so byte offset 8
    // (the length of "resolve ") falls in the middle of `ü`.
    let accented_title = "\u{00DC}berpr\u{00FC}fung der Daten";
    assert_eq!(
        normalize_title_for_embedding(accented_title),
        "\u{00DC}berpr\u{00FC}fung der Daten"
    );
}
|
||||||
|
|
||||||
|
// --- MR document uses normalized title in content_text ---
|
||||||
|
|
||||||
|
/// An MR stored with a `Draft: Resolve "..."` title is extracted with the bare
/// inner title in `content_text`, while `doc.title` keeps the raw title.
#[test]
fn test_mr_document_normalizes_draft_resolve_title() {
    let raw_title = "Draft: Resolve \"Analytics Studio: Subformulas\"";

    let conn = setup_mr_test_db();
    insert_mr(
        &conn,
        1,
        4064,
        Some(raw_title),
        Some("Implements subformula support"),
        Some("opened"),
        Some("dev"),
        Some("feature/subformulas"),
        Some("main"),
        None,
    );

    let doc = extract_mr_document(&conn, 1).unwrap().unwrap();

    // The embedded text starts with the normalized title (no boilerplate) ...
    let expected_header = "[[MergeRequest]] !4064: Analytics Studio: Subformulas\n";
    assert!(doc.content_text.starts_with(expected_header));

    // ... but the display title is the original, untouched.
    assert_eq!(doc.title, Some(raw_title.to_string()));
}
|
||||||
|
|
||||||
|
// --- Issue document uses normalized title in content_text ---
|
||||||
|
|
||||||
|
/// An issue stored with stacked `Draft: WIP: ` prefixes is extracted with the
/// prefixes removed from `content_text`, while `doc.title` keeps the raw title.
#[test]
fn test_issue_document_normalizes_draft_title() {
    let raw_title = "Draft: WIP: Rethink caching strategy";

    let conn = setup_test_db();
    insert_issue(
        &conn,
        1,
        100,
        Some(raw_title),
        Some("We should reconsider..."),
        "opened",
        Some("alice"),
        None,
    );

    let doc = extract_issue_document(&conn, 1).unwrap().unwrap();

    // The embedded text starts with the normalized title.
    let expected_header = "[[Issue]] #100: Rethink caching strategy\n";
    assert!(doc.content_text.starts_with(expected_header));

    // Original title preserved for display.
    assert_eq!(doc.title, Some(raw_title.to_string()));
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_source_type_parse_aliases() {
|
fn test_source_type_parse_aliases() {
|
||||||
assert_eq!(SourceType::parse("issue"), Some(SourceType::Issue));
|
assert_eq!(SourceType::parse("issue"), Some(SourceType::Issue));
|
||||||
|
|||||||
@@ -55,9 +55,10 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
|
|||||||
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
|
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
|
||||||
|
|
||||||
let display_title = title.as_deref().unwrap_or("(untitled)");
|
let display_title = title.as_deref().unwrap_or("(untitled)");
|
||||||
|
let embed_title = normalize_title_for_embedding(display_title);
|
||||||
let mut content = format!(
|
let mut content = format!(
|
||||||
"[[Issue]] #{}: {}\nProject: {}\n",
|
"[[Issue]] #{}: {}\nProject: {}\n",
|
||||||
iid, display_title, path_with_namespace
|
iid, embed_title, path_with_namespace
|
||||||
);
|
);
|
||||||
if let Some(ref url) = web_url {
|
if let Some(ref url) = web_url {
|
||||||
let _ = writeln!(content, "URL: {}", url);
|
let _ = writeln!(content, "URL: {}", url);
|
||||||
|
|||||||
@@ -60,10 +60,11 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
|
|||||||
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
|
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
|
||||||
|
|
||||||
let display_title = title.as_deref().unwrap_or("(untitled)");
|
let display_title = title.as_deref().unwrap_or("(untitled)");
|
||||||
|
let embed_title = normalize_title_for_embedding(display_title);
|
||||||
let display_state = state.as_deref().unwrap_or("unknown");
|
let display_state = state.as_deref().unwrap_or("unknown");
|
||||||
let mut content = format!(
|
let mut content = format!(
|
||||||
"[[MergeRequest]] !{}: {}\nProject: {}\n",
|
"[[MergeRequest]] !{}: {}\nProject: {}\n",
|
||||||
iid, display_title, path_with_namespace
|
iid, embed_title, path_with_namespace
|
||||||
);
|
);
|
||||||
if let Some(ref url) = web_url {
|
if let Some(ref url) = web_url {
|
||||||
let _ = writeln!(content, "URL: {}", url);
|
let _ = writeln!(content, "URL: {}", url);
|
||||||
|
|||||||
@@ -439,6 +439,7 @@ fn build_note_document(
|
|||||||
let url = parent_web_url.map(|wu| format!("{}#note_{}", wu, gitlab_id));
|
let url = parent_web_url.map(|wu| format!("{}#note_{}", wu, gitlab_id));
|
||||||
|
|
||||||
let display_title = parent_title.unwrap_or("(untitled)");
|
let display_title = parent_title.unwrap_or("(untitled)");
|
||||||
|
let embed_title = normalize_title_for_embedding(display_title);
|
||||||
let display_note_type = note_type.as_deref().unwrap_or("Note");
|
let display_note_type = note_type.as_deref().unwrap_or("Note");
|
||||||
let display_author = author_username.as_deref().unwrap_or("unknown");
|
let display_author = author_username.as_deref().unwrap_or("unknown");
|
||||||
let parent_prefix = if parent_type_label == "Issue" {
|
let parent_prefix = if parent_type_label == "Issue" {
|
||||||
@@ -447,6 +448,7 @@ fn build_note_document(
|
|||||||
format!("MR !{}", parent_iid)
|
format!("MR !{}", parent_iid)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Display title uses original (for human-readable output)
|
||||||
let title = format!(
|
let title = format!(
|
||||||
"Note by @{} on {}: {}",
|
"Note by @{} on {}: {}",
|
||||||
display_author, parent_prefix, display_title
|
display_author, parent_prefix, display_title
|
||||||
@@ -461,7 +463,7 @@ fn build_note_document(
|
|||||||
let _ = writeln!(content, "project: {}", path_with_namespace);
|
let _ = writeln!(content, "project: {}", path_with_namespace);
|
||||||
let _ = writeln!(content, "parent_type: {}", parent_type_label);
|
let _ = writeln!(content, "parent_type: {}", parent_type_label);
|
||||||
let _ = writeln!(content, "parent_iid: {}", parent_iid);
|
let _ = writeln!(content, "parent_iid: {}", parent_iid);
|
||||||
let _ = writeln!(content, "parent_title: {}", display_title);
|
let _ = writeln!(content, "parent_title: {}", embed_title);
|
||||||
let _ = writeln!(content, "note_type: {}", display_note_type);
|
let _ = writeln!(content, "note_type: {}", display_note_type);
|
||||||
let _ = writeln!(content, "author: @{}", display_author);
|
let _ = writeln!(content, "author: @{}", display_author);
|
||||||
let _ = writeln!(content, "created_at: {}", ms_to_iso(created_at));
|
let _ = writeln!(content, "created_at: {}", ms_to_iso(created_at));
|
||||||
|
|||||||
Reference in New Issue
Block a user