2026-03-12 17:07:10 -04:00
parent 8ab65a3401
commit fe7d210988
6 changed files with 228 additions and 4 deletions
						
						
							
						
						
						
@@ -73,6 +73,59 @@ pub fn compute_list_hash(items: &[String]) -> String {
    format!("{:x}", hasher.finalize())
    format!("{:x}", hasher.finalize())
}
}
/// Strip GitLab-generated boilerplate from titles before embedding.
///
/// Common patterns that inflate embedding similarity between unrelated entities:
/// - `Draft: Resolve "Actual Title"` → `Actual Title`
/// - `Resolve "Actual Title"` → `Actual Title`
/// - `Draft: Some Title` → `Some Title`
/// - `WIP: Some Title` → `Some Title`
///
/// Matching is ASCII case-insensitive, and `Draft:` / `WIP:` may repeat in any
/// order. The `Resolve "..."` wrapper is only removed when the closing quote is
/// the last non-whitespace character of the title, so a hand-written title such
/// as `Resolve "X" by doing Y` keeps its trailing text instead of being
/// truncated to `X`. If stripping the prefixes would leave nothing, the input
/// is returned unchanged.
///
/// The returned value is always a sub-slice of `title`; nothing is allocated.
///
/// The original title is preserved in `DocumentData.title` for display;
/// this function only affects `content_text` (what gets embedded).
fn normalize_title_for_embedding(title: &str) -> &str {
    let mut rest = title;

    // Peel off leading "Draft:" / "WIP:" markers, in any order and any number
    // of times, tolerating whitespace around each marker.
    loop {
        let candidate = rest.trim_start();
        let stripped = strip_ascii_prefix_ignore_case(candidate, "draft:")
            .or_else(|| strip_ascii_prefix_ignore_case(candidate, "wip:"));
        match stripped {
            Some(tail) => rest = tail.trim_start(),
            None => break,
        }
    }

    if let Some(inner) = strip_resolve_wrapper(rest) {
        return inner;
    }

    // Guard: if stripping left us with nothing, return the original.
    if rest.is_empty() { title } else { rest }
}

/// Return the remainder of `s` after `prefix` when `s` starts with `prefix`
/// (ASCII case-insensitive), or `None` otherwise.
///
/// Uses `str::get`, so a prefix length that lands inside a multi-byte
/// character yields `None` instead of panicking.
fn strip_ascii_prefix_ignore_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
    let head = s.get(..prefix.len())?;
    if head.eq_ignore_ascii_case(prefix) {
        s.get(prefix.len()..)
    } else {
        None
    }
}

/// Return the non-empty text inside a `Resolve "..."` wrapper, or `None` when
/// `s` is not exactly that wrapper (ignoring surrounding whitespace).
///
/// The opening quote must directly follow `Resolve ` and the closing quote must
/// be the final non-whitespace character; quotes inside the title are kept.
fn strip_resolve_wrapper(s: &str) -> Option<&str> {
    let quoted = strip_ascii_prefix_ignore_case(s.trim_start(), "resolve ")?.trim_end();
    let inner = quoted.strip_prefix('"')?.strip_suffix('"')?;
    (!inner.is_empty()).then_some(inner)
}
fn format_date(ms: i64) -> String {
fn format_date(ms: i64) -> String {
    DateTime::from_timestamp_millis(ms)
    DateTime::from_timestamp_millis(ms)
        .map(|dt| dt.format("%Y-%m-%d").to_string())
        .map(|dt| dt.format("%Y-%m-%d").to_string())
						
							
						
						
						
						
 
						
						
							
						
						
						
@@ -156,12 +156,13 @@ pub fn extract_discussion_document(
    let author_username = notes[0].author.clone();
    let author_username = notes[0].author.clone();
    let display_title = parent_title.as_deref().unwrap_or("(untitled)");
    let display_title = parent_title.as_deref().unwrap_or("(untitled)");
    let embed_title = normalize_title_for_embedding(display_title);
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());
    let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());
    let mut content = format!(
    let mut content = format!(
        "[[Discussion]] {}: {}\nProject: {}\n",
        "[[Discussion]] {}: {}\nProject: {}\n",
        parent_type_prefix, display_title, path_with_namespace
        parent_type_prefix, embed_title, path_with_namespace
    );
    );
    if let Some(ref u) = url {
    if let Some(ref u) = url {
        let _ = writeln!(content, "URL: {}", u);
        let _ = writeln!(content, "URL: {}", u);
						
							
						
						
						
						
 
						
						
						
						
@@ -1,5 +1,171 @@
use super::*;
use super::*;
// --- normalize_title_for_embedding tests ---
#[test]
fn test_normalize_title_strips_draft_resolve_quotes() {
    // A `Draft:` marker followed by a `Resolve "..."` wrapper reduces to the quoted text.
    let raw = "Draft: Resolve \"Analytics Studio: Subformulas\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Analytics Studio: Subformulas");
}
#[test]
fn test_normalize_title_strips_resolve_quotes() {
    // A bare `Resolve "..."` wrapper reduces to the quoted text.
    let raw = "Resolve \"RUL Report: Use param_trends from S3\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "RUL Report: Use param_trends from S3");
}
#[test]
fn test_normalize_title_strips_draft_prefix() {
    // A leading `Draft: ` marker is dropped; the rest of the title is kept.
    let raw = "Draft: Implement JWT authentication";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Implement JWT authentication");
}
#[test]
fn test_normalize_title_strips_wip_prefix() {
    // A leading `WIP: ` marker is dropped; the rest of the title is kept.
    let raw = "WIP: Implement JWT authentication";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Implement JWT authentication");
}
#[test]
fn test_normalize_title_strips_draft_wip_combined() {
    // Stacked `Draft:` and `WIP:` markers are both removed.
    let raw = "Draft: WIP: Fix auth";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Fix auth");
}
#[test]
fn test_normalize_title_no_change_for_normal_title() {
    // A title without any boilerplate comes back exactly as given.
    let raw = "Implement JWT authentication";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Implement JWT authentication");
}
#[test]
fn test_normalize_title_case_insensitive_draft() {
    // A lowercase `draft:` marker is recognised just like `Draft:`.
    let raw = "draft: Resolve \"Some Issue\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Some Issue");
}
#[test]
fn test_normalize_title_case_insensitive_wip() {
    // A lowercase `wip:` marker is recognised just like `WIP:`.
    let raw = "wip: Something";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Something");
}
#[test]
fn test_normalize_title_untitled_passthrough() {
    // The "(untitled)" placeholder must pass through untouched.
    let placeholder = "(untitled)";
    let normalized = normalize_title_for_embedding(placeholder);
    assert_eq!(normalized, "(untitled)");
}
#[test]
fn test_normalize_title_resolve_without_quotes_unchanged() {
    // A title that merely begins with the word "Resolve" (no quoted wrapper)
    // is not the GitLab-generated pattern and must be left alone.
    let raw = "Resolve the flaky test";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Resolve the flaky test");
}
#[test]
fn test_normalize_title_empty_after_strip_returns_original() {
    // Edge case: nothing follows the `Draft: ` marker, so stripping would
    // leave an empty string; the original title is returned instead.
    let raw = "Draft: ";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Draft: ");
}
#[test]
fn test_normalize_title_resolve_empty_quotes() {
    // Edge case: the quoted text inside `Resolve ""` is empty, so the title
    // is returned as-is rather than reduced to an empty string.
    let raw = "Resolve \"\"";
    let normalized = normalize_title_for_embedding(raw);
    assert_eq!(normalized, "Resolve \"\"");
}
#[test]
fn test_normalize_title_non_ascii_does_not_panic() {
    // The normalizer probes byte offsets 4 ("wip:"), 6 ("draft:") and
    // 8 ("resolve "). Each title below places one of those offsets in the
    // middle of a multi-byte character; every call must return the title
    // unchanged without panicking.

    // Offset 4 is mid-character: 'a' is 1 byte, then U+1F389 spans bytes 1..5.
    assert_eq!(
        normalize_title_for_embedding("a\u{1F389} party"),
        "a\u{1F389} party"
    );
    // Offset 6 is mid-character: the two 4-byte emoji span bytes 0..4 and 4..8
    // (offsets 4 and 8 are valid boundaries for this input).
    assert_eq!(
        normalize_title_for_embedding("\u{1F389}\u{1F389} celebration"),
        "\u{1F389}\u{1F389} celebration"
    );
    // Offset 8 is mid-character: U+00DC spans bytes 0..2 and U+00FC spans bytes 7..9.
    assert_eq!(
        normalize_title_for_embedding("\u{00DC}berpr\u{00FC}fung der Daten"),
        "\u{00DC}berpr\u{00FC}fung der Daten"
    );
}
// --- MR document uses normalized title in content_text ---
#[test]
fn test_mr_document_normalizes_draft_resolve_title() {
    // Raw MR title carrying both the `Draft:` marker and the `Resolve "..."` wrapper.
    let original_title = "Draft: Resolve \"Analytics Studio: Subformulas\"";

    let conn = setup_mr_test_db();
    insert_mr(
        &conn,
        1,
        4064,
        Some(original_title),
        Some("Implements subformula support"),
        Some("opened"),
        Some("dev"),
        Some("feature/subformulas"),
        Some("main"),
        None,
    );
    let document = extract_mr_document(&conn, 1).unwrap().unwrap();

    // The embedded text opens with a header built from the stripped title...
    let expected_header = "[[MergeRequest]] !4064: Analytics Studio: Subformulas\n";
    assert!(document.content_text.starts_with(expected_header));

    // ...while `title` still holds the original string for display.
    assert_eq!(document.title, Some(original_title.to_string()));
}
// --- Issue document uses normalized title in content_text ---
#[test]
fn test_issue_document_normalizes_draft_title() {
    // Raw issue title with stacked `Draft:` and `WIP:` markers.
    let original_title = "Draft: WIP: Rethink caching strategy";

    let conn = setup_test_db();
    insert_issue(
        &conn,
        1,
        100,
        Some(original_title),
        Some("We should reconsider..."),
        "opened",
        Some("alice"),
        None,
    );
    let document = extract_issue_document(&conn, 1).unwrap().unwrap();

    // The embedded text opens with a header built from the stripped title...
    let expected_header = "[[Issue]] #100: Rethink caching strategy\n";
    assert!(document.content_text.starts_with(expected_header));

    // ...while `title` still holds the original string for display.
    assert_eq!(document.title, Some(original_title.to_string()));
}
#[test]
#[test]
fn test_source_type_parse_aliases() {
fn test_source_type_parse_aliases() {
    assert_eq!(SourceType::parse("issue"), Some(SourceType::Issue));
    assert_eq!(SourceType::parse("issue"), Some(SourceType::Issue));
						
							
						
						
						
						
 
						
						
							
						
						
						
@@ -55,9 +55,10 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let display_title = title.as_deref().unwrap_or("(untitled)");
    let display_title = title.as_deref().unwrap_or("(untitled)");
    let embed_title = normalize_title_for_embedding(display_title);
    let mut content = format!(
    let mut content = format!(
        "[[Issue]] #{}: {}\nProject: {}\n",
        "[[Issue]] #{}: {}\nProject: {}\n",
        iid, display_title, path_with_namespace
        iid, embed_title, path_with_namespace
    );
    );
    if let Some(ref url) = web_url {
    if let Some(ref url) = web_url {
        let _ = writeln!(content, "URL: {}", url);
        let _ = writeln!(content, "URL: {}", url);
						
							
						
						
						
						
 
						
						
							
						
						
						
@@ -60,10 +60,11 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
    let display_title = title.as_deref().unwrap_or("(untitled)");
    let display_title = title.as_deref().unwrap_or("(untitled)");
    let embed_title = normalize_title_for_embedding(display_title);
    let display_state = state.as_deref().unwrap_or("unknown");
    let display_state = state.as_deref().unwrap_or("unknown");
    let mut content = format!(
    let mut content = format!(
        "[[MergeRequest]] !{}: {}\nProject: {}\n",
        "[[MergeRequest]] !{}: {}\nProject: {}\n",
        iid, display_title, path_with_namespace
        iid, embed_title, path_with_namespace
    );
    );
    if let Some(ref url) = web_url {
    if let Some(ref url) = web_url {
        let _ = writeln!(content, "URL: {}", url);
        let _ = writeln!(content, "URL: {}", url);
						
							
						
						
						
						
 
						
						
							
						
						
						
@@ -439,6 +439,7 @@ fn build_note_document(
    let url = parent_web_url.map(|wu| format!("{}#note_{}", wu, gitlab_id));
    let url = parent_web_url.map(|wu| format!("{}#note_{}", wu, gitlab_id));
    let display_title = parent_title.unwrap_or("(untitled)");
    let display_title = parent_title.unwrap_or("(untitled)");
    let embed_title = normalize_title_for_embedding(display_title);
    let display_note_type = note_type.as_deref().unwrap_or("Note");
    let display_note_type = note_type.as_deref().unwrap_or("Note");
    let display_author = author_username.as_deref().unwrap_or("unknown");
    let display_author = author_username.as_deref().unwrap_or("unknown");
    let parent_prefix = if parent_type_label == "Issue" {
    let parent_prefix = if parent_type_label == "Issue" {
						
						
						
							
						
						
@@ -447,6 +448,7 @@ fn build_note_document(
        format!("MR !{}", parent_iid)
        format!("MR !{}", parent_iid)
    };
    };
    // Display title uses original (for human-readable output)
    let title = format!(
    let title = format!(
        "Note by @{} on {}: {}",
        "Note by @{} on {}: {}",
        display_author, parent_prefix, display_title
        display_author, parent_prefix, display_title
						
						
						
							
						
						
@@ -461,7 +463,7 @@ fn build_note_document(
    let _ = writeln!(content, "project: {}", path_with_namespace);
    let _ = writeln!(content, "project: {}", path_with_namespace);
    let _ = writeln!(content, "parent_type: {}", parent_type_label);
    let _ = writeln!(content, "parent_type: {}", parent_type_label);
    let _ = writeln!(content, "parent_iid: {}", parent_iid);
    let _ = writeln!(content, "parent_iid: {}", parent_iid);
    let _ = writeln!(content, "parent_title: {}", display_title);
    let _ = writeln!(content, "parent_title: {}", embed_title);
    let _ = writeln!(content, "note_type: {}", display_note_type);
    let _ = writeln!(content, "note_type: {}", display_note_type);
    let _ = writeln!(content, "author: @{}", display_author);
    let _ = writeln!(content, "author: @{}", display_author);
    let _ = writeln!(content, "created_at: {}", ms_to_iso(created_at));
    let _ = writeln!(content, "created_at: {}", ms_to_iso(created_at));