Files
gitlore/src/documents/extractor/issues.rs
teernisse fe7d210988 feat(embedding): strip GitLab boilerplate from titles before embedding
GitLab auto-generates MR titles like "Draft: Resolve \"Issue Title\""
when creating MRs from issues. This 4-token boilerplate prefix dominated
the embedding vectors, causing unrelated MRs with the same title structure
to appear as highly similar in "lore related" results (0.667 similarity
vs 0.674 for the actual parent issue — a difference of only 0.007).

Add normalize_title_for_embedding() which deterministically strips:
- "Draft: " prefix (case-insensitive)
- "WIP: " prefix (case-insensitive)
- "Resolve \"...\"" wrapper (extracts inner title)
- Combinations: "Draft: Resolve \"...\""

The normalization is applied in all four document extractors (issues, MRs,
discussions, notes) to the content_text field only. DocumentData.title
preserves the original title for human-readable display in CLI output.

Since content_text changes, content_hash will differ from stored values,
triggering automatic re-embedding on the next "lore embed" run.

Uses str::get() for all byte-offset slicing to prevent panics on titles
containing emoji or other multi-byte UTF-8 characters.

15 new tests covering: all boilerplate patterns, case insensitivity,
edge cases (empty inner text, no-op for normal titles), UTF-8 safety,
and end-to-end document extraction with boilerplate titles.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 17:07:23 -04:00

112 lines
3.6 KiB
Rust

/// Assembles the [`DocumentData`] for the issue whose row id is `issue_id`.
///
/// The issue row is joined with its project, its label names are loaded in
/// name order, and a text document is rendered from a header line, optional
/// URL, labels, state, optional author and optional description.
///
/// The title written into the rendered text is passed through
/// `normalize_title_for_embedding`, while `DocumentData::title` keeps the
/// stored title (or `"(untitled)"` when the column is NULL).
///
/// # Returns
///
/// * `Ok(None)` when no issue row matches `issue_id`.
/// * `Ok(Some(_))` with the rendered, hard-capped document otherwise.
///
/// # Errors
///
/// Any database error other than "no rows" is converted and returned.
pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option<DocumentData>> {
    // The SQL text is kept verbatim; its continuation lines sit at column 0
    // because they are part of the string literal.
    let lookup = conn.query_row(
        "SELECT i.id, i.iid, i.title, i.description, i.state, i.author_username,
i.created_at, i.updated_at, i.web_url,
p.path_with_namespace, p.id AS project_id
FROM issues i
JOIN projects p ON p.id = i.project_id
WHERE i.id = ?1",
        rusqlite::params![issue_id],
        |r| {
            // Column indexes follow the SELECT list order above.
            let id = r.get::<_, i64>(0)?;
            let iid = r.get::<_, i64>(1)?;
            let title = r.get::<_, Option<String>>(2)?;
            let description = r.get::<_, Option<String>>(3)?;
            let state = r.get::<_, String>(4)?;
            let author_username = r.get::<_, Option<String>>(5)?;
            let created_at = r.get::<_, i64>(6)?;
            let updated_at = r.get::<_, i64>(7)?;
            let web_url = r.get::<_, Option<String>>(8)?;
            let path_with_namespace = r.get::<_, String>(9)?;
            let project_id = r.get::<_, i64>(10)?;
            Ok((
                id,
                iid,
                title,
                description,
                state,
                author_username,
                created_at,
                updated_at,
                web_url,
                path_with_namespace,
                project_id,
            ))
        },
    );

    // A missing issue is not an error for callers; everything else is.
    let (
        id,
        iid,
        title,
        description,
        state,
        author_username,
        created_at,
        updated_at,
        web_url,
        path_with_namespace,
        project_id,
    ) = match lookup {
        Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
        Err(other) => return Err(other.into()),
        Ok(columns) => columns,
    };

    // Label names attached to this issue, ordered by name by the query.
    let mut label_stmt = conn.prepare_cached(
        "SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id
WHERE il.issue_id = ?1
ORDER BY l.name",
    )?;
    let label_rows = label_stmt.query_map(rusqlite::params![id], |r| r.get::<_, String>(0))?;
    let labels: Vec<String> = label_rows.collect::<std::result::Result<Vec<_>, _>>()?;
    let labels_json = match serde_json::to_string(&labels) {
        Ok(json) => json,
        Err(_) => "[]".to_string(),
    };

    // The display title is what users see; the normalized form is what goes
    // into the rendered text.
    let display_title = title.as_deref().unwrap_or("(untitled)");
    let embed_title = normalize_title_for_embedding(display_title);

    // Header section. Writing into a `String` cannot fail, so the
    // `fmt::Result` values are intentionally discarded.
    let mut content = format!(
        "[[Issue]] #{}: {}\nProject: {}\n",
        iid, embed_title, path_with_namespace
    );
    if let Some(url) = &web_url {
        let _ = writeln!(content, "URL: {}", url);
    }
    let _ = writeln!(content, "Labels: {}", labels_json);
    let _ = writeln!(content, "State: {}", state);
    if let Some(author) = &author_username {
        let _ = writeln!(content, "Author: @{}", author);
    }

    // Description section, present only when the column is non-NULL.
    if let Some(desc) = &description {
        // Pre-truncate before appending so an oversized description never
        // gets copied into the document in full.
        let clipped = pre_truncate_description(desc, MAX_DOCUMENT_BYTES_HARD);
        if clipped.was_truncated {
            warn!(
                iid,
                original_bytes = clipped.original_bytes,
                "Issue description truncated (oversized)"
            );
        }
        content.push_str("\n--- Description ---\n\n");
        content.push_str(&clipped.content);
    }

    // The hard cap is applied to the whole rendered text, and the content
    // hash is taken over the capped text that is actually stored.
    let capped = truncate_hard_cap(&content);
    let content_hash = compute_content_hash(&capped.content);
    let labels_hash = compute_list_hash(&labels);
    let paths_hash = compute_list_hash(&[]);

    let document = DocumentData {
        source_type: SourceType::Issue,
        source_id: id,
        project_id,
        author_username,
        labels,
        paths: Vec::new(),
        labels_hash,
        paths_hash,
        created_at,
        updated_at,
        url: web_url,
        title: Some(display_title.to_string()),
        content_text: capped.content,
        content_hash,
        is_truncated: capped.is_truncated,
        truncated_reason: capped.reason.map(|why| why.as_str().to_string()),
    };
    Ok(Some(document))
}