GitLab auto-generates MR titles like "Draft: Resolve \"Issue Title\"" when creating MRs from issues. This 4-token boilerplate prefix dominated the embedding vectors, causing unrelated MRs with the same title structure to appear as highly similar in "lore related" results (0.667 similarity vs 0.674 for the actual parent issue — a difference of only 0.007).

Add normalize_title_for_embedding() which deterministically strips:
- "Draft: " prefix (case-insensitive)
- "WIP: " prefix (case-insensitive)
- "Resolve \"...\"" wrapper (extracts inner title)
- Combinations: "Draft: Resolve \"...\""

The normalization is applied in all four document extractors (issues, MRs, discussions, notes) to the content_text field only. DocumentData.title preserves the original title for human-readable display in CLI output. Since content_text changes, content_hash will differ from stored values, triggering automatic re-embedding on the next "lore embed" run.

Uses str::get() for all byte-offset slicing to prevent panics on titles containing emoji or other multi-byte UTF-8 characters.

15 new tests covering: all boilerplate patterns, case insensitivity, edge cases (empty inner text, no-op for normal titles), UTF-8 safety, and end-to-end document extraction with boilerplate titles.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
517 lines
17 KiB
Rust
517 lines
17 KiB
Rust
pub fn extract_note_document(conn: &Connection, note_id: i64) -> Result<Option<DocumentData>> {
|
|
let row = conn.query_row(
|
|
"SELECT n.id, n.gitlab_id, n.author_username, n.body, n.note_type, n.is_system,
|
|
n.created_at, n.updated_at, n.position_new_path, n.position_new_line,
|
|
n.position_old_path, n.position_old_line, n.resolvable, n.resolved, n.resolved_by,
|
|
d.noteable_type, d.issue_id, d.merge_request_id,
|
|
p.path_with_namespace, p.id AS project_id
|
|
FROM notes n
|
|
JOIN discussions d ON n.discussion_id = d.id
|
|
JOIN projects p ON n.project_id = p.id
|
|
WHERE n.id = ?1",
|
|
rusqlite::params![note_id],
|
|
|row| {
|
|
Ok((
|
|
row.get::<_, i64>(0)?,
|
|
row.get::<_, i64>(1)?,
|
|
row.get::<_, Option<String>>(2)?,
|
|
row.get::<_, Option<String>>(3)?,
|
|
row.get::<_, Option<String>>(4)?,
|
|
row.get::<_, bool>(5)?,
|
|
row.get::<_, i64>(6)?,
|
|
row.get::<_, i64>(7)?,
|
|
row.get::<_, Option<String>>(8)?,
|
|
row.get::<_, Option<i64>>(9)?,
|
|
row.get::<_, Option<String>>(10)?,
|
|
row.get::<_, Option<i64>>(11)?,
|
|
row.get::<_, bool>(12)?,
|
|
row.get::<_, bool>(13)?,
|
|
row.get::<_, Option<String>>(14)?,
|
|
row.get::<_, String>(15)?,
|
|
row.get::<_, Option<i64>>(16)?,
|
|
row.get::<_, Option<i64>>(17)?,
|
|
row.get::<_, String>(18)?,
|
|
row.get::<_, i64>(19)?,
|
|
))
|
|
},
|
|
);
|
|
|
|
let (
|
|
_id,
|
|
gitlab_id,
|
|
author_username,
|
|
body,
|
|
note_type,
|
|
is_system,
|
|
created_at,
|
|
updated_at,
|
|
position_new_path,
|
|
position_new_line,
|
|
position_old_path,
|
|
_position_old_line,
|
|
resolvable,
|
|
resolved,
|
|
_resolved_by,
|
|
noteable_type,
|
|
issue_id,
|
|
merge_request_id,
|
|
path_with_namespace,
|
|
project_id,
|
|
) = match row {
|
|
Ok(r) => r,
|
|
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
|
|
if is_system {
|
|
return Ok(None);
|
|
}
|
|
|
|
let (parent_iid, parent_title, parent_web_url, parent_type_label, labels) =
|
|
match noteable_type.as_str() {
|
|
"Issue" => {
|
|
let parent_id = match issue_id {
|
|
Some(pid) => pid,
|
|
None => return Ok(None),
|
|
};
|
|
let parent = conn.query_row(
|
|
"SELECT i.iid, i.title, i.web_url FROM issues i WHERE i.id = ?1",
|
|
rusqlite::params![parent_id],
|
|
|row| {
|
|
Ok((
|
|
row.get::<_, i64>(0)?,
|
|
row.get::<_, Option<String>>(1)?,
|
|
row.get::<_, Option<String>>(2)?,
|
|
))
|
|
},
|
|
);
|
|
let (iid, title, web_url) = match parent {
|
|
Ok(r) => r,
|
|
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let mut label_stmt = conn.prepare_cached(
|
|
"SELECT l.name FROM issue_labels il
|
|
JOIN labels l ON l.id = il.label_id
|
|
WHERE il.issue_id = ?1
|
|
ORDER BY l.name",
|
|
)?;
|
|
let labels: Vec<String> = label_stmt
|
|
.query_map(rusqlite::params![parent_id], |row| row.get(0))?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
(iid, title, web_url, "Issue", labels)
|
|
}
|
|
"MergeRequest" => {
|
|
let parent_id = match merge_request_id {
|
|
Some(pid) => pid,
|
|
None => return Ok(None),
|
|
};
|
|
let parent = conn.query_row(
|
|
"SELECT m.iid, m.title, m.web_url FROM merge_requests m WHERE m.id = ?1",
|
|
rusqlite::params![parent_id],
|
|
|row| {
|
|
Ok((
|
|
row.get::<_, i64>(0)?,
|
|
row.get::<_, Option<String>>(1)?,
|
|
row.get::<_, Option<String>>(2)?,
|
|
))
|
|
},
|
|
);
|
|
let (iid, title, web_url) = match parent {
|
|
Ok(r) => r,
|
|
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let mut label_stmt = conn.prepare_cached(
|
|
"SELECT l.name FROM mr_labels ml
|
|
JOIN labels l ON l.id = ml.label_id
|
|
WHERE ml.merge_request_id = ?1
|
|
ORDER BY l.name",
|
|
)?;
|
|
let labels: Vec<String> = label_stmt
|
|
.query_map(rusqlite::params![parent_id], |row| row.get(0))?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
(iid, title, web_url, "MergeRequest", labels)
|
|
}
|
|
_ => return Ok(None),
|
|
};
|
|
|
|
build_note_document(
|
|
note_id,
|
|
gitlab_id,
|
|
author_username,
|
|
body,
|
|
note_type,
|
|
created_at,
|
|
updated_at,
|
|
position_new_path,
|
|
position_new_line,
|
|
position_old_path,
|
|
resolvable,
|
|
resolved,
|
|
parent_iid,
|
|
parent_title.as_deref(),
|
|
parent_web_url.as_deref(),
|
|
&labels,
|
|
parent_type_label,
|
|
&path_with_namespace,
|
|
project_id,
|
|
)
|
|
}
|
|
|
|
/// Fields of a note's parent (an issue or a merge request) that note
/// documents are built from.
#[derive(Debug, Clone)]
pub struct ParentMetadata {
    /// `iid` column of the parent row.
    pub iid: i64,
    /// `title` column of the parent row; `None` when the column is NULL.
    pub title: Option<String>,
    /// `web_url` column of the parent row; `None` when the column is NULL.
    pub web_url: Option<String>,
    /// Names of the labels attached to the parent, ordered by name.
    pub labels: Vec<String>,
    /// Project path that was supplied when the metadata was fetched.
    pub project_path: String,
}
|
|
|
|
pub struct ParentMetadataCache {
|
|
cache: HashMap<(String, i64), Option<ParentMetadata>>,
|
|
}
|
|
|
|
impl Default for ParentMetadataCache {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl ParentMetadataCache {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
cache: HashMap::new(),
|
|
}
|
|
}
|
|
|
|
pub fn get_or_fetch(
|
|
&mut self,
|
|
conn: &Connection,
|
|
noteable_type: &str,
|
|
parent_id: i64,
|
|
project_path: &str,
|
|
) -> Result<Option<&ParentMetadata>> {
|
|
let key = (noteable_type.to_string(), parent_id);
|
|
if !self.cache.contains_key(&key) {
|
|
let meta = fetch_parent_metadata(conn, noteable_type, parent_id, project_path)?;
|
|
self.cache.insert(key.clone(), meta);
|
|
}
|
|
Ok(self.cache.get(&key).and_then(|m| m.as_ref()))
|
|
}
|
|
}
|
|
|
|
fn fetch_parent_metadata(
|
|
conn: &Connection,
|
|
noteable_type: &str,
|
|
parent_id: i64,
|
|
project_path: &str,
|
|
) -> Result<Option<ParentMetadata>> {
|
|
match noteable_type {
|
|
"Issue" => {
|
|
let parent = conn.query_row(
|
|
"SELECT i.iid, i.title, i.web_url FROM issues i WHERE i.id = ?1",
|
|
rusqlite::params![parent_id],
|
|
|row| {
|
|
Ok((
|
|
row.get::<_, i64>(0)?,
|
|
row.get::<_, Option<String>>(1)?,
|
|
row.get::<_, Option<String>>(2)?,
|
|
))
|
|
},
|
|
);
|
|
let (iid, title, web_url) = match parent {
|
|
Ok(r) => r,
|
|
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let mut label_stmt = conn.prepare_cached(
|
|
"SELECT l.name FROM issue_labels il
|
|
JOIN labels l ON l.id = il.label_id
|
|
WHERE il.issue_id = ?1
|
|
ORDER BY l.name",
|
|
)?;
|
|
let labels: Vec<String> = label_stmt
|
|
.query_map(rusqlite::params![parent_id], |row| row.get(0))?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
Ok(Some(ParentMetadata {
|
|
iid,
|
|
title,
|
|
web_url,
|
|
labels,
|
|
project_path: project_path.to_string(),
|
|
}))
|
|
}
|
|
"MergeRequest" => {
|
|
let parent = conn.query_row(
|
|
"SELECT m.iid, m.title, m.web_url FROM merge_requests m WHERE m.id = ?1",
|
|
rusqlite::params![parent_id],
|
|
|row| {
|
|
Ok((
|
|
row.get::<_, i64>(0)?,
|
|
row.get::<_, Option<String>>(1)?,
|
|
row.get::<_, Option<String>>(2)?,
|
|
))
|
|
},
|
|
);
|
|
let (iid, title, web_url) = match parent {
|
|
Ok(r) => r,
|
|
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let mut label_stmt = conn.prepare_cached(
|
|
"SELECT l.name FROM mr_labels ml
|
|
JOIN labels l ON l.id = ml.label_id
|
|
WHERE ml.merge_request_id = ?1
|
|
ORDER BY l.name",
|
|
)?;
|
|
let labels: Vec<String> = label_stmt
|
|
.query_map(rusqlite::params![parent_id], |row| row.get(0))?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
Ok(Some(ParentMetadata {
|
|
iid,
|
|
title,
|
|
web_url,
|
|
labels,
|
|
project_path: project_path.to_string(),
|
|
}))
|
|
}
|
|
_ => Ok(None),
|
|
}
|
|
}
|
|
|
|
pub fn extract_note_document_cached(
|
|
conn: &Connection,
|
|
note_id: i64,
|
|
cache: &mut ParentMetadataCache,
|
|
) -> Result<Option<DocumentData>> {
|
|
let row = conn.query_row(
|
|
"SELECT n.id, n.gitlab_id, n.author_username, n.body, n.note_type, n.is_system,
|
|
n.created_at, n.updated_at, n.position_new_path, n.position_new_line,
|
|
n.position_old_path, n.position_old_line, n.resolvable, n.resolved, n.resolved_by,
|
|
d.noteable_type, d.issue_id, d.merge_request_id,
|
|
p.path_with_namespace, p.id AS project_id
|
|
FROM notes n
|
|
JOIN discussions d ON n.discussion_id = d.id
|
|
JOIN projects p ON n.project_id = p.id
|
|
WHERE n.id = ?1",
|
|
rusqlite::params![note_id],
|
|
|row| {
|
|
Ok((
|
|
row.get::<_, i64>(0)?,
|
|
row.get::<_, i64>(1)?,
|
|
row.get::<_, Option<String>>(2)?,
|
|
row.get::<_, Option<String>>(3)?,
|
|
row.get::<_, Option<String>>(4)?,
|
|
row.get::<_, bool>(5)?,
|
|
row.get::<_, i64>(6)?,
|
|
row.get::<_, i64>(7)?,
|
|
row.get::<_, Option<String>>(8)?,
|
|
row.get::<_, Option<i64>>(9)?,
|
|
row.get::<_, Option<String>>(10)?,
|
|
row.get::<_, Option<i64>>(11)?,
|
|
row.get::<_, bool>(12)?,
|
|
row.get::<_, bool>(13)?,
|
|
row.get::<_, Option<String>>(14)?,
|
|
row.get::<_, String>(15)?,
|
|
row.get::<_, Option<i64>>(16)?,
|
|
row.get::<_, Option<i64>>(17)?,
|
|
row.get::<_, String>(18)?,
|
|
row.get::<_, i64>(19)?,
|
|
))
|
|
},
|
|
);
|
|
|
|
let (
|
|
_id,
|
|
gitlab_id,
|
|
author_username,
|
|
body,
|
|
note_type,
|
|
is_system,
|
|
created_at,
|
|
updated_at,
|
|
position_new_path,
|
|
position_new_line,
|
|
position_old_path,
|
|
_position_old_line,
|
|
resolvable,
|
|
resolved,
|
|
_resolved_by,
|
|
noteable_type,
|
|
issue_id,
|
|
merge_request_id,
|
|
path_with_namespace,
|
|
project_id,
|
|
) = match row {
|
|
Ok(r) => r,
|
|
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
|
|
if is_system {
|
|
return Ok(None);
|
|
}
|
|
|
|
let parent_id = match noteable_type.as_str() {
|
|
"Issue" => match issue_id {
|
|
Some(pid) => pid,
|
|
None => return Ok(None),
|
|
},
|
|
"MergeRequest" => match merge_request_id {
|
|
Some(pid) => pid,
|
|
None => return Ok(None),
|
|
},
|
|
_ => return Ok(None),
|
|
};
|
|
|
|
let parent = cache.get_or_fetch(conn, ¬eable_type, parent_id, &path_with_namespace)?;
|
|
let parent = match parent {
|
|
Some(p) => p,
|
|
None => return Ok(None),
|
|
};
|
|
|
|
let parent_iid = parent.iid;
|
|
let parent_title = parent.title.as_deref();
|
|
let parent_web_url = parent.web_url.as_deref();
|
|
let labels = parent.labels.clone();
|
|
let parent_type_label = noteable_type.as_str();
|
|
|
|
build_note_document(
|
|
note_id,
|
|
gitlab_id,
|
|
author_username,
|
|
body,
|
|
note_type,
|
|
created_at,
|
|
updated_at,
|
|
position_new_path,
|
|
position_new_line,
|
|
position_old_path,
|
|
resolvable,
|
|
resolved,
|
|
parent_iid,
|
|
parent_title,
|
|
parent_web_url,
|
|
&labels,
|
|
parent_type_label,
|
|
&path_with_namespace,
|
|
project_id,
|
|
)
|
|
}
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
fn build_note_document(
|
|
note_id: i64,
|
|
gitlab_id: i64,
|
|
author_username: Option<String>,
|
|
body: Option<String>,
|
|
note_type: Option<String>,
|
|
created_at: i64,
|
|
updated_at: i64,
|
|
position_new_path: Option<String>,
|
|
position_new_line: Option<i64>,
|
|
position_old_path: Option<String>,
|
|
resolvable: bool,
|
|
resolved: bool,
|
|
parent_iid: i64,
|
|
parent_title: Option<&str>,
|
|
parent_web_url: Option<&str>,
|
|
labels: &[String],
|
|
parent_type_label: &str,
|
|
path_with_namespace: &str,
|
|
project_id: i64,
|
|
) -> Result<Option<DocumentData>> {
|
|
let mut path_set = BTreeSet::new();
|
|
if let Some(ref p) = position_old_path
|
|
&& !p.is_empty()
|
|
{
|
|
path_set.insert(p.clone());
|
|
}
|
|
if let Some(ref p) = position_new_path
|
|
&& !p.is_empty()
|
|
{
|
|
path_set.insert(p.clone());
|
|
}
|
|
let paths: Vec<String> = path_set.into_iter().collect();
|
|
|
|
let url = parent_web_url.map(|wu| format!("{}#note_{}", wu, gitlab_id));
|
|
|
|
let display_title = parent_title.unwrap_or("(untitled)");
|
|
let embed_title = normalize_title_for_embedding(display_title);
|
|
let display_note_type = note_type.as_deref().unwrap_or("Note");
|
|
let display_author = author_username.as_deref().unwrap_or("unknown");
|
|
let parent_prefix = if parent_type_label == "Issue" {
|
|
format!("Issue #{}", parent_iid)
|
|
} else {
|
|
format!("MR !{}", parent_iid)
|
|
};
|
|
|
|
// Display title uses original (for human-readable output)
|
|
let title = format!(
|
|
"Note by @{} on {}: {}",
|
|
display_author, parent_prefix, display_title
|
|
);
|
|
|
|
let labels_csv = labels.join(", ");
|
|
|
|
let mut content = String::new();
|
|
let _ = writeln!(content, "[[Note]]");
|
|
let _ = writeln!(content, "source_type: note");
|
|
let _ = writeln!(content, "note_gitlab_id: {}", gitlab_id);
|
|
let _ = writeln!(content, "project: {}", path_with_namespace);
|
|
let _ = writeln!(content, "parent_type: {}", parent_type_label);
|
|
let _ = writeln!(content, "parent_iid: {}", parent_iid);
|
|
let _ = writeln!(content, "parent_title: {}", embed_title);
|
|
let _ = writeln!(content, "note_type: {}", display_note_type);
|
|
let _ = writeln!(content, "author: @{}", display_author);
|
|
let _ = writeln!(content, "created_at: {}", ms_to_iso(created_at));
|
|
if resolvable {
|
|
let _ = writeln!(content, "resolved: {}", resolved);
|
|
}
|
|
if display_note_type == "DiffNote"
|
|
&& let Some(ref p) = position_new_path
|
|
{
|
|
if let Some(line) = position_new_line {
|
|
let _ = writeln!(content, "path: {}:{}", p, line);
|
|
} else {
|
|
let _ = writeln!(content, "path: {}", p);
|
|
}
|
|
}
|
|
if !labels.is_empty() {
|
|
let _ = writeln!(content, "labels: {}", labels_csv);
|
|
}
|
|
if let Some(ref u) = url {
|
|
let _ = writeln!(content, "url: {}", u);
|
|
}
|
|
|
|
content.push_str("\n--- Body ---\n\n");
|
|
content.push_str(body.as_deref().unwrap_or(""));
|
|
|
|
let labels_hash = compute_list_hash(labels);
|
|
let paths_hash = compute_list_hash(&paths);
|
|
|
|
let hard_cap = truncate_hard_cap(&content);
|
|
let content_hash = compute_content_hash(&hard_cap.content);
|
|
|
|
Ok(Some(DocumentData {
|
|
source_type: SourceType::Note,
|
|
source_id: note_id,
|
|
project_id,
|
|
author_username,
|
|
labels: labels.to_vec(),
|
|
paths,
|
|
labels_hash,
|
|
paths_hash,
|
|
created_at,
|
|
updated_at,
|
|
url,
|
|
title: Some(title),
|
|
content_text: hard_cap.content,
|
|
content_hash,
|
|
is_truncated: hard_cap.is_truncated,
|
|
truncated_reason: hard_cap.reason.map(|r| r.as_str().to_string()),
|
|
}))
|
|
}
|