feat(xref): extract cross-references from descriptions, user notes, and fix system note regex

- Fix MENTIONED_RE/CLOSED_BY_RE to match real GitLab format
  ('mentioned in issue #N' / 'mentioned in merge request !N')
- Add GITLAB_URL_RE + parse_url_refs() for full URL extraction
- Add extract_refs_from_descriptions() -> source_method='description_parse'
- Add extract_refs_from_user_notes() -> source_method='note_parse'
- Wire both into orchestrator after system note extraction
- 36 tests: regex fix, URL parsing, integration, idempotency
This commit is contained in:
teernisse
2026-02-13 15:15:43 -05:00
parent c10471ddb9
commit 0aecbf33c0
3 changed files with 721 additions and 20 deletions

View File

@@ -22,20 +22,34 @@ pub struct ExtractResult {
pub parse_failures: usize,
}
// GitLab system notes include the entity type word: "mentioned in issue #5"
// or "mentioned in merge request !730". The word is mandatory in real data,
// but we also keep the old bare-sigil form as a fallback (no data uses it today,
// but other GitLab instances might differ).
static MENTIONED_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"mentioned in (?:(?P<project>[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P<sigil>[#!])(?P<iid>\d+)",
r"mentioned in (?:issue |merge request )?(?:(?P<project>[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P<sigil>[#!])(?P<iid>\d+)",
)
.expect("mentioned regex is valid")
});
static CLOSED_BY_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"closed by (?:(?P<project>[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P<sigil>[#!])(?P<iid>\d+)",
r"closed by (?:issue |merge request )?(?:(?P<project>[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P<sigil>[#!])(?P<iid>\d+)",
)
.expect("closed_by regex is valid")
});
/// Matches full GitLab URLs like:
/// `https://gitlab.example.com/group/project/-/issues/123`
/// `https://gitlab.example.com/group/sub/project/-/merge_requests/456`
static GITLAB_URL_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"https?://[^\s/]+/(?P<project>[^\s]+?)/-/(?P<entity_type>issues|merge_requests)/(?P<iid>\d+)",
)
.expect("gitlab url regex is valid")
});
pub fn parse_cross_refs(body: &str) -> Vec<ParsedCrossRef> {
let mut refs = Vec::new();
@@ -54,6 +68,47 @@ pub fn parse_cross_refs(body: &str) -> Vec<ParsedCrossRef> {
refs
}
/// Extract cross-references from GitLab URLs in free-text bodies (descriptions, user notes).
pub fn parse_url_refs(body: &str) -> Vec<ParsedCrossRef> {
let mut refs = Vec::new();
let mut seen = std::collections::HashSet::new();
for caps in GITLAB_URL_RE.captures_iter(body) {
let Some(entity_type_raw) = caps.name("entity_type").map(|m| m.as_str()) else {
continue;
};
let Some(iid_str) = caps.name("iid").map(|m| m.as_str()) else {
continue;
};
let Some(project) = caps.name("project").map(|m| m.as_str()) else {
continue;
};
let Ok(iid) = iid_str.parse::<i64>() else {
continue;
};
let target_entity_type = match entity_type_raw {
"issues" => "issue",
"merge_requests" => "merge_request",
_ => continue,
};
let key = (target_entity_type, project.to_owned(), iid);
if !seen.insert(key) {
continue; // deduplicate within same body
}
refs.push(ParsedCrossRef {
reference_type: "mentioned".to_owned(),
target_entity_type: target_entity_type.to_owned(),
target_iid: iid,
target_project_path: Some(project.to_owned()),
});
}
refs
}
fn capture_to_cross_ref(
caps: &regex::Captures<'_>,
reference_type: &str,
@@ -233,6 +288,189 @@ fn resolve_cross_project_entity(
resolve_entity_id(conn, project_id, entity_type, iid)
}
/// Extract cross-references from issue and MR descriptions (GitLab URLs only).
pub fn extract_refs_from_descriptions(conn: &Connection, project_id: i64) -> Result<ExtractResult> {
let mut result = ExtractResult::default();
let mut insert_stmt = conn.prepare_cached(
"INSERT OR IGNORE INTO entity_references
(project_id, source_entity_type, source_entity_id,
target_entity_type, target_entity_id,
target_project_path, target_entity_iid,
reference_type, source_method, created_at)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'description_parse', ?9)",
)?;
let now = now_ms();
// Issues with descriptions
let mut issue_stmt = conn.prepare_cached(
"SELECT id, iid, description FROM issues
WHERE project_id = ?1 AND description IS NOT NULL AND description != ''",
)?;
let issues: Vec<(i64, i64, String)> = issue_stmt
.query_map([project_id], |row| {
Ok((row.get(0)?, row.get(1)?, row.get(2)?))
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
for (entity_id, _iid, description) in &issues {
insert_url_refs(
conn,
&mut insert_stmt,
&mut result,
project_id,
"issue",
*entity_id,
description,
now,
)?;
}
// Merge requests with descriptions
let mut mr_stmt = conn.prepare_cached(
"SELECT id, iid, description FROM merge_requests
WHERE project_id = ?1 AND description IS NOT NULL AND description != ''",
)?;
let mrs: Vec<(i64, i64, String)> = mr_stmt
.query_map([project_id], |row| {
Ok((row.get(0)?, row.get(1)?, row.get(2)?))
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
for (entity_id, _iid, description) in &mrs {
insert_url_refs(
conn,
&mut insert_stmt,
&mut result,
project_id,
"merge_request",
*entity_id,
description,
now,
)?;
}
if result.inserted > 0 || result.skipped_unresolvable > 0 {
debug!(
inserted = result.inserted,
unresolvable = result.skipped_unresolvable,
"Description cross-reference extraction complete"
);
}
Ok(result)
}
/// Extract cross-references from user (non-system) notes (GitLab URLs only).
pub fn extract_refs_from_user_notes(conn: &Connection, project_id: i64) -> Result<ExtractResult> {
let mut result = ExtractResult::default();
let mut note_stmt = conn.prepare_cached(
"SELECT n.id, n.body, d.noteable_type,
COALESCE(d.issue_id, d.merge_request_id) AS entity_id
FROM notes n
JOIN discussions d ON n.discussion_id = d.id
WHERE n.is_system = 0
AND n.project_id = ?1
AND n.body IS NOT NULL",
)?;
let notes: Vec<(i64, String, String, i64)> = note_stmt
.query_map([project_id], |row| {
Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
})?
.collect::<std::result::Result<Vec<_>, _>>()?;
if notes.is_empty() {
return Ok(result);
}
let mut insert_stmt = conn.prepare_cached(
"INSERT OR IGNORE INTO entity_references
(project_id, source_entity_type, source_entity_id,
target_entity_type, target_entity_id,
target_project_path, target_entity_iid,
reference_type, source_method, created_at)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'note_parse', ?9)",
)?;
let now = now_ms();
for (_, body, noteable_type, entity_id) in &notes {
let source_entity_type = noteable_type_to_entity_type(noteable_type);
insert_url_refs(
conn,
&mut insert_stmt,
&mut result,
project_id,
source_entity_type,
*entity_id,
body,
now,
)?;
}
if result.inserted > 0 || result.skipped_unresolvable > 0 {
debug!(
inserted = result.inserted,
unresolvable = result.skipped_unresolvable,
"User note cross-reference extraction complete"
);
}
Ok(result)
}
/// Shared helper: parse URL refs from a body and insert into entity_references.
#[allow(clippy::too_many_arguments)]
fn insert_url_refs(
conn: &Connection,
insert_stmt: &mut rusqlite::CachedStatement<'_>,
result: &mut ExtractResult,
project_id: i64,
source_entity_type: &str,
source_entity_id: i64,
body: &str,
now: i64,
) -> Result<()> {
let url_refs = parse_url_refs(body);
for xref in &url_refs {
let target_entity_id = if let Some(ref path) = xref.target_project_path {
resolve_cross_project_entity(conn, path, &xref.target_entity_type, xref.target_iid)
} else {
resolve_entity_id(conn, project_id, &xref.target_entity_type, xref.target_iid)
};
let rows_changed = insert_stmt.execute(rusqlite::params![
project_id,
source_entity_type,
source_entity_id,
xref.target_entity_type,
target_entity_id,
xref.target_project_path,
if target_entity_id.is_none() {
Some(xref.target_iid)
} else {
None
},
xref.reference_type,
now,
])?;
if rows_changed > 0 {
if target_entity_id.is_none() {
result.skipped_unresolvable += 1;
} else {
result.inserted += 1;
}
}
}
Ok(())
}
#[cfg(test)]
#[path = "note_parser_tests.rs"]
mod tests;