- Fix MENTIONED_RE/CLOSED_BY_RE to match the real GitLab system-note format
  ('mentioned in issue #N' / 'mentioned in merge request !N')
- Add GITLAB_URL_RE + parse_url_refs() for full URL extraction
- Add extract_refs_from_descriptions() -> source_method='description_parse'
- Add extract_refs_from_user_notes() -> source_method='note_parse'
- Wire both into the orchestrator after system note extraction
- 36 tests: regex fix, URL parsing, integration, idempotency
477 lines
14 KiB
Rust
477 lines
14 KiB
Rust
use std::sync::LazyLock;
|
|
|
|
use regex::Regex;
|
|
use rusqlite::Connection;
|
|
use tracing::debug;
|
|
|
|
use super::error::Result;
|
|
use super::time::now_ms;
|
|
|
|
/// A single cross-reference parsed out of a note or description body.
///
/// Produced by `parse_cross_refs` (system-note phrases) and `parse_url_refs`
/// (full GitLab URLs).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedCrossRef {
    /// Relationship kind: `"mentioned"` or `"closes"`.
    pub reference_type: String,
    /// Kind of entity being pointed at: `"issue"` or `"merge_request"`.
    pub target_entity_type: String,
    /// Project-scoped number (iid) of the target entity.
    pub target_iid: i64,
    /// Project path (`group/project`) when the reference names one explicitly;
    /// `None` for bare `#N` / `!N` references.
    pub target_project_path: Option<String>,
}
|
|
|
|
/// Counters reported by one extraction pass over a project.
#[derive(Debug, Default)]
pub struct ExtractResult {
    /// Newly inserted references whose target resolved to a local entity id.
    pub inserted: usize,
    /// Newly inserted references whose target could not be resolved locally
    /// (the row keeps the target iid instead of an entity id).
    pub skipped_unresolvable: usize,
    /// System notes whose body matched no cross-reference pattern.
    pub parse_failures: usize,
}
|
|
|
|
// GitLab system notes include the entity type word: "mentioned in issue #5"
|
|
// or "mentioned in merge request !730". The word is mandatory in real data,
|
|
// but we also keep the old bare-sigil form as a fallback (no data uses it today,
|
|
// but other GitLab instances might differ).
|
|
static MENTIONED_RE: LazyLock<Regex> = LazyLock::new(|| {
|
|
Regex::new(
|
|
r"mentioned in (?:issue |merge request )?(?:(?P<project>[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P<sigil>[#!])(?P<iid>\d+)",
|
|
)
|
|
.expect("mentioned regex is valid")
|
|
});
|
|
|
|
static CLOSED_BY_RE: LazyLock<Regex> = LazyLock::new(|| {
|
|
Regex::new(
|
|
r"closed by (?:issue |merge request )?(?:(?P<project>[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P<sigil>[#!])(?P<iid>\d+)",
|
|
)
|
|
.expect("closed_by regex is valid")
|
|
});
|
|
|
|
/// Matches full GitLab URLs like:
|
|
/// `https://gitlab.example.com/group/project/-/issues/123`
|
|
/// `https://gitlab.example.com/group/sub/project/-/merge_requests/456`
|
|
static GITLAB_URL_RE: LazyLock<Regex> = LazyLock::new(|| {
|
|
Regex::new(
|
|
r"https?://[^\s/]+/(?P<project>[^\s]+?)/-/(?P<entity_type>issues|merge_requests)/(?P<iid>\d+)",
|
|
)
|
|
.expect("gitlab url regex is valid")
|
|
});
|
|
|
|
pub fn parse_cross_refs(body: &str) -> Vec<ParsedCrossRef> {
|
|
let mut refs = Vec::new();
|
|
|
|
for caps in MENTIONED_RE.captures_iter(body) {
|
|
if let Some(parsed) = capture_to_cross_ref(&caps, "mentioned") {
|
|
refs.push(parsed);
|
|
}
|
|
}
|
|
|
|
for caps in CLOSED_BY_RE.captures_iter(body) {
|
|
if let Some(parsed) = capture_to_cross_ref(&caps, "closes") {
|
|
refs.push(parsed);
|
|
}
|
|
}
|
|
|
|
refs
|
|
}
|
|
|
|
/// Extract cross-references from GitLab URLs in free-text bodies (descriptions, user notes).
|
|
pub fn parse_url_refs(body: &str) -> Vec<ParsedCrossRef> {
|
|
let mut refs = Vec::new();
|
|
let mut seen = std::collections::HashSet::new();
|
|
|
|
for caps in GITLAB_URL_RE.captures_iter(body) {
|
|
let Some(entity_type_raw) = caps.name("entity_type").map(|m| m.as_str()) else {
|
|
continue;
|
|
};
|
|
let Some(iid_str) = caps.name("iid").map(|m| m.as_str()) else {
|
|
continue;
|
|
};
|
|
let Some(project) = caps.name("project").map(|m| m.as_str()) else {
|
|
continue;
|
|
};
|
|
let Ok(iid) = iid_str.parse::<i64>() else {
|
|
continue;
|
|
};
|
|
|
|
let target_entity_type = match entity_type_raw {
|
|
"issues" => "issue",
|
|
"merge_requests" => "merge_request",
|
|
_ => continue,
|
|
};
|
|
|
|
let key = (target_entity_type, project.to_owned(), iid);
|
|
if !seen.insert(key) {
|
|
continue; // deduplicate within same body
|
|
}
|
|
|
|
refs.push(ParsedCrossRef {
|
|
reference_type: "mentioned".to_owned(),
|
|
target_entity_type: target_entity_type.to_owned(),
|
|
target_iid: iid,
|
|
target_project_path: Some(project.to_owned()),
|
|
});
|
|
}
|
|
|
|
refs
|
|
}
|
|
|
|
fn capture_to_cross_ref(
|
|
caps: ®ex::Captures<'_>,
|
|
reference_type: &str,
|
|
) -> Option<ParsedCrossRef> {
|
|
let sigil = caps.name("sigil")?.as_str();
|
|
let iid_str = caps.name("iid")?.as_str();
|
|
let iid: i64 = iid_str.parse().ok()?;
|
|
let project = caps.name("project").map(|m| m.as_str().to_owned());
|
|
|
|
let target_entity_type = match sigil {
|
|
"#" => "issue",
|
|
"!" => "merge_request",
|
|
_ => return None,
|
|
};
|
|
|
|
Some(ParsedCrossRef {
|
|
reference_type: reference_type.to_owned(),
|
|
target_entity_type: target_entity_type.to_owned(),
|
|
target_iid: iid,
|
|
target_project_path: project,
|
|
})
|
|
}
|
|
|
|
/// One row of the system-note query in `extract_refs_from_system_notes`.
struct SystemNote {
    /// `notes.id`; only used for debug logging.
    note_id: i64,
    /// Note text that is scanned for cross-reference phrases.
    body: String,
    /// `discussions.noteable_type` of the discussion the note belongs to.
    noteable_type: String,
    /// Id of the issue or merge request that owns the discussion.
    entity_id: i64,
}
|
|
|
|
pub fn extract_refs_from_system_notes(conn: &Connection, project_id: i64) -> Result<ExtractResult> {
|
|
let mut result = ExtractResult::default();
|
|
|
|
let mut stmt = conn.prepare_cached(
|
|
"SELECT n.id, n.body, d.noteable_type,
|
|
COALESCE(d.issue_id, d.merge_request_id) AS entity_id
|
|
FROM notes n
|
|
JOIN discussions d ON n.discussion_id = d.id
|
|
WHERE n.is_system = 1
|
|
AND n.project_id = ?1
|
|
AND n.body IS NOT NULL",
|
|
)?;
|
|
|
|
let notes: Vec<SystemNote> = stmt
|
|
.query_map([project_id], |row| {
|
|
Ok(SystemNote {
|
|
note_id: row.get(0)?,
|
|
body: row.get(1)?,
|
|
noteable_type: row.get(2)?,
|
|
entity_id: row.get(3)?,
|
|
})
|
|
})?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
if notes.is_empty() {
|
|
return Ok(result);
|
|
}
|
|
|
|
let mut insert_stmt = conn.prepare_cached(
|
|
"INSERT OR IGNORE INTO entity_references
|
|
(project_id, source_entity_type, source_entity_id,
|
|
target_entity_type, target_entity_id,
|
|
target_project_path, target_entity_iid,
|
|
reference_type, source_method, created_at)
|
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'note_parse', ?9)",
|
|
)?;
|
|
|
|
let now = now_ms();
|
|
|
|
for note in ¬es {
|
|
let cross_refs = parse_cross_refs(¬e.body);
|
|
|
|
if cross_refs.is_empty() {
|
|
debug!(
|
|
note_id = note.note_id,
|
|
body = %note.body,
|
|
"System note did not match any cross-reference pattern"
|
|
);
|
|
result.parse_failures += 1;
|
|
continue;
|
|
}
|
|
|
|
let source_entity_type = noteable_type_to_entity_type(¬e.noteable_type);
|
|
|
|
for xref in &cross_refs {
|
|
let target_entity_id = if xref.target_project_path.is_none() {
|
|
resolve_entity_id(conn, project_id, &xref.target_entity_type, xref.target_iid)
|
|
} else {
|
|
resolve_cross_project_entity(
|
|
conn,
|
|
xref.target_project_path.as_deref().unwrap_or_default(),
|
|
&xref.target_entity_type,
|
|
xref.target_iid,
|
|
)
|
|
};
|
|
|
|
let rows_changed = insert_stmt.execute(rusqlite::params![
|
|
project_id,
|
|
source_entity_type,
|
|
note.entity_id,
|
|
xref.target_entity_type,
|
|
target_entity_id,
|
|
xref.target_project_path,
|
|
if target_entity_id.is_none() {
|
|
Some(xref.target_iid)
|
|
} else {
|
|
None
|
|
},
|
|
xref.reference_type,
|
|
now,
|
|
])?;
|
|
|
|
if rows_changed > 0 {
|
|
if target_entity_id.is_none() {
|
|
result.skipped_unresolvable += 1;
|
|
} else {
|
|
result.inserted += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if result.inserted > 0 || result.skipped_unresolvable > 0 {
|
|
debug!(
|
|
inserted = result.inserted,
|
|
unresolvable = result.skipped_unresolvable,
|
|
parse_failures = result.parse_failures,
|
|
"System note cross-reference extraction complete"
|
|
);
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
fn noteable_type_to_entity_type(noteable_type: &str) -> &str {
|
|
match noteable_type {
|
|
"Issue" => "issue",
|
|
"MergeRequest" => "merge_request",
|
|
other => {
|
|
debug!(noteable_type = %other, "Unknown noteable_type, defaulting to issue");
|
|
"issue"
|
|
}
|
|
}
|
|
}
|
|
|
|
fn resolve_entity_id(
|
|
conn: &Connection,
|
|
project_id: i64,
|
|
entity_type: &str,
|
|
iid: i64,
|
|
) -> Option<i64> {
|
|
let (table, id_col) = match entity_type {
|
|
"issue" => ("issues", "id"),
|
|
"merge_request" => ("merge_requests", "id"),
|
|
_ => return None,
|
|
};
|
|
|
|
let sql = format!("SELECT {id_col} FROM {table} WHERE project_id = ?1 AND iid = ?2");
|
|
conn.query_row(&sql, rusqlite::params![project_id, iid], |row| row.get(0))
|
|
.ok()
|
|
}
|
|
|
|
fn resolve_cross_project_entity(
|
|
conn: &Connection,
|
|
project_path: &str,
|
|
entity_type: &str,
|
|
iid: i64,
|
|
) -> Option<i64> {
|
|
let project_id: i64 = conn
|
|
.query_row(
|
|
"SELECT id FROM projects WHERE path_with_namespace = ?1",
|
|
[project_path],
|
|
|row| row.get(0),
|
|
)
|
|
.ok()?;
|
|
|
|
resolve_entity_id(conn, project_id, entity_type, iid)
|
|
}
|
|
|
|
/// Extract cross-references from issue and MR descriptions (GitLab URLs only).
|
|
pub fn extract_refs_from_descriptions(conn: &Connection, project_id: i64) -> Result<ExtractResult> {
|
|
let mut result = ExtractResult::default();
|
|
|
|
let mut insert_stmt = conn.prepare_cached(
|
|
"INSERT OR IGNORE INTO entity_references
|
|
(project_id, source_entity_type, source_entity_id,
|
|
target_entity_type, target_entity_id,
|
|
target_project_path, target_entity_iid,
|
|
reference_type, source_method, created_at)
|
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'description_parse', ?9)",
|
|
)?;
|
|
|
|
let now = now_ms();
|
|
|
|
// Issues with descriptions
|
|
let mut issue_stmt = conn.prepare_cached(
|
|
"SELECT id, iid, description FROM issues
|
|
WHERE project_id = ?1 AND description IS NOT NULL AND description != ''",
|
|
)?;
|
|
let issues: Vec<(i64, i64, String)> = issue_stmt
|
|
.query_map([project_id], |row| {
|
|
Ok((row.get(0)?, row.get(1)?, row.get(2)?))
|
|
})?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
for (entity_id, _iid, description) in &issues {
|
|
insert_url_refs(
|
|
conn,
|
|
&mut insert_stmt,
|
|
&mut result,
|
|
project_id,
|
|
"issue",
|
|
*entity_id,
|
|
description,
|
|
now,
|
|
)?;
|
|
}
|
|
|
|
// Merge requests with descriptions
|
|
let mut mr_stmt = conn.prepare_cached(
|
|
"SELECT id, iid, description FROM merge_requests
|
|
WHERE project_id = ?1 AND description IS NOT NULL AND description != ''",
|
|
)?;
|
|
let mrs: Vec<(i64, i64, String)> = mr_stmt
|
|
.query_map([project_id], |row| {
|
|
Ok((row.get(0)?, row.get(1)?, row.get(2)?))
|
|
})?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
for (entity_id, _iid, description) in &mrs {
|
|
insert_url_refs(
|
|
conn,
|
|
&mut insert_stmt,
|
|
&mut result,
|
|
project_id,
|
|
"merge_request",
|
|
*entity_id,
|
|
description,
|
|
now,
|
|
)?;
|
|
}
|
|
|
|
if result.inserted > 0 || result.skipped_unresolvable > 0 {
|
|
debug!(
|
|
inserted = result.inserted,
|
|
unresolvable = result.skipped_unresolvable,
|
|
"Description cross-reference extraction complete"
|
|
);
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Extract cross-references from user (non-system) notes (GitLab URLs only).
|
|
pub fn extract_refs_from_user_notes(conn: &Connection, project_id: i64) -> Result<ExtractResult> {
|
|
let mut result = ExtractResult::default();
|
|
|
|
let mut note_stmt = conn.prepare_cached(
|
|
"SELECT n.id, n.body, d.noteable_type,
|
|
COALESCE(d.issue_id, d.merge_request_id) AS entity_id
|
|
FROM notes n
|
|
JOIN discussions d ON n.discussion_id = d.id
|
|
WHERE n.is_system = 0
|
|
AND n.project_id = ?1
|
|
AND n.body IS NOT NULL",
|
|
)?;
|
|
|
|
let notes: Vec<(i64, String, String, i64)> = note_stmt
|
|
.query_map([project_id], |row| {
|
|
Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
|
|
})?
|
|
.collect::<std::result::Result<Vec<_>, _>>()?;
|
|
|
|
if notes.is_empty() {
|
|
return Ok(result);
|
|
}
|
|
|
|
let mut insert_stmt = conn.prepare_cached(
|
|
"INSERT OR IGNORE INTO entity_references
|
|
(project_id, source_entity_type, source_entity_id,
|
|
target_entity_type, target_entity_id,
|
|
target_project_path, target_entity_iid,
|
|
reference_type, source_method, created_at)
|
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'note_parse', ?9)",
|
|
)?;
|
|
|
|
let now = now_ms();
|
|
|
|
for (_, body, noteable_type, entity_id) in ¬es {
|
|
let source_entity_type = noteable_type_to_entity_type(noteable_type);
|
|
insert_url_refs(
|
|
conn,
|
|
&mut insert_stmt,
|
|
&mut result,
|
|
project_id,
|
|
source_entity_type,
|
|
*entity_id,
|
|
body,
|
|
now,
|
|
)?;
|
|
}
|
|
|
|
if result.inserted > 0 || result.skipped_unresolvable > 0 {
|
|
debug!(
|
|
inserted = result.inserted,
|
|
unresolvable = result.skipped_unresolvable,
|
|
"User note cross-reference extraction complete"
|
|
);
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Shared helper: parse URL refs from a body and insert into entity_references.
|
|
#[allow(clippy::too_many_arguments)]
|
|
fn insert_url_refs(
|
|
conn: &Connection,
|
|
insert_stmt: &mut rusqlite::CachedStatement<'_>,
|
|
result: &mut ExtractResult,
|
|
project_id: i64,
|
|
source_entity_type: &str,
|
|
source_entity_id: i64,
|
|
body: &str,
|
|
now: i64,
|
|
) -> Result<()> {
|
|
let url_refs = parse_url_refs(body);
|
|
|
|
for xref in &url_refs {
|
|
let target_entity_id = if let Some(ref path) = xref.target_project_path {
|
|
resolve_cross_project_entity(conn, path, &xref.target_entity_type, xref.target_iid)
|
|
} else {
|
|
resolve_entity_id(conn, project_id, &xref.target_entity_type, xref.target_iid)
|
|
};
|
|
|
|
let rows_changed = insert_stmt.execute(rusqlite::params![
|
|
project_id,
|
|
source_entity_type,
|
|
source_entity_id,
|
|
xref.target_entity_type,
|
|
target_entity_id,
|
|
xref.target_project_path,
|
|
if target_entity_id.is_none() {
|
|
Some(xref.target_iid)
|
|
} else {
|
|
None
|
|
},
|
|
xref.reference_type,
|
|
now,
|
|
])?;
|
|
|
|
if rows_changed > 0 {
|
|
if target_entity_id.is_none() {
|
|
result.skipped_unresolvable += 1;
|
|
} else {
|
|
result.inserted += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Unit tests live in a sibling file so this module stays focused on parsing
// and extraction.
#[cfg(test)]
#[path = "note_parser_tests.rs"]
mod tests;
|