use std::sync::LazyLock; use regex::Regex; use rusqlite::Connection; use tracing::debug; use super::error::Result; use super::time::now_ms; #[derive(Debug, Clone, PartialEq, Eq)] pub struct ParsedCrossRef { pub reference_type: String, pub target_entity_type: String, pub target_iid: i64, pub target_project_path: Option, } #[derive(Debug, Default)] pub struct ExtractResult { pub inserted: usize, pub skipped_unresolvable: usize, pub parse_failures: usize, } // GitLab system notes include the entity type word: "mentioned in issue #5" // or "mentioned in merge request !730". The word is mandatory in real data, // but we also keep the old bare-sigil form as a fallback (no data uses it today, // but other GitLab instances might differ). static MENTIONED_RE: LazyLock = LazyLock::new(|| { Regex::new( r"mentioned in (?:issue |merge request )?(?:(?P[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P[#!])(?P\d+)", ) .expect("mentioned regex is valid") }); static CLOSED_BY_RE: LazyLock = LazyLock::new(|| { Regex::new( r"closed by (?:issue |merge request )?(?:(?P[\w][\w.\-]*(?:/[\w][\w.\-]*)+))?(?P[#!])(?P\d+)", ) .expect("closed_by regex is valid") }); /// Matches full GitLab URLs like: /// `https://gitlab.example.com/group/project/-/issues/123` /// `https://gitlab.example.com/group/sub/project/-/merge_requests/456` static GITLAB_URL_RE: LazyLock = LazyLock::new(|| { Regex::new( r"https?://[^\s/]+/(?P[^\s]+?)/-/(?Pissues|merge_requests)/(?P\d+)", ) .expect("gitlab url regex is valid") }); pub fn parse_cross_refs(body: &str) -> Vec { let mut refs = Vec::new(); for caps in MENTIONED_RE.captures_iter(body) { if let Some(parsed) = capture_to_cross_ref(&caps, "mentioned") { refs.push(parsed); } } for caps in CLOSED_BY_RE.captures_iter(body) { if let Some(parsed) = capture_to_cross_ref(&caps, "closes") { refs.push(parsed); } } refs } /// Extract cross-references from GitLab URLs in free-text bodies (descriptions, user notes). 
pub fn parse_url_refs(body: &str) -> Vec { let mut refs = Vec::new(); let mut seen = std::collections::HashSet::new(); for caps in GITLAB_URL_RE.captures_iter(body) { let Some(entity_type_raw) = caps.name("entity_type").map(|m| m.as_str()) else { continue; }; let Some(iid_str) = caps.name("iid").map(|m| m.as_str()) else { continue; }; let Some(project) = caps.name("project").map(|m| m.as_str()) else { continue; }; let Ok(iid) = iid_str.parse::() else { continue; }; let target_entity_type = match entity_type_raw { "issues" => "issue", "merge_requests" => "merge_request", _ => continue, }; let key = (target_entity_type, project.to_owned(), iid); if !seen.insert(key) { continue; // deduplicate within same body } refs.push(ParsedCrossRef { reference_type: "mentioned".to_owned(), target_entity_type: target_entity_type.to_owned(), target_iid: iid, target_project_path: Some(project.to_owned()), }); } refs } fn capture_to_cross_ref( caps: ®ex::Captures<'_>, reference_type: &str, ) -> Option { let sigil = caps.name("sigil")?.as_str(); let iid_str = caps.name("iid")?.as_str(); let iid: i64 = iid_str.parse().ok()?; let project = caps.name("project").map(|m| m.as_str().to_owned()); let target_entity_type = match sigil { "#" => "issue", "!" 
=> "merge_request", _ => return None, }; Some(ParsedCrossRef { reference_type: reference_type.to_owned(), target_entity_type: target_entity_type.to_owned(), target_iid: iid, target_project_path: project, }) } struct SystemNote { note_id: i64, body: String, noteable_type: String, entity_id: i64, } pub fn extract_refs_from_system_notes(conn: &Connection, project_id: i64) -> Result { let mut result = ExtractResult::default(); let mut stmt = conn.prepare_cached( "SELECT n.id, n.body, d.noteable_type, COALESCE(d.issue_id, d.merge_request_id) AS entity_id FROM notes n JOIN discussions d ON n.discussion_id = d.id WHERE n.is_system = 1 AND n.project_id = ?1 AND n.body IS NOT NULL", )?; let notes: Vec = stmt .query_map([project_id], |row| { Ok(SystemNote { note_id: row.get(0)?, body: row.get(1)?, noteable_type: row.get(2)?, entity_id: row.get(3)?, }) })? .collect::, _>>()?; if notes.is_empty() { return Ok(result); } let mut insert_stmt = conn.prepare_cached( "INSERT OR IGNORE INTO entity_references (project_id, source_entity_type, source_entity_id, target_entity_type, target_entity_id, target_project_path, target_entity_iid, reference_type, source_method, created_at) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'note_parse', ?9)", )?; let now = now_ms(); for note in ¬es { let cross_refs = parse_cross_refs(¬e.body); if cross_refs.is_empty() { debug!( note_id = note.note_id, body = %note.body, "System note did not match any cross-reference pattern" ); result.parse_failures += 1; continue; } let source_entity_type = noteable_type_to_entity_type(¬e.noteable_type); for xref in &cross_refs { let target_entity_id = if xref.target_project_path.is_none() { resolve_entity_id(conn, project_id, &xref.target_entity_type, xref.target_iid) } else { resolve_cross_project_entity( conn, xref.target_project_path.as_deref().unwrap_or_default(), &xref.target_entity_type, xref.target_iid, ) }; let rows_changed = insert_stmt.execute(rusqlite::params![ project_id, source_entity_type, note.entity_id, 
xref.target_entity_type, target_entity_id, xref.target_project_path, if target_entity_id.is_none() { Some(xref.target_iid) } else { None }, xref.reference_type, now, ])?; if rows_changed > 0 { if target_entity_id.is_none() { result.skipped_unresolvable += 1; } else { result.inserted += 1; } } } } if result.inserted > 0 || result.skipped_unresolvable > 0 { debug!( inserted = result.inserted, unresolvable = result.skipped_unresolvable, parse_failures = result.parse_failures, "System note cross-reference extraction complete" ); } Ok(result) } fn noteable_type_to_entity_type(noteable_type: &str) -> &str { match noteable_type { "Issue" => "issue", "MergeRequest" => "merge_request", other => { debug!(noteable_type = %other, "Unknown noteable_type, defaulting to issue"); "issue" } } } fn resolve_entity_id( conn: &Connection, project_id: i64, entity_type: &str, iid: i64, ) -> Option { let (table, id_col) = match entity_type { "issue" => ("issues", "id"), "merge_request" => ("merge_requests", "id"), _ => return None, }; let sql = format!("SELECT {id_col} FROM {table} WHERE project_id = ?1 AND iid = ?2"); conn.query_row(&sql, rusqlite::params![project_id, iid], |row| row.get(0)) .ok() } fn resolve_cross_project_entity( conn: &Connection, project_path: &str, entity_type: &str, iid: i64, ) -> Option { let project_id: i64 = conn .query_row( "SELECT id FROM projects WHERE path_with_namespace = ?1", [project_path], |row| row.get(0), ) .ok()?; resolve_entity_id(conn, project_id, entity_type, iid) } /// Extract cross-references from issue and MR descriptions (GitLab URLs only). 
pub fn extract_refs_from_descriptions(conn: &Connection, project_id: i64) -> Result { let mut result = ExtractResult::default(); let mut insert_stmt = conn.prepare_cached( "INSERT OR IGNORE INTO entity_references (project_id, source_entity_type, source_entity_id, target_entity_type, target_entity_id, target_project_path, target_entity_iid, reference_type, source_method, created_at) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'description_parse', ?9)", )?; let now = now_ms(); // Issues with descriptions let mut issue_stmt = conn.prepare_cached( "SELECT id, iid, description FROM issues WHERE project_id = ?1 AND description IS NOT NULL AND description != ''", )?; let issues: Vec<(i64, i64, String)> = issue_stmt .query_map([project_id], |row| { Ok((row.get(0)?, row.get(1)?, row.get(2)?)) })? .collect::, _>>()?; for (entity_id, _iid, description) in &issues { insert_url_refs( conn, &mut insert_stmt, &mut result, project_id, "issue", *entity_id, description, now, )?; } // Merge requests with descriptions let mut mr_stmt = conn.prepare_cached( "SELECT id, iid, description FROM merge_requests WHERE project_id = ?1 AND description IS NOT NULL AND description != ''", )?; let mrs: Vec<(i64, i64, String)> = mr_stmt .query_map([project_id], |row| { Ok((row.get(0)?, row.get(1)?, row.get(2)?)) })? .collect::, _>>()?; for (entity_id, _iid, description) in &mrs { insert_url_refs( conn, &mut insert_stmt, &mut result, project_id, "merge_request", *entity_id, description, now, )?; } if result.inserted > 0 || result.skipped_unresolvable > 0 { debug!( inserted = result.inserted, unresolvable = result.skipped_unresolvable, "Description cross-reference extraction complete" ); } Ok(result) } /// Extract cross-references from user (non-system) notes (GitLab URLs only). 
pub fn extract_refs_from_user_notes(conn: &Connection, project_id: i64) -> Result { let mut result = ExtractResult::default(); let mut note_stmt = conn.prepare_cached( "SELECT n.id, n.body, d.noteable_type, COALESCE(d.issue_id, d.merge_request_id) AS entity_id FROM notes n JOIN discussions d ON n.discussion_id = d.id WHERE n.is_system = 0 AND n.project_id = ?1 AND n.body IS NOT NULL", )?; let notes: Vec<(i64, String, String, i64)> = note_stmt .query_map([project_id], |row| { Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) })? .collect::, _>>()?; if notes.is_empty() { return Ok(result); } let mut insert_stmt = conn.prepare_cached( "INSERT OR IGNORE INTO entity_references (project_id, source_entity_type, source_entity_id, target_entity_type, target_entity_id, target_project_path, target_entity_iid, reference_type, source_method, created_at) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, 'note_parse', ?9)", )?; let now = now_ms(); for (_, body, noteable_type, entity_id) in ¬es { let source_entity_type = noteable_type_to_entity_type(noteable_type); insert_url_refs( conn, &mut insert_stmt, &mut result, project_id, source_entity_type, *entity_id, body, now, )?; } if result.inserted > 0 || result.skipped_unresolvable > 0 { debug!( inserted = result.inserted, unresolvable = result.skipped_unresolvable, "User note cross-reference extraction complete" ); } Ok(result) } /// Shared helper: parse URL refs from a body and insert into entity_references. 
///
/// Each reference returned by `parse_url_refs(body)` is resolved to a local entity id where
/// possible and then executed through `insert_stmt`, which must accept the nine positional
/// parameters bound below. A row that was actually inserted bumps `result.inserted` when the
/// target resolved, and `result.skipped_unresolvable` otherwise. Rows for which the
/// statement reports no change are not counted.
///
/// # Errors
///
/// Returns an error if executing `insert_stmt` fails.
// The caller's statement, counters and row context are all threaded through this one call.
#[allow(clippy::too_many_arguments)]
fn insert_url_refs(
    conn: &Connection,
    insert_stmt: &mut rusqlite::CachedStatement<'_>,
    result: &mut ExtractResult,
    project_id: i64,
    source_entity_type: &str,
    source_entity_id: i64,
    body: &str,
    now: i64,
) -> Result<()> {
    let parsed = parse_url_refs(body);

    for found in parsed.iter() {
        let resolved_id = match found.target_project_path {
            Some(ref path) => {
                resolve_cross_project_entity(conn, path, &found.target_entity_type, found.target_iid)
            }
            None => resolve_entity_id(
                conn,
                project_id,
                &found.target_entity_type,
                found.target_iid,
            ),
        };

        // The raw IID is stored only when the target did not resolve to a local row.
        let fallback_iid = match resolved_id {
            Some(_) => None,
            None => Some(found.target_iid),
        };

        let changed = insert_stmt.execute(rusqlite::params![
            project_id,
            source_entity_type,
            source_entity_id,
            found.target_entity_type,
            resolved_id,
            found.target_project_path,
            fallback_iid,
            found.reference_type,
            now,
        ])?;

        if changed == 0 {
            continue;
        }

        match resolved_id {
            Some(_) => result.inserted += 1,
            None => result.skipped_unresolvable += 1,
        }
    }

    Ok(())
}

#[cfg(test)]
#[path = "note_parser_tests.rs"]
mod tests;