From 233eb546af68172de463300ea20d8510057bf88a Mon Sep 17 00:00:00 2001 From: Taylor Eernisse Date: Thu, 5 Feb 2026 15:29:51 -0500 Subject: [PATCH] feat: Add commit SHAs, closes_issues watermark, and PRD alignment Migration 015 adds merge_commit_sha/squash_commit_sha to merge_requests (Gate 4/5 prerequisites), closes_issues_synced_for_updated_at watermark for incremental sync, and the missing idx_label_events_label index. The MR transformer and ingestion pipeline now populate commit SHAs during sync. The orchestrator uses watermark-based filtering for closes_issues jobs instead of re-enqueuing all MRs every sync. The Phase B PRD is updated to match the actual codebase: corrected migration numbering (011-015), documented nullable label/milestone fields (migration 012), watermark patterns (013), observability infrastructure (014), simplified source_method values, and updated entity_references schema to match implementation. Co-Authored-By: Claude Opus 4.6 --- docs/phase-b-temporal-intelligence.md | 172 ++++++++++++------ .../015_commit_shas_and_closes_watermark.sql | 17 ++ src/gitlab/transformers/merge_request.rs | 4 + src/gitlab/types.rs | 2 + src/ingestion/merge_requests.rs | 14 +- src/ingestion/orchestrator.rs | 29 ++- tests/mr_transformer_tests.rs | 14 ++ 7 files changed, 189 insertions(+), 63 deletions(-) create mode 100644 migrations/015_commit_shas_and_closes_watermark.sql diff --git a/docs/phase-b-temporal-intelligence.md b/docs/phase-b-temporal-intelligence.md index f1482e2..d3b0771 100644 --- a/docs/phase-b-temporal-intelligence.md +++ b/docs/phase-b-temporal-intelligence.md @@ -39,7 +39,7 @@ Five gates, each independently verifiable and shippable: - **Opt-in event ingestion.** New config flag `sync.fetchResourceEvents` (default `true`) controls whether the sync pipeline fetches event data. Users who don't need temporal features skip the additional API calls. - **Application-level graph traversal.** Cross-reference expansion uses BFS in Rust, not recursive SQL CTEs. Capped at configurable depth (default 1) for predictable performance. - **Evolutionary library extraction.** New commands are built with typed return structs from day one. Old commands are not retrofitted until a concrete consumer (MCP server, web UI) requires it. -- **Phase A fields cherry-picked as needed.** `merge_commit_sha` and `squash_commit_sha` are added in this phase's migration. Remaining Phase A fields are handled in their own migration later. +- **Phase A fields cherry-picked as needed.** `merge_commit_sha` and `squash_commit_sha` are added in migration 015 and populated during MR ingestion. Remaining Phase A fields are handled in their own migration later. ### Scope Boundaries @@ -71,9 +71,9 @@ The original approach was to parse system note body text with regex to extract s System note parsing is still used for events without structured APIs (see Gate 2), but with the explicit understanding that it's best-effort and fragile for non-English instances. -### 1.2 Schema (Migration 010) +### 1.2 Schema (Migration 011) -**File:** `migrations/010_resource_events.sql` +**File:** `migrations/011_resource_events.sql` ```sql -- State change events (opened, closed, reopened, merged, locked) @@ -89,16 +89,16 @@ CREATE TABLE resource_state_events ( actor_gitlab_id INTEGER, -- GitLab user ID (stable; usernames can change) actor_username TEXT, -- display/search convenience created_at INTEGER NOT NULL, -- ms epoch UTC - -- "closed by MR" link: structured by GitLab, not parsed from text - source_merge_request_id INTEGER, -- GitLab's MR iid that caused this state change source_commit TEXT, -- commit SHA that caused this state change - UNIQUE(gitlab_id, project_id), + source_merge_request_iid INTEGER, -- iid from source_merge_request ref CHECK ( (issue_id IS NOT NULL AND merge_request_id IS NULL) OR (issue_id IS NULL AND merge_request_id IS NOT NULL) ) ); +CREATE UNIQUE INDEX uq_state_events_gitlab ON resource_state_events(gitlab_id, project_id); + CREATE INDEX idx_state_events_issue ON resource_state_events(issue_id) WHERE issue_id IS NOT NULL; CREATE INDEX idx_state_events_mr ON resource_state_events(merge_request_id) @@ -114,24 +114,25 @@ CREATE TABLE resource_label_events ( project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE, issue_id INTEGER REFERENCES issues(id) ON DELETE CASCADE, merge_request_id INTEGER REFERENCES merge_requests(id) ON DELETE CASCADE, - label_name TEXT NOT NULL, action TEXT NOT NULL CHECK (action IN ('add', 'remove')), - actor_gitlab_id INTEGER, -- GitLab user ID (stable; usernames can change) - actor_username TEXT, -- display/search convenience + label_name TEXT, -- nullable: GitLab returns null for deleted labels (see §1.2.1) + actor_gitlab_id INTEGER, + actor_username TEXT, created_at INTEGER NOT NULL, -- ms epoch UTC - UNIQUE(gitlab_id, project_id), CHECK ( (issue_id IS NOT NULL AND merge_request_id IS NULL) OR (issue_id IS NULL AND merge_request_id IS NOT NULL) ) ); +CREATE UNIQUE INDEX uq_label_events_gitlab ON resource_label_events(gitlab_id, project_id); + CREATE INDEX idx_label_events_issue ON resource_label_events(issue_id) WHERE issue_id IS NOT NULL; CREATE INDEX idx_label_events_mr ON resource_label_events(merge_request_id) WHERE merge_request_id IS NOT NULL; CREATE INDEX idx_label_events_created ON resource_label_events(created_at); -CREATE INDEX idx_label_events_label ON resource_label_events(label_name); +-- Note: idx_label_events_label was added in migration 015 (not in the original 011) -- Milestone change events (add, remove) -- Source: GET /projects/:id/issues/:iid/resource_milestone_events @@ -142,19 +143,20 @@ CREATE TABLE resource_milestone_events ( project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE, issue_id INTEGER REFERENCES issues(id) ON DELETE CASCADE, merge_request_id INTEGER REFERENCES merge_requests(id) ON DELETE CASCADE, - milestone_title TEXT NOT NULL, - milestone_id INTEGER, action TEXT NOT NULL CHECK (action IN ('add', 'remove')), - actor_gitlab_id INTEGER, -- GitLab user ID (stable; usernames can change) - actor_username TEXT, -- display/search convenience + milestone_title TEXT, -- nullable: GitLab returns null for deleted milestones (see §1.2.1) + milestone_id INTEGER, + actor_gitlab_id INTEGER, + actor_username TEXT, created_at INTEGER NOT NULL, -- ms epoch UTC - UNIQUE(gitlab_id, project_id), CHECK ( (issue_id IS NOT NULL AND merge_request_id IS NULL) OR (issue_id IS NULL AND merge_request_id IS NOT NULL) ) ); +CREATE UNIQUE INDEX uq_milestone_events_gitlab ON resource_milestone_events(gitlab_id, project_id); + CREATE INDEX idx_milestone_events_issue ON resource_milestone_events(issue_id) WHERE issue_id IS NOT NULL; CREATE INDEX idx_milestone_events_mr ON resource_milestone_events(merge_request_id) @@ -162,6 +164,27 @@ CREATE INDEX idx_milestone_events_mr ON resource_milestone_events(merge_request_ CREATE INDEX idx_milestone_events_created ON resource_milestone_events(created_at); ``` +#### 1.2.1 Nullable Label and Milestone Fields (Migration 012) + +GitLab returns `null` for `label` and `milestone` in Resource Events when the referenced label or milestone has been deleted from the project. This was discovered in production after the initial schema deployed with `NOT NULL` constraints. + +**Migration 012** recreates `resource_label_events` and `resource_milestone_events` with nullable `label_name` and `milestone_title` columns. The table-swap approach (create new → copy → drop old → rename) is required because SQLite doesn't support `ALTER COLUMN`. + +Timeline queries that encounter null labels/milestones display `"[deleted label]"` or `"[deleted milestone]"` in human output and omit the name field in robot JSON. + +#### 1.2.2 Resource Event Watermarks (Migration 013) + +To avoid re-fetching resource events for every entity on every sync, a watermark column tracks the `updated_at` value at the time of last successful event fetch: + +```sql +ALTER TABLE issues ADD COLUMN resource_events_synced_for_updated_at INTEGER; +ALTER TABLE merge_requests ADD COLUMN resource_events_synced_for_updated_at INTEGER; +``` + +**Incremental behavior:** During sync, only entities where `updated_at > COALESCE(resource_events_synced_for_updated_at, 0)` are enqueued for resource event fetching. On `--full` sync, these watermarks are reset to `NULL`, causing all entities to be re-enqueued. + +This mirrors the existing `discussions_synced_for_updated_at` pattern and works in conjunction with the dependent fetch queue. + ### 1.3 Config Extension **File:** `src/core/config.rs` @@ -223,7 +246,7 @@ pub struct GitLabLabelEvent { pub created_at: String, pub resource_type: String, pub resource_id: i64, - pub label: GitLabLabelRef, + pub label: Option, // nullable: deleted labels return null pub action: String, // "add" | "remove" } @@ -234,7 +257,7 @@ pub struct GitLabMilestoneEvent { pub created_at: String, pub resource_type: String, pub resource_id: i64, - pub milestone: GitLabMilestoneRef, + pub milestone: Option, // nullable: deleted milestones return null pub action: String, // "add" | "remove" } ``` @@ -243,7 +266,7 @@ pub struct GitLabMilestoneEvent { **Architecture:** Generic dependent-fetch queue, generalizing the `pending_discussion_fetches` pattern. A single queue table serves all dependent resource types across Gates 1, 2, and 4, avoiding schema churn as new fetch types are added. -**New queue table (in migration 010):** +**New queue table (in migration 011):** ```sql -- Generic queue for all dependent resource fetches (events, closes_issues, diffs) @@ -302,16 +325,32 @@ Acceptable for initial sync. Incremental sync adds negligible overhead. ### 1.7 Acceptance Criteria -- [ ] Migration 010 creates all three event tables + generic dependent fetch queue -- [ ] `lore sync` fetches resource events for changed entities when `fetchResourceEvents` is true -- [ ] `lore sync --no-events` skips event fetching -- [ ] Event fetch failures are queued for retry with exponential backoff -- [ ] Stale locks (crashed sync) automatically reclaimed on next run -- [ ] `lore count events` shows event counts by type +- [x] Migration 011 creates all three event tables + generic dependent fetch queue +- [x] `lore sync` fetches resource events for changed entities when `fetchResourceEvents` is true +- [x] `lore sync --no-events` skips event fetching +- [x] Event fetch failures are queued for retry with exponential backoff +- [x] Stale locks (crashed sync) automatically reclaimed on next run +- [x] `lore count events` shows event counts by type - [ ] `lore stats --check` validates event table referential integrity - [ ] `lore stats --check` validates dependent job queue health (no stuck locks, retryable jobs visible) - [ ] Robot mode JSON for all new commands +### 1.8 Observability Infrastructure (Migration 014) + +The sync pipeline includes lightweight observability via `sync_runs` enrichment. Migration 014 adds: + +```sql +ALTER TABLE sync_runs ADD COLUMN run_id TEXT; -- correlation ID for log tracing +ALTER TABLE sync_runs ADD COLUMN total_items_processed INTEGER DEFAULT 0; +ALTER TABLE sync_runs ADD COLUMN total_errors INTEGER DEFAULT 0; + +CREATE INDEX IF NOT EXISTS idx_sync_runs_run_id ON sync_runs(run_id); +``` + +**Purpose:** The `run_id` column correlates log entries (via `tracing`) with sync run records. `total_items_processed` and `total_errors` provide aggregate counts for `lore sync-status` and robot mode health checks without requiring log parsing. + +This is separate from the event tables but supports the same operational workflow — answering "did the last sync succeed?" and "how many entities were processed?" programmatically. + --- ## Gate 2: Cross-Reference Extraction @@ -320,10 +359,10 @@ Acceptable for initial sync. Incremental sync adds negligible overhead. Temporal queries need to follow links between entities: "MR !567 closed issue #234", "issue #234 mentioned in MR !567", "#299 was opened as a follow-up to !567". These relationships are captured in two places: -1. **Structured API:** `GET /projects/:id/merge_requests/:iid/closes_issues` returns issues that close when the MR merges. Also, `resource_state_events` includes `source_merge_request_id` for "closed by MR" events. +1. **Structured API:** `GET /projects/:id/merge_requests/:iid/closes_issues` returns issues that close when the MR merges. Also, `resource_state_events` includes `source_merge_request_iid` for "closed by MR" events. 2. **System notes:** Cross-references like "mentioned in !456" and "closed by !789" appear in system note body text. -### 2.2 Schema (in Migration 010) +### 2.2 Schema (in Migration 011) ```sql -- Cross-references between entities @@ -340,33 +379,49 @@ Temporal queries need to follow links between entities: "MR !567 closed issue #2 -- silently dropping them. Timeline output marks these as "[external]". CREATE TABLE entity_references ( id INTEGER PRIMARY KEY, + project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE, source_entity_type TEXT NOT NULL CHECK (source_entity_type IN ('issue', 'merge_request')), source_entity_id INTEGER NOT NULL, -- local DB id target_entity_type TEXT NOT NULL CHECK (target_entity_type IN ('issue', 'merge_request')), target_entity_id INTEGER, -- local DB id (NULL when target is unresolved/external) target_project_path TEXT, -- e.g. "group/other-repo" (populated for cross-project refs) target_entity_iid INTEGER, -- GitLab iid (populated when target_entity_id is NULL) - reference_type TEXT NOT NULL, -- 'closes' | 'mentioned' | 'related' - source_method TEXT NOT NULL, -- 'api_closes_issues' | 'api_state_event' | 'system_note_parse' - created_at INTEGER, -- when the reference was created (if known) - UNIQUE(source_entity_type, source_entity_id, target_entity_type, - COALESCE(target_entity_id, -1), COALESCE(target_project_path, ''), - COALESCE(target_entity_iid, -1), reference_type) + reference_type TEXT NOT NULL CHECK (reference_type IN ('closes', 'mentioned', 'related')), + source_method TEXT NOT NULL CHECK (source_method IN ('api', 'note_parse', 'description_parse')), + created_at INTEGER NOT NULL -- ms epoch UTC ); -CREATE INDEX idx_refs_source ON entity_references(source_entity_type, source_entity_id); -CREATE INDEX idx_refs_target ON entity_references(target_entity_type, target_entity_id) +-- Unique constraint includes source_method: the same relationship can be discovered by +-- multiple methods (e.g., closes_issues API and a state event), and we store both for provenance. +CREATE UNIQUE INDEX uq_entity_refs ON entity_references( + project_id, source_entity_type, source_entity_id, target_entity_type, + COALESCE(target_entity_id, -1), COALESCE(target_project_path, ''), + COALESCE(target_entity_iid, -1), reference_type, source_method +); + +CREATE INDEX idx_entity_refs_source ON entity_references(source_entity_type, source_entity_id); +CREATE INDEX idx_entity_refs_target ON entity_references(target_entity_id) WHERE target_entity_id IS NOT NULL; -CREATE INDEX idx_refs_unresolved ON entity_references(target_project_path, target_entity_iid) +CREATE INDEX idx_entity_refs_unresolved ON entity_references(target_project_path, target_entity_iid) WHERE target_entity_id IS NULL; ``` +**`source_method` values:** + +| Value | Meaning | +|-------|---------| +| `'api'` | Populated from structured GitLab APIs (`closes_issues`, `resource_state_events`) | +| `'note_parse'` | Extracted from system note body text (best-effort, English only) | +| `'description_parse'` | Extracted from issue/MR description body text (future) | + +The original design used more granular values (`'api_closes_issues'`, `'api_state_event'`, `'system_note_parse'`). In practice, the API-sourced references don't need sub-method distinction — the `reference_type` already captures the semantic relationship — so the implementation simplified to three values. + ### 2.3 Population Strategy **Tier 1 — Structured APIs (reliable):** -1. **`closes_issues` endpoint:** After MR ingestion, fetch `GET /projects/:id/merge_requests/:iid/closes_issues`. Insert `reference_type = 'closes'`, `source_method = 'api_closes_issues'`. Source = MR, target = issue. -2. **State events:** When `resource_state_events` contains `source_merge_request_id`, insert `reference_type = 'closes'`, `source_method = 'api_state_event'`. Source = MR (referenced by iid), target = issue (that received the state change). +1. **`closes_issues` endpoint:** After MR ingestion, fetch `GET /projects/:id/merge_requests/:iid/closes_issues`. Insert `reference_type = 'closes'`, `source_method = 'api'`. Source = MR, target = issue. +2. **State events:** When `resource_state_events` contains `source_merge_request_iid`, insert `reference_type = 'closes'`, `source_method = 'api'`. Source = MR (referenced by iid), target = issue (that received the state change). **Tier 2 — System note parsing (best-effort):** @@ -385,14 +440,14 @@ closed by #{iid} **Cross-project references:** When a system note references `{group}/{project}#{iid}` and the target project is not synced locally, store with `target_entity_id = NULL`, `target_project_path = '{group}/{project}'`, `target_entity_iid = {iid}`. These unresolved references are still valuable for timeline narratives — they indicate external dependencies and decision context even when we can't traverse further. -Insert with `source_method = 'system_note_parse'`. Accept that: +Insert with `source_method = 'note_parse'`. Accept that: - This breaks on non-English GitLab instances - Format may vary across GitLab versions - Log parse failures at `debug` level for monitoring -**Tier 3 — Description/body parsing (deferred):** +**Tier 3 — Description/body parsing (`source_method = 'description_parse'`, deferred):** -Issue and MR descriptions often contain `#123` or `!456` references. Parsing these is lower confidence (mentions != relationships) and is deferred to a future iteration. +Issue and MR descriptions often contain `#123` or `!456` references. Parsing these is lower confidence (mentions != relationships) and is deferred to a future iteration. The `source_method` value `'description_parse'` is reserved in the CHECK constraint for this future work. ### 2.4 Ingestion Flow @@ -401,6 +456,8 @@ The `closes_issues` fetch uses the generic dependent fetch queue (`job_type = 'm - One additional API call per MR: `GET /projects/:id/merge_requests/:iid/closes_issues` - Cross-reference parsing from system notes runs as a local post-processing step (no API calls) after all dependent fetches complete +**Watermark pattern (migration 015):** A `closes_issues_synced_for_updated_at` column on `merge_requests` tracks the last `updated_at` value at which closes_issues data was fetched. Only MRs where `updated_at > COALESCE(closes_issues_synced_for_updated_at, 0)` are enqueued for re-fetching. The watermark is updated after successful fetch or after a permanent API error (e.g., 404 for external MRs). On `--full` sync, the watermark is reset to `NULL`. + ### 2.5 Acceptance Criteria - [ ] `entity_references` table populated from `closes_issues` API for all synced MRs @@ -562,7 +619,7 @@ Evidence notes (`NOTE` events) show the first ~200 characters of FTS5-matched no "via": { "from": { "type": "merge_request", "iid": 567, "project": "group/repo" }, "reference_type": "closes", - "source_method": "api_closes_issues" + "source_method": "api" } } ], @@ -639,9 +696,13 @@ Evidence notes (`NOTE` events) show the first ~200 characters of FTS5-matched no ## Gate 4: File Decision History (`lore file-history`) -### 4.1 Schema (Migration 011) +### 4.1 Schema -**File:** `migrations/011_file_changes.sql` +**Commit SHAs (Migration 015 — already applied):** + +`merge_commit_sha` and `squash_commit_sha` were added to `merge_requests` in migration 015. These are now populated during MR ingestion and available for Gate 4/5 queries. + +**File changes table (future migration — not yet created):** ```sql -- Files changed by each merge request @@ -660,11 +721,6 @@ CREATE INDEX idx_mr_files_new_path ON mr_file_changes(new_path); CREATE INDEX idx_mr_files_old_path ON mr_file_changes(old_path) WHERE old_path IS NOT NULL; CREATE INDEX idx_mr_files_mr ON mr_file_changes(merge_request_id); - --- Add commit SHAs to merge_requests (cherry-picked from Phase A) --- These link MRs to actual git history -ALTER TABLE merge_requests ADD COLUMN merge_commit_sha TEXT; -ALTER TABLE merge_requests ADD COLUMN squash_commit_sha TEXT; ``` ### 4.2 Config Extension @@ -881,14 +937,16 @@ When git integration is added: ### Migration Numbering -Phase B uses migration numbers starting at 010: +Phase B uses migration numbers 011–015. The original plan assumed migration 010 was available, but chunk config (`010_chunk_config.sql`) was implemented first, shifting everything by +1. -| Migration | Content | Gate | -|-----------|---------|------| -| 010 | Resource event tables, generic dependent fetch queue, entity_references | Gates 1, 2 | -| 011 | mr_file_changes, merge_commit_sha, squash_commit_sha | Gate 4 | - -Phase A's complete field capture migration should use 012+ when implemented, skipping fields already added by 011 (`merge_commit_sha`, `squash_commit_sha`). +| Migration | File | Content | Gate | +|-----------|------|---------|------| +| 011 | `011_resource_events.sql` | Resource event tables (state, label, milestone), entity_references, generic dependent fetch queue | Gates 1, 2 | +| 012 | `012_nullable_label_milestone.sql` | Make `label_name` and `milestone_title` nullable for deleted labels/milestones | Gate 1 (fix) | +| 013 | `013_resource_event_watermarks.sql` | Add `resource_events_synced_for_updated_at` to issues and merge_requests | Gate 1 (optimization) | +| 014 | `014_sync_runs_enrichment.sql` | Observability: `run_id`, `total_items_processed`, `total_errors` on sync_runs | Observability | +| 015 | `015_commit_shas_and_closes_watermark.sql` | `merge_commit_sha`, `squash_commit_sha`, `closes_issues_synced_for_updated_at` on merge_requests; `idx_label_events_label` index | Gates 2, 4 | +| TBD | — | `mr_file_changes` table for MR diff data | Gate 4 | ### Backward Compatibility @@ -909,7 +967,7 @@ Phase A's complete field capture migration should use 012+ when implemented, ski | GitLab diffs API returns large payloads | Low | Extract file metadata only, discard diff content | | Cross-reference graph traversal unbounded | Medium | BFS depth capped at configurable limit (default 1); `mentioned` edges excluded by default | | Cross-project references lost when target not synced | Medium | Unresolved references stored with `target_entity_id = NULL`; still appear in timeline output | -| Phase A migration numbering conflict | Low | Phase B uses 010-011; Phase A uses 012+ | +| Phase A migration numbering conflict | Low | Resolved: chunk config took 010; Phase B shifted to 011-015 | | Timeline output lacks "why" evidence | Medium | Evidence-bearing notes from FTS5 included as first-class timeline events | | Squash commits break blame-to-MR mapping | Medium | Tier 2 (git integration) deferred; Tier 1 uses file-level MR matching | diff --git a/migrations/015_commit_shas_and_closes_watermark.sql b/migrations/015_commit_shas_and_closes_watermark.sql new file mode 100644 index 0000000..0dec4f5 --- /dev/null +++ b/migrations/015_commit_shas_and_closes_watermark.sql @@ -0,0 +1,17 @@ +-- Migration 015: Add commit SHAs to merge_requests, closes_issues watermark, +-- and missing label_name index on resource_label_events. + +-- Commit SHAs link MRs to actual git history (needed for Gate 4: file-history, Gate 5: trace) +ALTER TABLE merge_requests ADD COLUMN merge_commit_sha TEXT; +ALTER TABLE merge_requests ADD COLUMN squash_commit_sha TEXT; + +-- Watermark for closes_issues sync (same pattern as resource_events_synced_for_updated_at) +-- Prevents re-fetching closes_issues for MRs that haven't changed since last sync +ALTER TABLE merge_requests ADD COLUMN closes_issues_synced_for_updated_at INTEGER; + +-- Missing index from original spec: enables efficient label-name filtering in timeline queries +CREATE INDEX IF NOT EXISTS idx_label_events_label ON resource_label_events(label_name); + +-- Update schema version +INSERT INTO schema_version (version, applied_at, description) +VALUES (15, strftime('%s', 'now') * 1000, 'Add commit SHAs, closes_issues watermark, and label event index'); diff --git a/src/gitlab/transformers/merge_request.rs b/src/gitlab/transformers/merge_request.rs index 0cb0470..9d274e6 100644 --- a/src/gitlab/transformers/merge_request.rs +++ b/src/gitlab/transformers/merge_request.rs @@ -24,6 +24,8 @@ pub struct NormalizedMergeRequest { pub closed_at: Option, pub last_seen_at: i64, pub web_url: String, + pub merge_commit_sha: Option, + pub squash_commit_sha: Option, } #[derive(Debug, Clone)] @@ -100,6 +102,8 @@ pub fn transform_merge_request( closed_at, last_seen_at: now_ms(), web_url: gitlab_mr.web_url.clone(), + merge_commit_sha: gitlab_mr.merge_commit_sha.clone(), + squash_commit_sha: gitlab_mr.squash_commit_sha.clone(), }, label_names: gitlab_mr.labels.clone(), assignee_usernames, diff --git a/src/gitlab/types.rs b/src/gitlab/types.rs index 9513b9d..1e7303e 100644 --- a/src/gitlab/types.rs +++ b/src/gitlab/types.rs @@ -247,4 +247,6 @@ pub struct GitLabMergeRequest { #[serde(default)] pub reviewers: Vec, pub web_url: String, + pub merge_commit_sha: Option, + pub squash_commit_sha: Option, } diff --git a/src/ingestion/merge_requests.rs b/src/ingestion/merge_requests.rs index 2dff3bd..46ed77b 100644 --- a/src/ingestion/merge_requests.rs +++ b/src/ingestion/merge_requests.rs @@ -180,8 +180,9 @@ fn process_mr_in_transaction( author_username, source_branch, target_branch, head_sha, references_short, references_full, detailed_merge_status, merge_user_username, created_at, updated_at, merged_at, closed_at, - last_seen_at, web_url, raw_payload_id - ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18, ?19, ?20, ?21, ?22) + last_seen_at, web_url, raw_payload_id, + merge_commit_sha, squash_commit_sha + ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18, ?19, ?20, ?21, ?22, ?23, ?24) ON CONFLICT(gitlab_id) DO UPDATE SET title = excluded.title, description = excluded.description, @@ -200,7 +201,9 @@ fn process_mr_in_transaction( closed_at = excluded.closed_at, last_seen_at = excluded.last_seen_at, web_url = excluded.web_url, - raw_payload_id = excluded.raw_payload_id", + raw_payload_id = excluded.raw_payload_id, + merge_commit_sha = excluded.merge_commit_sha, + squash_commit_sha = excluded.squash_commit_sha", params![ mr_row.gitlab_id, project_id, @@ -224,6 +227,8 @@ fn process_mr_in_transaction( now, &mr_row.web_url, payload_id, + &mr_row.merge_commit_sha, + &mr_row.squash_commit_sha, ], )?; @@ -368,7 +373,8 @@ fn reset_discussion_watermarks(conn: &Connection, project_id: i64) -> Result<()> SET discussions_synced_for_updated_at = NULL, discussions_sync_attempts = 0, discussions_sync_last_error = NULL, - resource_events_synced_for_updated_at = NULL + resource_events_synced_for_updated_at = NULL, + closes_issues_synced_for_updated_at = NULL WHERE project_id = ?", [project_id], )?; diff --git a/src/ingestion/orchestrator.rs b/src/ingestion/orchestrator.rs index 8dc53a8..d1ac576 100644 --- a/src/ingestion/orchestrator.rs +++ b/src/ingestion/orchestrator.rs @@ -785,9 +785,32 @@ fn update_resource_event_watermark( Ok(()) } +fn update_closes_issues_watermark(conn: &Connection, mr_local_id: i64) -> Result<()> { + conn.execute( + "UPDATE merge_requests SET closes_issues_synced_for_updated_at = updated_at WHERE id = ?", + [mr_local_id], + )?; + Ok(()) +} + fn enqueue_mr_closes_issues_jobs(conn: &Connection, project_id: i64) -> Result { - let mut stmt = - conn.prepare_cached("SELECT id, iid FROM merge_requests WHERE project_id = ?1")?; + // Remove stale jobs for MRs that haven't changed since their last closes_issues sync + conn.execute( + "DELETE FROM pending_dependent_fetches \ + WHERE project_id = ?1 AND entity_type = 'merge_request' AND job_type = 'mr_closes_issues' \ + AND entity_local_id IN ( \ + SELECT id FROM merge_requests \ + WHERE project_id = ?1 \ + AND updated_at <= COALESCE(closes_issues_synced_for_updated_at, 0) \ + )", + [project_id], + )?; + + let mut stmt = conn.prepare_cached( + "SELECT id, iid FROM merge_requests \ + WHERE project_id = ?1 \ + AND updated_at > COALESCE(closes_issues_synced_for_updated_at, 0)", + )?; let entities: Vec<(i64, i64)> = stmt .query_map([project_id], |row| Ok((row.get(0)?, row.get(1)?)))? .collect::, _>>()?; @@ -886,6 +909,7 @@ async fn drain_mr_closes_issues( match store_result { Ok(()) => { complete_job(conn, job.id)?; + update_closes_issues_watermark(conn, job.entity_local_id)?; result.fetched += 1; } Err(e) => { @@ -907,6 +931,7 @@ async fn drain_mr_closes_issues( "Permanent API error for closes_issues, marking complete" ); complete_job(conn, job.id)?; + update_closes_issues_watermark(conn, job.entity_local_id)?; result.skipped_not_found += 1; } else { warn!( diff --git a/tests/mr_transformer_tests.rs b/tests/mr_transformer_tests.rs index 8dd5b9b..dc08f6c 100644 --- a/tests/mr_transformer_tests.rs +++ b/tests/mr_transformer_tests.rs @@ -51,6 +51,8 @@ fn make_test_mr() -> GitLabMergeRequest { name: "Alice Wong".to_string(), }], web_url: "https://gitlab.example.com/group/project/-/merge_requests/42".to_string(), + merge_commit_sha: Some("merge1234567890".to_string()), + squash_commit_sha: None, } } @@ -98,6 +100,18 @@ fn transforms_mr_with_all_fields() { ); } +#[test] +fn preserves_commit_shas() { + let mr = make_test_mr(); + let result = transform_merge_request(&mr, 200).unwrap(); + + assert_eq!( + result.merge_request.merge_commit_sha, + Some("merge1234567890".to_string()) + ); + assert!(result.merge_request.squash_commit_sha.is_none()); +} + #[test] fn parses_timestamps_to_ms_epoch() { let mr = make_test_mr();