diff --git a/docs/prd-per-note-search.feedback-1.md b/docs/prd-per-note-search.feedback-1.md new file mode 100644 index 0000000..74fb019 --- /dev/null +++ b/docs/prd-per-note-search.feedback-1.md @@ -0,0 +1,174 @@ +Highest-impact gaps I see in the current plan: + +1. `for-issue` / `for-mr` filtering is ambiguous across projects and can return incorrect rows. +2. `lore notes` has no pagination contract, so large exports and deterministic resumption are weak. +3. Migration `022` is high-risk (table rebuild + FTS + junction tables) without explicit integrity gates. +4. Note-doc freshness is incomplete for upstream note deletions and parent metadata changes (labels/title). + +Below are my best revisions, each with rationale and a git-diff-style plan edit. + +--- + +1. **Add gated rollout + rollback controls** +Rationale: You can still “ship together” while reducing blast radius. This makes recovery fast if note-doc generation causes DB/embedding pressure. + +```diff +@@ ## Design +-Two phases, shipped together as one feature: ++Two phases, shipped together as one feature, but with runtime gates: ++ ++- `feature.notes_cli` (Phase 1 surface) ++- `feature.note_documents` (Phase 2 indexing/extraction path) ++ ++Rollout order: ++1) Enable `notes_cli` ++2) Run note-doc backfill in bounded batches ++3) Enable `note_documents` for continuous updates ++ ++Rollback: ++- Disabling `feature.note_documents` stops new note-doc generation without affecting issue/MR/discussion docs. +``` + +2. **Add keyset pagination + deterministic ordering** +Rationale: Needed for year-long reviewer analysis and reliable “continue where I left off” behavior under concurrent updates. 
+ +```diff +@@ pub struct NoteListFilters<'a> { + pub limit: usize, ++ pub cursor: Option<&'a str>, // keyset token "<created_at>:<id>" ++ pub include_total_count: bool, // avoid COUNT(*) in hot paths +@@ +- pub sort: &'a str, // "created" (default) | "updated" ++ pub sort: &'a str, // "created" | "updated" +@@ query_notes SQL +-ORDER BY {sort_column} {order} ++ORDER BY {sort_column} {order}, n.id {order} + LIMIT ? +``` + +3. **Make `for-issue` / `for-mr` project-scoped** +Rationale: IIDs are not globally unique. Requiring project avoids false positives and hard-to-debug cross-project leakage. + +```diff +@@ pub struct NotesArgs { +- #[arg(long = "for-issue", help_heading = "Filters", conflicts_with = "for_mr")] ++ #[arg(long = "for-issue", help_heading = "Filters", conflicts_with = "for_mr", requires = "project")] + pub for_issue: Option<u64>, +@@ +- #[arg(long = "for-mr", help_heading = "Filters", conflicts_with = "for_issue")] ++ #[arg(long = "for-mr", help_heading = "Filters", conflicts_with = "for_issue", requires = "project")] + pub for_mr: Option<u64>, +``` + +4. **Upgrade path filtering semantics** +Rationale: Review comments often reference renames/moves. Restricting to `position_new_path` misses relevant notes. + +```diff +@@ pub struct NotesArgs { +- /// Filter by file path (trailing / for prefix match) ++ /// Filter by file path + #[arg(long, help_heading = "Filters")] + pub path: Option<String>, ++ /// Path mode: exact|prefix|glob ++ #[arg(long = "path-mode", value_parser = ["exact","prefix","glob"], default_value = "exact", help_heading = "Filters")] ++ pub path_mode: String, ++ /// Match against old path as well as new path ++ #[arg(long = "match-old-path", help_heading = "Filters")] ++ pub match_old_path: bool, +@@ query_notes filter mappings +-- `path` ... n.position_new_path ... ++- `path` applies to `n.position_new_path` and optionally `n.position_old_path`. ++- `glob` mode translates `*`/`?` to SQL LIKE with escaping. +``` + +5. 
**Add explicit performance indexes (new migration)** +Rationale: `notes` becomes a first-class query surface; without indexes, filters degrade quickly at 10k+ note scale. + +```diff +@@ ## Phase 1: `lore notes` Command ++### Work Chunk 1E: Query Performance Indexes ++**Files:** `migrations/023_notes_query_indexes.sql`, `src/core/db.rs` ++ ++Add indexes: ++- `notes(project_id, created_at DESC, id DESC)` ++- `notes(author_username, created_at DESC, id DESC) WHERE is_system = 0` ++- `notes(discussion_id)` ++- `notes(position_new_path)` ++- `notes(position_old_path)` ++- `discussions(issue_id)` ++- `discussions(merge_request_id)` +``` + +6. **Harden migration 022 with transactional integrity checks** +Rationale: This is the riskiest part of the plan. Add hard fail-fast checks so corruption cannot silently pass. + +```diff +@@ ### Work Chunk 2A: Schema Migration (022) ++Migration safety requirements: ++- Execute in a single `BEGIN IMMEDIATE ... COMMIT` transaction. ++- Capture and compare pre/post row counts for `documents`, `document_labels`, `document_paths`, `dirty_sources`. ++- Run `PRAGMA foreign_key_check` and abort on any violation. ++- Run `PRAGMA integrity_check` and abort on non-`ok`. ++- Rebuild FTS and assert `documents_fts` rowcount equals `documents` rowcount. +``` + +7. **Add note deletion + parent-change propagation** +Rationale: Current plan handles create/update ingestion but not all staleness paths. Without this, note documents drift. + +```diff +@@ ## Phase 2: Per-Note Documents ++### Work Chunk 2G: Freshness Propagation ++**Files:** `src/ingestion/discussions.rs`, `src/ingestion/mr_discussions.rs`, `src/documents/regenerator.rs` ++ ++Rules: ++- If a previously stored note is missing from upstream payload, delete local note row and enqueue `(note, id)` for document deletion. ++- When parent issue/MR title or labels change, enqueue descendant note docs dirty (notes inherit parent metadata). ++- Keep idempotent behavior for repeated syncs. +``` + +8. 
**Separate FTS coverage from embedding coverage** +Rationale: Biggest cost/perf risk is embeddings. Index all notes in FTS, but embed selectively with policy knobs. + +```diff +@@ ## Estimated Document Volume Impact +-FTS5 handles this comfortably. Embedding generation time scales linearly (~4x increase). ++FTS5 handles this comfortably. Embedding generation is policy-controlled: ++- FTS: index all non-system note docs ++- Embeddings default: only notes with body length >= 40 chars (configurable) ++- Add config: `documents.note_embeddings.min_chars`, `documents.note_embeddings.enabled` ++- Prioritize unresolved DiffNotes before other notes during embedding backfill +``` + +9. **Bring structured reviewer profiling into scope (not narrative reporting)** +Rationale: This directly serves the stated use case and makes the feature compelling immediately. + +```diff +@@ ## Non-Goals +-- Adding a "reviewer profile" report command (that's a downstream use case built on this infrastructure) ++- Generating free-form narrative reviewer reports. ++ A structured profiling command is in scope. ++ ++## Phase 3: Structured Reviewer Profiling ++Add `lore notes profile --author <username> --since <window>` returning: ++- top commented paths ++- top parent labels ++- unresolved-comment ratio ++- note-type distribution ++- median comment length +``` + +10. **Add operational SLOs + robot-mode status for note pipeline** +Rationale: Reliability improves when regressions are observable, not inferred from failures. 
+ +```diff +@@ ## Verification Checklist ++Operational checks: ++- `lore -J stats` includes per-`source_type` document counts (including `note`) ++- Add queue lag metrics: oldest dirty note age, retry backlog size ++- Add extraction error breakdown by `source_type` ++- Add smoke assertion: disabling `feature.note_documents` leaves other source regeneration unaffected +``` + +--- + +If you want, I can produce a single consolidated revised PRD draft (fully merged text, not just diffs) as the next step. \ No newline at end of file diff --git a/docs/prd-per-note-search.feedback-2.md b/docs/prd-per-note-search.feedback-2.md new file mode 100644 index 0000000..9d99601 --- /dev/null +++ b/docs/prd-per-note-search.feedback-2.md @@ -0,0 +1,200 @@ +Below are the strongest revisions I’d make, excluding everything in your `## Rejected Recommendations` list. + +1. **Add a Phase 0 for stable note identity before any note-doc generation** +Rationale: your current plan still allows note document churn because Issue discussion ingestion is delete/reinsert-based. That makes local `notes.id` unstable, causing unnecessary dirtying/regeneration and potential stale-doc edge cases. Stabilizing identity first (upsert-by-GitLab-ID + sweep stale) improves correctness and cuts repeated work. 
+ +```diff +@@ ## Design +-Two phases, shipped together as one feature: ++Three phases, shipped together as one feature: ++- **Phase 0 (Foundation):** Stable note identity in local DB (upsert + sweep, no delete/reinsert churn) + - **Phase 1 (Option A):** `lore notes` command — direct SQL query over the `notes` table with rich filtering + - **Phase 2 (Option B):** Per-note documents — each non-system note becomes its own searchable document in the FTS/embedding pipeline +@@ ++## Phase 0: Stable Note Identity ++ ++### Work Chunk 0A: Upsert/Sweep for Issue Discussion Notes ++**Files:** `src/ingestion/discussions.rs`, `migrations/022_notes_identity_index.sql`, `src/core/db.rs` ++**Implementation:** ++- Add unique index: `UNIQUE(project_id, gitlab_id)` on `notes` ++- Replace delete/reinsert issue-note flow with upsert + `last_seen_at` sweep (same durability model as MR note sweep) ++- Ensure `insert_note/upsert_note` returns the stable local row id for both insert and update paths +``` + +2. **Replace `source_type` CHECK constraints with a registry table + FK in migration** +Rationale: table CHECKs force full table rebuild for every new source type forever. A `source_types` table with FK keeps DB-level integrity and future extensibility without rebuilding `documents`/`dirty_sources` every time. This is a major architecture hardening win. + +```diff +@@ ### Work Chunk 2A: Schema Migration (023) +-Current migration ... CHECK constraints limiting `source_type` ... ++Current migration ... CHECK constraints limiting `source_type` ... ++Revision: migrate to `source_types` registry table + FK constraints. +@@ +-1. `dirty_sources` — add `'note'` to source_type CHECK +-2. `documents` — add `'note'` to source_type CHECK ++1. Create `source_types(name TEXT PRIMARY KEY)` and seed: `issue, merge_request, discussion, note` ++2. Rebuild `dirty_sources` and `documents` to replace CHECK with `REFERENCES source_types(name)` ++3. 
Future source-type additions become `INSERT INTO source_types(name) VALUES (?)` (no table rebuild) +@@ ++#### Additional integrity tests ++#[test] ++fn test_source_types_registry_contains_note() { ... } ++#[test] ++fn test_documents_source_type_fk_enforced() { ... } ++#[test] ++fn test_dirty_sources_source_type_fk_enforced() { ... } +``` + +3. **Mark note documents dirty only when note semantics actually changed** +Rationale: current loops mark every non-system note dirty every sync. With 8k+ notes this creates avoidable queue pressure and regeneration time. Change-aware dirtying (inserted/changed only) gives major performance and stability improvements. + +```diff +@@ ### Work Chunk 2D: Regenerator & Dirty Tracking Integration +-for note in notes { +- let local_note_id = insert_note(&tx, local_discussion_id, &note, None)?; +- if !note.is_system { +- dirty_tracker::mark_dirty_tx(&tx, SourceType::Note, local_note_id)?; +- } +-} ++for note in notes { ++ let outcome = upsert_note(&tx, local_discussion_id, &note, None)?; ++ if !note.is_system && outcome.changed_semantics { ++ dirty_tracker::mark_dirty_tx(&tx, SourceType::Note, outcome.local_note_id)?; ++ } ++} +@@ ++// changed_semantics should include: body, note_type, path/line positions, resolvable/resolved/resolved_by, updated_at +``` + +4. **Expand filters to support real analysis windows and resolution state** +Rationale: reviewer profiling usually needs bounded windows and both resolved/unresolved views. Current `unresolved: bool` is too narrow and one-sided. Add `--until` and tri-state resolution filtering for better analytical power. 
+ +```diff +@@ pub struct NoteListFilters<'a> { +- pub since: Option<&'a str>, ++ pub since: Option<&'a str>, ++ pub until: Option<&'a str>, +@@ +- pub unresolved: bool, ++ pub resolution: &'a str, // "any" (default) | "unresolved" | "resolved" +@@ +- pub author: Option<&'a str>, ++ pub author: Option<&'a str>, // case-insensitive match +@@ +- // Filter by time (7d, 2w, 1m, or YYYY-MM-DD) ++ // Filter by start time (7d, 2w, 1m, or YYYY-MM-DD) + pub since: Option<String>, ++ /// Filter by end time (7d, 2w, 1m, or YYYY-MM-DD) ++ #[arg(long, help_heading = "Filters")] ++ pub until: Option<String>, +@@ +- /// Only show unresolved review comments +- pub unresolved: bool, ++ /// Resolution filter: any, unresolved, resolved ++ #[arg(long, value_parser = ["any", "unresolved", "resolved"], default_value = "any", help_heading = "Filters")] ++ pub resolution: String, +``` + +5. **Broaden index strategy to match actual query shapes, not just author queries** +Rationale: `idx_notes_user_created` helps one path, but common usage also includes project+time scans and unresolved filters. Add two more partial composites for high-selectivity paths. + +```diff +@@ ### Work Chunk 1E: Composite Query Index + CREATE INDEX IF NOT EXISTS idx_notes_user_created + ON notes(project_id, author_username, created_at DESC, id DESC) + WHERE is_system = 0; ++ ++CREATE INDEX IF NOT EXISTS idx_notes_project_created ++ON notes(project_id, created_at DESC, id DESC) ++WHERE is_system = 0; ++ ++CREATE INDEX IF NOT EXISTS idx_notes_unresolved_project_created ++ON notes(project_id, created_at DESC, id DESC) ++WHERE is_system = 0 AND resolvable = 1 AND resolved = 0; +@@ ++#[test] ++fn test_notes_query_plan_uses_project_created_index_for_default_listing() { ... } ++#[test] ++fn test_notes_query_plan_uses_unresolved_index_when_resolution_unresolved() { ... } +``` + +6. **Improve per-note document payload with structured metadata header + minimal thread context** +Rationale: isolated single-note docs can lose meaning. 
A small structured header plus lightweight context (parent + one preceding note excerpt) improves semantic retrieval quality substantially without re-bundling full threads. + +```diff +@@ ### Work Chunk 2C: Note Document Extractor +-// 6. Format content: +-// [[Note]] {note_type or "Comment"} on {parent_type_prefix}: {parent_title} +-// Project: {path_with_namespace} +-// URL: {url} +-// Author: @{author} +-// Date: {format_date(created_at)} +-// Labels: {labels_json} +-// File: {position_new_path}:{position_new_line} (if DiffNote) +-// +-// --- Body --- +-// +-// {body} ++// 6. Format content with machine-readable header: ++// [[Note]] ++// source_type: note ++// note_gitlab_id: {gitlab_id} ++// project: {path_with_namespace} ++// parent_type: {Issue|MergeRequest} ++// parent_iid: {iid} ++// note_type: {DiffNote|DiscussionNote|Comment} ++// author: @{author} ++// created_at: {iso8601} ++// resolved: {true|false} ++// path: {position_new_path}:{position_new_line} ++// url: {url} ++// ++// --- Context --- ++// parent_title: {title} ++// previous_note_excerpt: {optional, max 200 chars} ++// ++// --- Body --- ++// {body} +``` + +7. **Add first-class export modes for downstream profiling pipelines** +Rationale: this makes the feature much more useful immediately (LLM prompts, notebook analysis, external scripts) without adding a profiling command. It stays within your non-goals and increases adoption. + +```diff +@@ pub struct NotesArgs { ++ /// Output format ++ #[arg(long, value_parser = ["table", "json", "jsonl", "csv"], default_value = "table", help_heading = "Output")] ++ pub format: String, +@@ +- if robot_mode { ++ if robot_mode || args.format == "json" || args.format == "jsonl" || args.format == "csv" { + print_list_notes_json(...) 
+ } else { + print_list_notes(&result); + } +@@ ### Work Chunk 1C: Human & Robot Output Formatting ++Add `print_list_notes_csv()` and `print_list_notes_jsonl()`: ++- CSV columns mirror `NoteListRowJson` field names ++- JSONL emits one note object per line for streaming pipelines +``` + +8. **Strengthen verification with idempotence + migration data-preservation checks** +Rationale: this feature touches ingestion, migrations, indexing, and regeneration. Add explicit idempotence/perf checks so regressions surface early. + +```diff +@@ ## Verification Checklist + cargo test + cargo clippy --all-targets -- -D warnings + cargo fmt --check ++cargo test test_note_ingestion_idempotent_across_two_syncs ++cargo test test_note_document_count_stable_after_second_generate_docs_full +@@ ++lore sync ++lore generate-docs --full ++lore -J stats > /tmp/stats1.json ++lore generate-docs --full ++lore -J stats > /tmp/stats2.json ++# assert note doc count unchanged and dirty queue drains to zero +``` + +If you want, I can turn this into a fully rewritten PRD v2 draft with these changes merged in-place and renumbered work chunks end-to-end. \ No newline at end of file diff --git a/docs/prd-per-note-search.feedback-3.md b/docs/prd-per-note-search.feedback-3.md new file mode 100644 index 0000000..f7f19b7 --- /dev/null +++ b/docs/prd-per-note-search.feedback-3.md @@ -0,0 +1,162 @@ +These are the highest-impact revisions I’d make. They avoid everything in your `## Rejected Recommendations` list. + +1. Add immediate note-document deletion propagation (don’t wait for `generate-docs --full`) +Why: right now, deleted notes can leave stale `source_type='note'` documents until a full rebuild. That creates incorrect search/reporting results and weakens trust in the dataset. +```diff +@@ Phase 0: Stable Note Identity ++### Work Chunk 0B: Immediate Deletion Propagation ++ ++When sweep deletes stale notes, propagate deletion to documents in the same transaction. 
++Do not rely on eventual cleanup via `generate-docs --full`. ++ ++#### Tests to Write First ++#[test] ++fn test_issue_note_sweep_deletes_note_documents_immediately() { ... } ++#[test] ++fn test_mr_note_sweep_deletes_note_documents_immediately() { ... } ++ ++#### Implementation ++Use `DELETE ... RETURNING id, is_system` in note sweep functions. ++For returned non-system note ids: ++1) `DELETE FROM documents WHERE source_type='note' AND source_id=?` ++2) `DELETE FROM dirty_sources WHERE source_type='note' AND source_id=?` +``` + +2. Add one-time upgrade backfill for existing notes (migration 024) +Why: existing DBs will otherwise only get note-documents for changed/new notes. Historical notes remain invisible unless users manually run full rebuild. +```diff +@@ Phase 2: Per-Note Documents ++### Work Chunk 2H: Backfill Existing Notes After Upgrade (Migration 024) ++ ++Create migration `024_note_dirty_backfill.sql`: ++INSERT INTO dirty_sources (source_type, source_id, queued_at) ++SELECT 'note', n.id, unixepoch('now') * 1000 ++FROM notes n ++LEFT JOIN documents d ++ ON d.source_type='note' AND d.source_id=n.id ++WHERE n.is_system=0 AND d.id IS NULL ++ON CONFLICT(source_type, source_id) DO NOTHING; ++ ++Add migration test asserting idempotence and expected queue size. +``` + +3. Fix `--since/--until` semantics and validation +Why: reusing `parse_since` for `until` creates ambiguous windows and off-by-boundary behavior; your own example `--since 90d --until 180d` is chronologically reversed. +```diff +@@ Work Chunk 1A: Data Types & Query Layer +- since: parse_since(since_str) then n.created_at >= ? +- until: parse_since(until_str) then n.created_at <= ? ++ since: parse_since_start_bound(since_str) then n.created_at >= ? ++ until: parse_until_end_bound(until_str) then n.created_at <= ? ++ Validate since <= until; otherwise return a clear user error. ++ ++#### Tests to Write First ++#[test] fn test_query_notes_invalid_time_window_rejected() { ... 
} ++#[test] fn test_query_notes_until_date_is_end_of_day_inclusive() { ... } +``` + +4. Separate semantic-change detection from housekeeping updates +Why: current proposed `WHERE` includes `updated_at`, which will cause unnecessary dirty churn. You want `last_seen_at` to always refresh, but regeneration only when searchable semantics changed. +```diff +@@ Work Chunk 0A: Upsert/Sweep for Issue Discussion Notes +- OR notes.updated_at IS NOT excluded.updated_at ++ -- updated_at-only changes should not mark semantic dirty ++ ++Perform two-step logic: ++1) Upsert always updates persistence/housekeeping fields (`updated_at`, `last_seen_at`). ++2) `changed_semantics` is computed only from fields used by note documents/search filters ++ (body, note_type, resolved flags, paths, author, parent linkage). ++ ++#### Tests to Write First ++#[test] ++fn test_issue_note_upsert_updated_at_only_does_not_mark_semantic_change() { ... } +``` + +5. Make indexes align with actual query collation and join strategy +Why: `author` uses `COLLATE NOCASE`; without collation-aware index, SQLite can skip index use. Also, IID filters via scalar subqueries are harder for planner than direct join predicates. +```diff +@@ Work Chunk 1E: Composite Query Index +-CREATE INDEX ... ON notes(project_id, author_username, created_at DESC, id DESC) WHERE is_system = 0; ++CREATE INDEX ... ON notes(project_id, author_username COLLATE NOCASE, created_at DESC, id DESC) WHERE is_system = 0; ++ ++CREATE INDEX IF NOT EXISTS idx_discussions_issue_id ON discussions(issue_id); ++CREATE INDEX IF NOT EXISTS idx_discussions_mr_id ON discussions(merge_request_id); +``` + +```diff +@@ Work Chunk 1A: query_notes() +- d.issue_id = (SELECT id FROM issues WHERE iid = ? AND project_id = ?) ++ i.iid = ? AND i.project_id = ? +- d.merge_request_id = (SELECT id FROM merge_requests WHERE iid = ? AND project_id = ?) ++ m.iid = ? AND m.project_id = ? +``` + +6. 
Replace manual CSV escaping with `csv` crate +Why: manual RFC4180 escaping is fragile (quotes/newlines/multi-byte edge cases). This is exactly where a mature library reduces long-term bug risk. +```diff +@@ Work Chunk 1C: Human & Robot Output Formatting +- Uses a minimal CSV writer (no external dependency — the format is simple enough for manual escaping). ++ Uses `csv::Writer` for RFC4180-compliant escaping and stable output across edge cases. ++ ++#### Tests to Write First ++#[test] fn test_csv_output_multiline_and_quotes_roundtrip() { ... } +``` + +7. Add `--contains` lexical body filter to `lore notes` +Why: useful middle ground between metadata filtering and semantic search; great for reviewer-pattern mining without requiring FTS query syntax. +```diff +@@ Work Chunk 1B: CLI Arguments & Command Wiring ++/// Filter by case-insensitive substring in note body ++#[arg(long, help_heading = "Filters")] ++pub contains: Option<String>; +``` + +```diff +@@ Work Chunk 1A: NoteListFilters ++ pub contains: Option<&'a str>, +@@ query_notes dynamic filters ++ if contains.is_some() { ++ where_clauses.push("n.body LIKE ? COLLATE NOCASE"); ++ params.push(format!("%{}%", escape_like(contains.unwrap()))); ++ } +``` + +8. Reduce note-document embedding noise by slimming metadata header +Why: current verbose key-value header repeats low-signal tokens and consumes embedding budget. +```diff +@@ Work Chunk 2C: Note Document Extractor +- Build content with structured metadata header: +- [[Note]] +- source_type: note +- note_gitlab_id: ... +- project: ... +- ... +- --- Body --- +- {body} ++ Build content with compact, high-signal layout: ++ [[Note]] ++ @{author} on {Issue#|MR!}{iid} in {project_path} ++ path: {path:line} (only when available) ++ state: {resolved|unresolved} (only when resolvable) ++ ++ {body} ++ ++Keep detailed metadata in structured document columns/labels/paths/url, ++not repeated in verbose text. +``` + +9. 
Add explicit performance regression checks for the new hot paths +Why: this feature increases document volume ~4x; you should pin acceptable query behavior now so future changes don’t silently degrade. +```diff +@@ Verification Checklist ++Performance/plan checks: ++1) `EXPLAIN QUERY PLAN` for: ++ - author+since query ++ - project+date query ++ - for-mr / for-issue query ++2) Seed 50k-note synthetic fixture and assert: ++ - `lore notes --author ... --limit 100` stays under agreed local threshold ++ - `lore search --type note ...` remains deterministic and completes successfully +``` + +If you want, I can also provide a fully merged “iteration 3” PRD text with these edits applied end-to-end so you can drop it in directly. \ No newline at end of file diff --git a/docs/prd-per-note-search.feedback-4.md b/docs/prd-per-note-search.feedback-4.md new file mode 100644 index 0000000..d1016c7 --- /dev/null +++ b/docs/prd-per-note-search.feedback-4.md @@ -0,0 +1,187 @@ +1. **Canonical note identity for documents: use `notes.gitlab_id` as `source_id`** +Why this is better: the current plan still couples document identity to local row IDs. Even with upsert+sweep, local IDs are a storage artifact and can be reused in edge cases. Using GitLab note IDs as canonical document IDs makes regeneration, backfill, and deletion propagation more stable and portable. + +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Phase 0: Stable Note Identity +-Phase 2 depends on `notes.id` as the `source_id` for note documents. ++Phase 2 uses `notes.gitlab_id` as the `source_id` for note documents. ++`notes.id` remains an internal relational key only. 
+ +@@ Work Chunk 0A + pub struct NoteUpsertOutcome { + pub local_note_id: i64, ++ pub document_source_id: i64, // notes.gitlab_id + pub changed_semantics: bool, + } + +@@ Work Chunk 2D +-if !note.is_system && outcome.changed_semantics { +- dirty_tracker::mark_dirty_tx(&tx, SourceType::Note, outcome.local_note_id)?; ++if !note.is_system && outcome.changed_semantics { ++ dirty_tracker::mark_dirty_tx(&tx, SourceType::Note, outcome.document_source_id)?; + } + +@@ Work Chunk 2E +-SELECT 'note', n.id, ?1 ++SELECT 'note', n.gitlab_id, ?1 + +@@ Work Chunk 2H +-ON d.source_type = 'note' AND d.source_id = n.id ++ON d.source_type = 'note' AND d.source_id = n.gitlab_id +``` + +2. **Prevent false deletions on partial/incomplete syncs** +Why this is better: sweep-based deletion is correct only when a discussion’s notes were fully fetched. If a page fails mid-fetch, current logic can incorrectly delete valid notes. Add an explicit “fetch complete” guard before sweep. + +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Phase 0 ++### Work Chunk 0C: Sweep Safety Guard (Partial Fetch Protection) ++ ++Only run stale-note sweep when note pagination completed successfully for that discussion. ++If fetch is partial/interrupted, skip sweep and keep prior notes intact. + ++#### Tests to Write First ++#[test] ++fn test_partial_fetch_does_not_sweep_notes() { /* ... */ } ++ ++#[test] ++fn test_complete_fetch_runs_sweep_notes() { /* ... */ } + ++#### Implementation ++if discussion_fetch_complete { ++ sweep_stale_issue_notes(...)?; ++} else { ++ tracing::warn!("Skipping stale sweep for discussion {} due to partial fetch", discussion_gitlab_id); ++} +``` + +3. **Make deletion propagation set-based (not per-note loop)** +Why this is better: the current per-note DELETE loop is O(N) statements and gets slow on large threads. A temp-table/CTE set-based delete is faster, simpler to reason about, and remains atomic. 
+ +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Work Chunk 0B Implementation +- for note_id in stale_note_ids { +- conn.execute("DELETE FROM documents WHERE source_type = 'note' AND source_id = ?", [note_id])?; +- conn.execute("DELETE FROM dirty_sources WHERE source_type = 'note' AND source_id = ?", [note_id])?; +- } ++ CREATE TEMP TABLE _stale_note_source_ids(source_id INTEGER PRIMARY KEY) WITHOUT ROWID; ++ INSERT INTO _stale_note_source_ids ++ SELECT gitlab_id ++ FROM notes ++ WHERE discussion_id = ? AND last_seen_at < ? AND is_system = 0; ++ ++ DELETE FROM notes ++ WHERE discussion_id = ? AND last_seen_at < ?; ++ ++ DELETE FROM documents ++ WHERE source_type = 'note' ++ AND source_id IN (SELECT source_id FROM _stale_note_source_ids); ++ ++ DELETE FROM dirty_sources ++ WHERE source_type = 'note' ++ AND source_id IN (SELECT source_id FROM _stale_note_source_ids); ++ ++ DROP TABLE _stale_note_source_ids; +``` + +4. **Fix project-scoping and time-window semantics in `lore notes`** +Why this is better: the plan currently has a contradiction: clap `requires = "project"` blocks use of `defaultProject`, while query layer says default fallback is allowed. Also, `since/until` parsing should use one shared “now” to avoid subtle drift and inverted windows. + +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Work Chunk 1B NotesArgs +-#[arg(long = "for-issue", ..., requires = "project")] ++#[arg(long = "for-issue", ...)] + pub for_issue: Option; + +-#[arg(long = "for-mr", ..., requires = "project")] ++#[arg(long = "for-mr", ...)] + pub for_mr: Option; + +@@ Work Chunk 1A Query Notes +-- `since`: `parse_since(since_str)` then `n.created_at >= ?` +-- `until`: `parse_since(until_str)` then `n.created_at <= ?` ++- Parse `since` and `until` with a single anchored `now_ms` captured once per command. ++- If user supplies `YYYY-MM-DD` for `--until`, interpret as end-of-day (23:59:59.999 UTC). ++- Validate `since <= until` after both parse with same anchor. +``` + +5. 
**Add an analytics mode (not a profile command): `lore notes --aggregate`** +Why this is better: this directly supports the stated use case (review patterns) without introducing the rejected “profile report” command. It keeps scope narrow and reuses existing filters. + +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Phase 1 ++### Work Chunk 1F: Aggregation Mode for Notes Listing ++ ++Add optional aggregation on top of `lore notes`: ++- `--aggregate author|note_type|path|resolution` ++- `--top N` (default 20) ++ ++Behavior: ++- Reuses all existing filters (`--since`, `--project`, `--for-mr`, etc.) ++- Returns grouped counts (+ percentage of filtered corpus) ++- Works in table/json/jsonl/csv ++ ++Non-goal alignment: ++- This is not a narrative “reviewer profile” command. ++- It is a query primitive for downstream analysis. +``` + +6. **Prevent note backfill from starving other document regeneration** +Why this is better: after migration/backfill, note dirty entries can dominate the queue and delay issue/MR/discussion updates. Add source-type fairness in regenerator scheduling. + +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Work Chunk 2D ++#### Scheduling Revision ++Process dirty sources with weighted fairness instead of strict FIFO: ++- issue: 3 ++- merge_request: 3 ++- discussion: 2 ++- note: 1 ++ ++Implementation sketch: ++- fetch next batch by source_type buckets ++- interleave according to weights ++- preserve retry semantics per source + ++#### Tests to Write First ++#[test] ++fn test_note_backfill_does_not_starve_issue_and_mr_regeneration() { /* ... */ } +``` + +7. **Harden migration 023: remove invalid SQL assertions and move integrity checks to tests** +Why this is better: `RAISE(ABORT, ...)` in standalone `SELECT` is not valid SQLite usage outside triggers/check expressions. Keep migration SQL minimal/portable and enforce invariants in migration tests. 
+ +```diff +--- a/PRD.md ++++ b/PRD.md +@@ Work Chunk 2A Migration SQL +--- Step 10: Integrity verification +-SELECT CASE +- WHEN ... THEN RAISE(ABORT, '...') +-END; ++-- Step 10 removed from SQL migration. ++-- Integrity verification is enforced in migration tests: ++-- 1) pre/post row-count equality ++-- 2) `PRAGMA foreign_key_check` is empty ++-- 3) documents_fts row count matches documents row count after rebuild + +@@ Work Chunk 2A Tests ++#[test] ++fn test_migration_023_integrity_checks_pass() { ++ // pre/post counts, foreign_key_check empty, fts parity ++} +``` + +These 7 revisions improve correctness under failure, reduce churn risk, improve large-sync performance, and make the feature materially more useful for reviewer-analysis workflows without reintroducing any rejected recommendations. \ No newline at end of file diff --git a/docs/prd-per-note-search.feedback-5.md b/docs/prd-per-note-search.feedback-5.md new file mode 100644 index 0000000..9c03177 --- /dev/null +++ b/docs/prd-per-note-search.feedback-5.md @@ -0,0 +1,190 @@ +Here are the highest-impact revisions I’d make. None of these repeat anything in your `## Rejected Recommendations`. + +1. **Add immutable reviewer identity (`author_id`) as a first-class key** +Why this improves the plan: the PRD’s core use case is year-scale reviewer profiling. Usernames are mutable in GitLab, so username-only filtering will fragment one reviewer into multiple identities over time. Adding `author_id` closes that correctness hole and makes historical analysis reliable. + +```diff +@@ Problem Statement +-1. **Query individual notes by author** — the `--author` filter on `lore search` only matches the first note's author per discussion thread ++1. 
**Query individual notes by reviewer identity** — support both mutable username and immutable GitLab `author_id` for stable longitudinal analysis + +@@ Phase 0: Stable Note Identity ++### Work Chunk 0D: Immutable Author Identity Capture ++**Files:** `migrations/025_notes_author_id.sql`, `src/ingestion/discussions.rs`, `src/ingestion/mr_discussions.rs`, `src/cli/commands/list.rs` ++ ++#### Implementation ++- Add nullable `notes.author_id INTEGER` and backfill from future syncs. ++- Populate `author_id` from GitLab note payload (`note.author.id`) on both issue and MR note ingestion paths. ++- Add `--author-id <id>` filter to `lore notes`. ++- Keep `--author` for ergonomics; when both provided, require both to match. ++ ++#### Indexing ++- Add `idx_notes_author_id_created ON notes(project_id, author_id, created_at DESC, id DESC) WHERE is_system = 0;` ++ ++#### Tests ++- `test_query_notes_filter_author_id_survives_username_change` ++- `test_query_notes_author_and_author_id_intersection` +``` + +2. **Strengthen partial-fetch safety from a boolean to an explicit fetch state contract** +Why this improves the plan: `fetch_complete: bool` is easy to misuse and fragile under retries/crashes. A run-scoped state model makes sweep correctness auditable and prevents accidental deletions when ingestion aborts midway. + +```diff +@@ Phase 0: Stable Note Identity +-### Work Chunk 0C: Sweep Safety Guard (Partial Fetch Protection) ++### Work Chunk 0C: Sweep Safety Guard with Run-Scoped Fetch State + +@@ Implementation +-Add a `fetch_complete` parameter to the discussion ingestion functions. Only run the stale-note sweep when the fetch completed successfully: ++Add a run-scoped fetch state: ++- `FetchState::Complete` ++- `FetchState::Partial` ++- `FetchState::Failed` ++ ++Only run sweep on `FetchState::Complete`. ++Persist `run_seen_at` once per sync run and pass unchanged through all discussion/note upserts. 
++Require `run_seen_at` monotonicity per discussion before sweep (skip and warn otherwise). + +@@ Tests to Write First ++#[test] ++fn test_failed_fetch_never_sweeps_even_after_partial_upserts() { ... } ++#[test] ++fn test_non_monotonic_run_seen_at_skips_sweep() { ... } ++#[test] ++fn test_retry_after_failed_fetch_then_complete_sweeps_correctly() { ... } +``` + +3. **Add DB-level cleanup triggers for note-document referential integrity** +Why this improves the plan: Work Chunk 0B handles the sweep path, but not every possible delete path. DB triggers give defense-in-depth so stale note docs cannot survive even if a future code path deletes notes differently. + +```diff +@@ Work Chunk 0B: Immediate Deletion Propagation +-Update both sweep functions to propagate deletion to documents and dirty_sources using set-based SQL ++Keep set-based SQL in sweep functions, and add DB-level cleanup triggers as a safety net. + +@@ Work Chunk 2A: Schema Migration (023) ++-- Cleanup trigger: deleting a non-system note must delete note document + dirty queue row ++CREATE TRIGGER notes_ad_cleanup AFTER DELETE ON notes ++WHEN old.is_system = 0 ++BEGIN ++ DELETE FROM documents ++ WHERE source_type = 'note' AND source_id = old.id; ++ DELETE FROM dirty_sources ++ WHERE source_type = 'note' AND source_id = old.id; ++END; ++ ++-- Cleanup trigger: if note flips to system, remove its document artifacts ++CREATE TRIGGER notes_au_system_cleanup AFTER UPDATE OF is_system ON notes ++WHEN old.is_system = 0 AND new.is_system = 1 ++BEGIN ++ DELETE FROM documents ++ WHERE source_type = 'note' AND source_id = new.id; ++ DELETE FROM dirty_sources ++ WHERE source_type = 'note' AND source_id = new.id; ++END; +``` + +4. **Eliminate N+1 extraction cost with parent metadata caching in regeneration** +Why this improves the plan: backfilling ~8k notes with per-note parent/label lookups creates avoidable query amplification. 
Batch caching turns repeated joins into one-time lookups per parent entity and materially reduces rebuild time. + +```diff +@@ Phase 2: Per-Note Documents ++### Work Chunk 2I: Batch Parent Metadata Cache for Note Regeneration ++**Files:** `src/documents/regenerator.rs`, `src/documents/extractor.rs` ++ ++#### Implementation ++- Add `NoteExtractionContext` cache keyed by `(noteable_type, parent_id)` containing: ++ - parent iid/title/url ++ - parent labels ++ - project path ++- In batch regeneration, prefetch parent metadata for note IDs in the current chunk. ++- Use cached metadata in `extract_note_document()` to avoid repeated parent/label queries. ++ ++#### Tests ++- `test_note_regeneration_uses_parent_cache_consistently` ++- `test_note_regeneration_cache_hit_preserves_hash_determinism` +``` + +5. **Add embedding dedup cache keyed by semantic text hash** +Why this improves the plan: note docs will contain repeated short comments (“LGTM”, “nit: …”). Current doc-level hashing includes metadata, so identical semantic comments still re-embed many times. A semantic embedding hash cache cuts cost and speeds full rebuild/backfill without changing search behavior. + +```diff +@@ Phase 2: Per-Note Documents ++### Work Chunk 2J: Semantic Embedding Dedup for Notes ++**Files:** `migrations/026_embedding_cache.sql`, embedding pipeline module(s), `src/documents/extractor.rs` ++ ++#### Implementation ++- Compute `embedding_text` for notes as: normalized note body + compact stable context (`parent_type`, `path`, `resolution`), excluding volatile fields. ++- Compute `embedding_hash = sha256(embedding_text)`. ++- Before embedding generation, lookup existing vector by `(model, embedding_hash)`. ++- Reuse cached vector when present; only call embedding model on misses. ++ ++#### Tests ++- `test_identical_note_bodies_reuse_embedding_vector` ++- `test_embedding_hash_changes_when_semantic_context_changes` +``` + +6. 
**Add deterministic review-signal tags as derived labels** +Why this improves the plan: this makes output immediately more useful for reviewer-pattern analysis without adding a profile command (which is explicitly out of scope). It increases practical value of both `lore notes` and `lore search --type note` with low complexity. + +```diff +@@ Non-Goals +-- Adding a "reviewer profile" report command (that's a downstream use case built on this infrastructure) ++- Adding a "reviewer profile" report command (downstream), while allowing low-level derived signal tags as indexing primitives + +@@ Phase 2: Per-Note Documents ++### Work Chunk 2K: Derived Review Signal Labels ++**Files:** `src/documents/extractor.rs` ++ ++#### Implementation ++- Derive deterministic labels from note text + metadata: ++ - `signal:nit` ++ - `signal:blocking` ++ - `signal:security` ++ - `signal:performance` ++ - `signal:testing` ++- Attach via existing `document_labels` flow for note documents. ++- No new CLI mode required; existing label filters can consume these labels. ++ ++#### Tests ++- `test_note_document_derives_signal_labels_nit` ++- `test_note_document_derives_signal_labels_security` ++- `test_signal_label_derivation_is_deterministic` +``` + +7. **Add high-precision note targeting filters (`--note-id`, `--gitlab-note-id`, `--discussion-id`)** +Why this improves the plan: debugging, incident response, and reproducibility all benefit from exact addressing. This is especially useful when validating sync correctness and cross-checking a specific note/document lifecycle. 
+ +```diff +@@ Work Chunk 1B: CLI Arguments & Command Wiring + pub struct NotesArgs { ++ /// Filter by local note row id ++ #[arg(long = "note-id", help_heading = "Filters")] ++ pub note_id: Option<i64>, ++ ++ /// Filter by GitLab note id ++ #[arg(long = "gitlab-note-id", help_heading = "Filters")] ++ pub gitlab_note_id: Option<i64>, ++ ++ /// Filter by local discussion id ++ #[arg(long = "discussion-id", help_heading = "Filters")] ++ pub discussion_id: Option<i64>, + } + +@@ Work Chunk 1A: Filter struct + pub struct NoteListFilters<'a> { ++ pub note_id: Option<i64>, ++ pub gitlab_note_id: Option<i64>, ++ pub discussion_id: Option<i64>, + } + +@@ Tests to Write First ++#[test] ++fn test_query_notes_filter_note_id_exact() { ... } ++#[test] ++fn test_query_notes_filter_gitlab_note_id_exact() { ... } ++#[test] ++fn test_query_notes_filter_discussion_id_exact() { ... } +``` + +If you want, I can produce a single consolidated “iteration 5” PRD diff that merges these into your exact section ordering and updates the dependency graph/migration numbering end-to-end. \ No newline at end of file diff --git a/docs/prd-per-note-search.md b/docs/prd-per-note-search.md new file mode 100644 index 0000000..e2870b3 --- /dev/null +++ b/docs/prd-per-note-search.md @@ -0,0 +1,2371 @@ +--- +plan: true +title: "Per-Note Search & Reviewer Profiling" +status: iterating +iteration: 6 +target_iterations: 8 +beads_revision: 0 +related_plans: [] +created: 2026-02-11 +updated: 2026-02-12 +--- + +# PRD: Per-Note Search & Reviewer Profiling + +## Problem Statement + +Lore ingests all GitLab discussion notes with full metadata (author, body, diff positions, timestamps), but the data is only accessible through aggregated discussion documents. There is no way to: + +1. **Query individual notes by author** — the `--author` filter on `lore search` only matches the first note's author per discussion thread, and relies solely on mutable usernames (no immutable author identity for longitudinal analysis) +2. 
**List raw notes with metadata** — no CLI surface exposes the `notes` table directly +3. **Semantically search individual comments** — notes are bundled into thread documents, diluting per-note relevance + +**Use case:** "Search through jdefting's code review comments over the past year to build a comprehensive report of their code smell preferences and review patterns." + +## Design + +Three phases, shipped together as one feature: + +- **Phase 0 (Foundation):** Stable note identity — unify issue discussion note ingestion to use upsert+sweep (matching the MR pattern), ensuring `notes.id` is stable across syncs, capturing immutable `author_id` for longitudinal analysis, and enabling change-aware dirty marking +- **Phase 1 (Option A):** `lore notes` command — direct SQL query over the `notes` table with rich filtering and multiple export formats (table, JSON, JSONL, CSV) +- **Phase 2 (Option B):** Per-note documents — each non-system note becomes its own searchable document in the FTS/embedding pipeline + +All three phases are required. Phase 0 gives stable identity for reliable document tracking; Phase 1 gives structured data extraction; Phase 2 gives semantic search. + +## Non-Goals + +- Changing existing discussion document behavior (those remain as-is) +- Adding a "reviewer profile" report command (that's a downstream use case built on this infrastructure) +- Modifying the ingestion pipeline from GitLab (data is already captured — Phase 0 only changes local storage strategy) +- Adding pagination/cursor support (no existing list command has this; high `--limit` covers the year-long analysis use case) +- Feature flag gating (no external users; single-user CLI in early dev) + +--- + +## Phase 0: Stable Note Identity + +### Rationale + +Issue discussion note ingestion currently uses a delete/reinsert pattern (`DELETE FROM notes WHERE discussion_id = ?` then re-insert). 
This makes `notes.id` (the local row ID) unstable across syncs — every sync assigns new IDs to the same notes. MR discussion notes already use an upsert pattern (`ON CONFLICT(gitlab_id) DO UPDATE`), producing stable IDs. + +Phase 2 depends on `notes.id` as the `source_id` for note documents. Unstable IDs would cause: +- Unnecessary document churn (old ID deleted, new ID created for identical content) +- Stale document accumulation (orphaned documents from old IDs) +- Wasted regeneration cycles on every sync + +Unifying both paths to upsert+sweep gives stable identity, enables change-aware dirty marking, and reduces sync overhead. + +### Work Chunk 0A: Upsert/Sweep for Issue Discussion Notes + +**Files:** `src/ingestion/discussions.rs` + +**Depends on:** Nothing (standalone) + +**Context:** `src/ingestion/mr_discussions.rs` already has `upsert_note()` (line ~470) which uses `ON CONFLICT(gitlab_id) DO UPDATE` and `sweep_stale_notes()` (line ~551) which deletes notes with `last_seen_at < run_seen_at`. The `notes` table already has `UNIQUE(gitlab_id)`. We need to bring the issue discussion path to the same pattern. 
+ +#### Tests to Write First + +```rust +#[test] +fn test_issue_note_upsert_stable_id() { + // Insert a discussion with 2 notes via the new upsert path + // Record their local IDs + // "Re-sync" the same notes (same gitlab_ids, same content) + // Assert: local IDs are unchanged +} + +#[test] +fn test_issue_note_upsert_detects_body_change() { + // Insert note with body "old text" + // Re-sync same gitlab_id with body "new text" + // Assert: upsert returns changed_semantics = true + // Assert: local ID is unchanged +} + +#[test] +fn test_issue_note_upsert_unchanged_returns_false() { + // Insert note, re-sync identical note + // Assert: upsert returns changed_semantics = false +} + +#[test] +fn test_issue_note_upsert_updated_at_only_does_not_mark_semantic_change() { + // Insert note with body "text", updated_at = 1000 + // Re-sync same gitlab_id with body "text", updated_at = 2000 + // Assert: upsert returns changed_semantics = false + // Assert: updated_at is updated in the DB (housekeeping fields always refresh) +} + +#[test] +fn test_issue_note_sweep_removes_stale() { + // Insert 3 notes for a discussion + // Re-sync with only 2 of the 3 (different last_seen_at) + // Run sweep + // Assert: stale note is deleted, 2 remain +} + +#[test] +fn test_issue_note_upsert_returns_local_id() { + // Insert a note via upsert + // Assert: returned local_id matches conn.last_insert_rowid() + // Or for update path: matches the existing row's id +} +``` + +#### Implementation + +**1. Create shared `NoteUpsertOutcome` struct** (in `src/ingestion/discussions.rs` or a shared module): + +```rust +pub struct NoteUpsertOutcome { + pub local_note_id: i64, + pub changed_semantics: bool, +} +``` + +**2. 
Refactor `insert_note()` → `upsert_note_for_issue()`:** + +Replace the current `DELETE FROM notes WHERE discussion_id = ?` + loop insert pattern (lines 132-139) with: + +```rust +for note in &normalized_notes { + let outcome = upsert_note_for_issue(&tx, local_discussion_id, note, last_seen_at)?; + // outcome.local_note_id and outcome.changed_semantics available for Phase 2 +} +// After loop: sweep stale notes for this discussion +sweep_stale_issue_notes(&tx, local_discussion_id, last_seen_at)?; +``` + +The upsert SQL follows the MR pattern: +```sql +INSERT INTO notes (gitlab_id, discussion_id, project_id, author_username, body, note_type, + is_system, created_at, updated_at, last_seen_at, ...) +VALUES (?1, ?2, ?3, ...) +ON CONFLICT(gitlab_id) DO UPDATE SET + body = excluded.body, + note_type = excluded.note_type, + updated_at = excluded.updated_at, + last_seen_at = excluded.last_seen_at, + ... +``` + +**Change detection:** Semantic change is computed separately from housekeeping updates. The upsert always updates persistence fields (`updated_at`, `last_seen_at`), but `changed_semantics` is derived only from fields that affect note documents and search filters: + +```sql +ON CONFLICT(gitlab_id) DO UPDATE SET + body = excluded.body, + note_type = excluded.note_type, + updated_at = excluded.updated_at, + last_seen_at = excluded.last_seen_at, + resolved = excluded.resolved, + resolved_by = excluded.resolved_by, + position_new_path = excluded.position_new_path, + position_new_line = excluded.position_new_line, + position_old_path = excluded.position_old_path, + position_old_line = excluded.position_old_line, + ... 
+``` + +Then detect semantic change with a separate check that excludes `updated_at` and `last_seen_at` (housekeeping-only fields): + +```sql +WHERE notes.body IS NOT excluded.body + OR notes.note_type IS NOT excluded.note_type + OR notes.author_username IS NOT excluded.author_username + OR notes.resolved IS NOT excluded.resolved + OR notes.resolved_by IS NOT excluded.resolved_by + OR notes.position_new_path IS NOT excluded.position_new_path + OR notes.position_new_line IS NOT excluded.position_new_line +``` + +**Why `author_username` is semantic:** Note documents embed the username in both the content header (`author: @{author}`) and the title (`Note by @{author} on Issue #42`). If a GitLab user changes their username (e.g., `jdefting` -> `jd-engineering`), the existing note documents become stale — search results show the old username, inconsistent with what the API returns. Treating username changes as semantic ensures documents stay accurate. + +**Note:** `author_id` changes do NOT trigger `changed_semantics`. The `author_id` is an immutable identity anchor — it never changes in practice, and even if it did (data migration), it doesn't affect document content. + +**Rationale:** `updated_at` changes alone (e.g., GitLab touching the timestamp without modifying content) should NOT trigger document regeneration. This avoids unnecessary dirty queue churn on large datasets. The WHERE clause fires the DO UPDATE unconditionally (to refresh `last_seen_at`), and `changed_semantics` is derived from `conn.changes()` after a second query that checks only semantic fields: + +```rust +// Two-step: always upsert (refreshes housekeeping), then check semantic change +let upserted = /* run upsert SQL */; +let local_id = conn.query_row("SELECT id FROM notes WHERE gitlab_id = ?", [gitlab_id], |r| r.get(0))?; +let changed = conn.query_row( + "SELECT COUNT(*) FROM notes WHERE id = ? AND (body IS NOT ? OR note_type IS NOT ? 
OR ...)", + params![local_id, body, note_type, ...], + |r| r.get::<_, i64>(0), +)? == 0 && /* was an update, not insert */; +``` + +Actually, simpler approach: use `conn.changes()` from the initial upsert (which always runs the SET clause), then separately track whether the note already existed: + +```rust +// Check if note exists before upsert +let existed = conn.query_row( + "SELECT id, body, note_type, author_username, resolved, resolved_by, position_new_path, position_new_line FROM notes WHERE gitlab_id = ?", + [gitlab_id], + |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<String>>(1)?, /* ... */)), +).optional()?; + +// Run upsert (always updates housekeeping fields) +conn.execute(upsert_sql, params![...])?; + +let local_id = match &existed { + Some((id, ..)) => *id, + None => conn.last_insert_rowid(), +}; + +let changed_semantics = match &existed { + None => true, // New insert = always changed + Some((_, old_body, old_note_type, old_author_username, old_resolved, old_resolved_by, old_path, old_line)) => { + old_body.as_deref() != body || old_note_type.as_deref() != note_type || old_author_username.as_deref() != author_username || /* ... */ + } +}; +``` + +This pre-read approach is clearest and avoids any SQLite edge cases with `changes()` counting. The pre-read is a single-row lookup on the UNIQUE(gitlab_id) index — negligible cost. + +**3. Also update `upsert_note()` in `src/ingestion/mr_discussions.rs`** to return `NoteUpsertOutcome` instead of `Result<()>`. Same semantic-change-only detection (exclude `updated_at`). + +**4. Sweep function for issue notes:** + +```rust +fn sweep_stale_issue_notes(conn: &Connection, discussion_id: i64, last_seen_at: i64) -> Result<()> { + conn.execute( + "DELETE FROM notes WHERE discussion_id = ? 
AND last_seen_at < ?", + rusqlite::params![discussion_id, last_seen_at], + )?; + Ok(()) +} +``` + +--- + +### Work Chunk 0B: Immediate Deletion Propagation + +**Files:** `src/ingestion/discussions.rs`, `src/ingestion/mr_discussions.rs` + +**Depends on:** Work Chunk 0A (uses sweep functions from 0A), Work Chunk 2A (documents table must accept `source_type='note'`) + +**Context:** When sweep deletes stale notes, the current plan relies on eventual cleanup via `generate-docs --full` for orphaned note documents. This creates a window where deleted notes still appear in search results, eroding trust in the dataset. Instead, propagate deletion to documents immediately in the same transaction. + +#### Tests to Write First + +```rust +#[test] +fn test_issue_note_sweep_deletes_note_documents_immediately() { + // Setup: project, issue, discussion, 3 non-system notes + // Generate note documents for all 3 + // Re-sync with only 2 of the 3 notes (different last_seen_at) + // Run sweep + // Assert: stale note row is deleted + // Assert: stale note's document is deleted from documents table + // Assert: stale note's dirty_sources entry (if any) is deleted + // Assert: remaining 2 notes' documents are untouched +} + +#[test] +fn test_mr_note_sweep_deletes_note_documents_immediately() { + // Same pattern as above but for MR discussion notes +} + +#[test] +fn test_sweep_deletion_handles_note_without_document() { + // Setup: note exists but was never turned into a document (e.g., system note) + // Sweep deletes the note + // Assert: no error (DELETE WHERE on non-existent document is a no-op) +} + +#[test] +fn test_set_based_deletion_atomicity() { + // Setup: project, issue, discussion, 5 non-system notes with documents + // Mark 3 as stale (different last_seen_at) + // Run sweep + // Assert: exactly 3 note rows deleted + // Assert: exactly 3 documents deleted + // Assert: exactly 3 dirty_sources entries deleted (if any existed) + // Assert: remaining 2 note rows, documents, and 
dirty_sources untouched +} +``` + +#### Implementation + +Update both sweep functions to propagate deletion to documents and dirty_sources using **set-based SQL** (not per-note loops). This is both faster on large threads and simpler to reason about atomically: + +```rust +fn sweep_stale_issue_notes(conn: &Connection, discussion_id: i64, last_seen_at: i64) -> Result<()> { + // Set-based: identify stale non-system note IDs, delete their documents + // and dirty_sources entries, then delete the note rows themselves. + // All in one transaction scope — no per-note loop needed. + + // Step 1: Delete documents for stale non-system notes (cascades to + // document_labels and document_paths via ON DELETE CASCADE; + // FTS trigger documents_ad auto-removes FTS entry) + conn.execute( + "DELETE FROM documents WHERE source_type = 'note' AND source_id IN ( + SELECT id FROM notes + WHERE discussion_id = ? AND last_seen_at < ? AND is_system = 0 + )", + rusqlite::params![discussion_id, last_seen_at], + )?; + + // Step 2: Delete dirty_sources entries for stale non-system notes + conn.execute( + "DELETE FROM dirty_sources WHERE source_type = 'note' AND source_id IN ( + SELECT id FROM notes + WHERE discussion_id = ? AND last_seen_at < ? AND is_system = 0 + )", + rusqlite::params![discussion_id, last_seen_at], + )?; + + // Step 3: Delete all stale note rows (system and non-system) + conn.execute( + "DELETE FROM notes WHERE discussion_id = ? AND last_seen_at < ?", + rusqlite::params![discussion_id, last_seen_at], + )?; + + Ok(()) +} +``` + +Same pattern for `sweep_stale_notes()` in `src/ingestion/mr_discussions.rs`. + +**Note:** The document DELETE cascades to `document_labels` and `document_paths` via ON DELETE CASCADE. The FTS trigger (`documents_ad`) automatically removes the FTS entry. No additional cleanup needed. + +**Why set-based:** The subquery `SELECT id FROM notes WHERE discussion_id = ? AND last_seen_at < ? 
AND is_system = 0` runs once per step, served by an index on `notes(discussion_id)` — note that the `UNIQUE(gitlab_id)` index does not cover this predicate, so verify such an index exists (add one if missing). This is O(1) SQL statements regardless of how many stale notes exist, vs O(N) individual DELETE statements in a loop. On large threads (100+ notes), this is measurably faster and avoids the risk of partial completion if the loop is interrupted. + +**Defense-in-depth:** Work Chunk 2A's migration also creates DB-level cleanup triggers (`notes_ad_cleanup`, `notes_au_system_cleanup`) that fire on ANY note deletion/system-flip, not just sweep. The sweep functions handle the common path with explicit set-based SQL; the triggers are a safety net for any future code path that deletes notes outside the sweep functions. Both mechanisms coexist — the explicit SQL in sweep is preferred (clearer intent, predictable cost), and the triggers catch edge cases. + +--- + +### Work Chunk 0C: Sweep Safety Guard (Partial Fetch Protection) + +**Files:** `src/ingestion/discussions.rs`, `src/ingestion/mr_discussions.rs` + +**Depends on:** Work Chunk 0A (modifies the sweep call site from 0A) + +**Context:** The sweep-based deletion pattern (delete notes where `last_seen_at < run_seen_at`) is correct only when a discussion's notes were fully fetched from GitLab. If a page fails mid-fetch (network timeout, rate limit, partial API response), the current logic would incorrectly delete valid notes that simply weren't seen during the incomplete fetch. This is especially dangerous for long threads with many notes that span multiple API pages. 
+ +#### Tests to Write First + +```rust +#[test] +fn test_partial_fetch_does_not_sweep_notes() { + // Setup: project, issue, discussion, 5 notes already in DB + // Simulate a partial fetch: only 2 of 5 notes returned + // (set last_seen_at for 2 notes to current run, 3 to previous run) + // Call the ingestion function with fetch_complete = false + // Assert: all 5 notes still exist (sweep was skipped) + // Assert: the 2 re-synced notes have updated last_seen_at +} + +#[test] +fn test_complete_fetch_runs_sweep_normally() { + // Setup: project, issue, discussion, 5 notes + // Simulate a complete fetch: all 5 notes returned + // Call the ingestion function with fetch_complete = true + // Assert: sweep runs normally (no stale notes in this case) +} + +#[test] +fn test_partial_fetch_then_complete_fetch_cleans_up() { + // Setup: project, issue, discussion, 5 notes + // First sync: partial fetch (3 of 5), sweep skipped + // Second sync: complete fetch (only 3 notes exist on GitLab now) + // Assert: sweep runs and removes the 2 notes no longer on GitLab +} +``` + +#### Implementation + +Add a `fetch_complete` parameter to the discussion ingestion functions. Only run the stale-note sweep when the fetch completed successfully: + +```rust +// In the discussion ingestion loop, after upserting all notes: +if fetch_complete { + sweep_stale_issue_notes(&tx, local_discussion_id, last_seen_at)?; +} else { + tracing::warn!( + discussion_id = local_discussion_id, + "Skipping stale note sweep due to partial/incomplete fetch" + ); +} +``` + +**Determining `fetch_complete`:** The discussion notes come from the GitLab API response. When the API returns all notes for a discussion in a single response (no pagination error, no timeout), `fetch_complete = true`. When the fetch encounters a network error, rate limit, or is interrupted, `fetch_complete = false`. 
The exact signaling mechanism depends on how the existing ingestion pipeline handles partial API responses — look at the MR discussion ingestion path for the existing pattern. + +**Note:** This is a safety guard, not a completeness guarantee. The sweep will still run on the next successful full fetch. The guard prevents data loss during transient failures, not during permanent API changes. + +--- + +### Work Chunk 0D: Immutable Author Identity Capture + +**Files:** `src/ingestion/discussions.rs`, `src/ingestion/mr_discussions.rs` + +**Depends on:** Work Chunk 0A (modifies the upsert functions from 0A) + +**Context:** The core use case is year-scale reviewer profiling ("search through jdefting's code review comments over the past year"). GitLab usernames are mutable — a user can change their username at any time. If a reviewer changes their username from `jdefting` to `jd-engineering` mid-year, author-based queries fragment their identity into two separate result sets. The `notes` table already captures `author_username` from the API response, but this only reflects the username at ingestion time. + +GitLab note payloads include `note.author.id` (an immutable integer). Capturing this alongside the username provides a stable identity anchor for longitudinal analysis, even across username changes. + +**Scope:** This chunk adds the column and populates it during ingestion. A `--author-id` CLI filter for `lore notes` is wired up in Phase 1 (Work Chunk 1A/1B) to make the immutable identity immediately usable for the core longitudinal analysis use case. The value here is data capture and query foundation: once `author_id` is stored, it can never be retroactively recovered if we don't capture it now. 
+ +#### Tests to Write First + +```rust +#[test] +fn test_issue_note_upsert_captures_author_id() { + // Insert a note with author_id = 12345 + // Assert: notes.author_id == 12345 + // Assert: notes.author_username == "jdefting" +} + +#[test] +fn test_mr_note_upsert_captures_author_id() { + // Same pattern for MR notes +} + +#[test] +fn test_note_upsert_author_id_nullable() { + // Insert a note with author_id = None (older API responses may lack this) + // Assert: notes.author_id IS NULL + // Assert: no error (column is nullable) +} + +#[test] +fn test_note_author_id_survives_username_change() { + // Insert note with author_username = "jdefting", author_id = 12345 + // Re-upsert same gitlab_id with author_username = "jd-engineering", author_id = 12345 + // Assert: author_id unchanged (12345) + // Assert: author_username updated to "jd-engineering" + // Assert: changed_semantics = true (username is embedded in document content/title) +} +``` + +#### Implementation + +**1. Migration** — Add `author_id` column to `notes` table. This goes in migration 022 (combined with the query index migration from Work Chunk 1E to avoid an extra migration): + +Add to the query index migration SQL: + +```sql +-- Add immutable author identity column (nullable for backcompat with pre-existing notes) +ALTER TABLE notes ADD COLUMN author_id INTEGER; + +-- Composite index for author_id lookups — used by `lore notes --author-id` +-- for immutable identity queries. Includes project_id and created_at for +-- the common "all notes by this person in this project" pattern. +CREATE INDEX IF NOT EXISTS idx_notes_project_author_id_created +ON notes(project_id, author_id, created_at DESC, id DESC) +WHERE is_system = 0 AND author_id IS NOT NULL; +``` + +**2. Populate `author_id` during upsert** — In both `upsert_note_for_issue()` (discussions.rs) and `upsert_note()` (mr_discussions.rs), add `author_id` to the INSERT and ON CONFLICT DO UPDATE SET clauses. 
Extract from the GitLab API note payload's `author.id` field. + +**3. Semantic change detection** — `author_id` changes should NOT trigger `changed_semantics = true`. The `author_id` is an identity anchor, not a content field. It's excluded from the semantic change comparison alongside `updated_at` and `last_seen_at`. However, `author_username` changes DO trigger `changed_semantics = true` because the username appears in document content and title (see Work Chunk 0A semantic detection). + +**4. Note document extraction** — Work Chunk 2C's `extract_note_document()` function includes both `author_username` (in the document content header and title) and `author_id` (in the metadata header). The `author_id` field enables downstream tools to reliably identify the same person even after username changes. + +--- + +## Phase 1: `lore notes` Command + +### Work Chunk 1A: Data Types & Query Layer + +**Files:** `src/cli/commands/list.rs` + +**Context:** This file contains `IssueListRow`, `MrListRow`, their JSON counterparts, `ListFilters`, `MrListFilters`, and the `query_issues()`/`query_mrs()` functions. The new code follows these exact patterns. + +**Depends on:** Nothing (standalone) + +#### Tests to Write First + +Add to `src/cli/commands/list.rs` in the `#[cfg(test)] mod tests` block. The test DB setup requires the `notes` and `discussions` tables — reuse the patterns from `src/documents/extractor.rs::setup_discussion_test_db()`. 
+ +```rust +// Test helper — create in-memory DB with projects, issues, MRs, discussions, notes tables +// Pattern: same as extractor.rs::setup_discussion_test_db() but also include +// merge_requests, mr_labels, issue_labels, labels tables + +#[test] +fn test_query_notes_empty_db() { + // Setup DB with no notes + // Call query_notes with default NoteListFilters + // Assert: total_count == 0, notes.is_empty() +} + +#[test] +fn test_query_notes_returns_user_notes_only() { + // Insert 2 user notes and 1 system note into same discussion + // Call query_notes with default filters (no_system = true by default) + // Assert: returns 2 notes, system note excluded +} + +#[test] +fn test_query_notes_include_system() { + // Insert 2 user notes and 1 system note + // Call query_notes with include_system = true + // Assert: returns 3 notes +} + +#[test] +fn test_query_notes_filter_author() { + // Insert notes from "alice" and "bob" + // Call query_notes with author = Some("alice") + // Assert: only alice's notes returned +} + +#[test] +fn test_query_notes_filter_author_strips_at() { + // Call query_notes with author = Some("@alice") + // Assert: still matches "alice" (@ prefix stripped) +} + +#[test] +fn test_query_notes_filter_author_case_insensitive() { + // Insert note from "Alice" (capital A) + // Call query_notes with author = Some("alice") + // Assert: matches (COLLATE NOCASE) +} + +#[test] +fn test_query_notes_filter_author_id() { + // Insert notes from author_id = 100 (username "alice") and author_id = 200 (username "bob") + // Call query_notes with author_id = Some(100) + // Assert: only alice's notes returned (by immutable identity) +} + +#[test] +fn test_query_notes_filter_author_id_and_author_combined() { + // Insert notes from author_id=100/username="alice" and author_id=100/username="alice-renamed" + // Call query_notes with author_id = Some(100), author = Some("alice") + // Assert: only notes where BOTH match (AND semantics) — returns alice's notes before rename 
+} + +#[test] +fn test_query_notes_filter_note_type() { + // Insert notes with note_type = Some("DiffNote") and Some("DiscussionNote") and None + // Call query_notes with note_type = Some("DiffNote") + // Assert: only DiffNote notes returned +} + +#[test] +fn test_query_notes_filter_project() { + // Insert 2 projects, notes in each + // Call query_notes with project = Some("group/project-one") + // Assert: only project-one notes returned (uses resolve_project()) +} + +#[test] +fn test_query_notes_filter_project_uses_default() { + // Insert 2 projects, notes in each + // Call query_notes with project = None, config.default_project = Some("group/project-one") + // Assert: only project-one notes returned when for_issue_iid or for_mr_iid is set +} + +#[test] +fn test_query_notes_filter_since() { + // Insert notes at created_at = 1000, 2000, 3000 + // Call with since cutoff that excludes the first + // Assert: only notes after cutoff returned +} + +#[test] +fn test_query_notes_filter_until() { + // Insert notes at created_at = 1000, 2000, 3000 + // Call with until cutoff that excludes the last + // Assert: only notes before cutoff returned +} + +#[test] +fn test_query_notes_filter_since_and_until_combined() { + // Insert notes at created_at = 1000, 2000, 3000, 4000 + // Call with since=1500, until=3500 + // Assert: only notes at 2000 and 3000 returned +} + +#[test] +fn test_query_notes_invalid_time_window_rejected() { + // Call with since pointing to a time AFTER until + // (e.g., since = "30d", until = "90d" — 30 days ago is after 90 days ago) + // Assert: returns a clear error, not an empty result set +} + +#[test] +fn test_query_notes_until_date_uses_end_of_day() { + // Insert notes at various times on 2025-06-15 + // Call with until = "2025-06-15" + // Assert: all notes on that day are included (end-of-day, not start-of-day) +} + +#[test] +fn test_query_notes_filter_contains() { + // Insert notes with body "This function is too complex" and "LGTM" + // Call with 
contains = Some("complex") + // Assert: only the first note returned +} + +#[test] +fn test_query_notes_filter_contains_case_insensitive() { + // Insert note with body "Use EXPECT instead of unwrap" + // Call with contains = Some("expect") + // Assert: matches (COLLATE NOCASE) +} + +#[test] +fn test_query_notes_filter_contains_escapes_like_wildcards() { + // Insert notes with body "100% coverage" and "100 tests" + // Call with contains = Some("100%") + // Assert: only "100% coverage" returned (% is literal, not wildcard) +} + +#[test] +fn test_query_notes_filter_path() { + // Insert DiffNotes on "src/auth.rs" and "src/config.rs" + // Call with path = Some("src/auth.rs") + // Assert: only auth.rs notes returned +} + +#[test] +fn test_query_notes_filter_path_prefix() { + // Insert DiffNotes on "src/auth/login.rs" and "test/auth_test.rs" + // Call with path = Some("src/") (trailing slash = prefix) + // Assert: only src/ notes returned +} + +#[test] +fn test_query_notes_filter_for_issue_requires_project() { + // Insert issue with iid=42 in project-one, same iid=42 in project-two + // Call with for_issue_iid = Some(42), project = Some("group/project-one") + // Assert: only notes from project-one's issue #42 +} + +#[test] +fn test_query_notes_filter_for_mr_requires_project() { + // Insert MR with iid=10 in project-one, same iid=10 in project-two + // Call with for_mr_iid = Some(10), project = Some("group/project-one") + // Assert: only notes from project-one's MR !10 +} + +#[test] +fn test_query_notes_filter_for_issue_uses_default_project() { + // Insert issue with iid=42 in project-one + // Call with for_issue_iid = Some(42), project = None, config.default_project = Some("group/project-one") + // Assert: resolves via defaultProject fallback — returns notes from project-one's issue #42 +} + +#[test] +fn test_query_notes_filter_for_mr_uses_default_project() { + // Insert MR with iid=10 in project-one + // Call with for_mr_iid = Some(10), project = None, 
config.default_project = Some("group/project-one") + // Assert: resolves via defaultProject fallback +} + +#[test] +fn test_query_notes_filter_for_issue_without_project_context_errors() { + // Call with for_issue_iid = Some(42), project = None, no defaultProject + // Assert: returns error (IID requires project context) +} + +#[test] +fn test_query_notes_filter_resolution_unresolved() { + // Insert 2 notes: one with resolvable=1,resolved=0 and one with resolvable=1,resolved=1 + // Call with resolution = "unresolved" + // Assert: only the unresolved note returned +} + +#[test] +fn test_query_notes_filter_resolution_resolved() { + // Same setup as above + // Call with resolution = "resolved" + // Assert: only the resolved note returned +} + +#[test] +fn test_query_notes_filter_resolution_any() { + // Same setup as above + // Call with resolution = "any" (default) + // Assert: both notes returned +} + +#[test] +fn test_query_notes_sort_created_desc() { + // Insert notes with created_at = 1000, 3000, 2000 + // Call with sort = "created", order = "desc" + // Assert: notes ordered 3000, 2000, 1000 +} + +#[test] +fn test_query_notes_sort_created_asc() { + // Same data, order = "asc" + // Assert: ordered 1000, 2000, 3000 +} + +#[test] +fn test_query_notes_deterministic_tiebreak() { + // Insert 3 notes with identical created_at timestamps + // Call twice with same sort/order + // Assert: order is identical both times (n.id tiebreak) +} + +#[test] +fn test_query_notes_limit() { + // Insert 10 notes + // Call with limit = 3 + // Assert: notes.len() == 3, total_count == 10 +} + +#[test] +fn test_query_notes_combined_filters() { + // Insert notes from multiple authors, types, projects, paths + // Call with author + note_type + project + since combined + // Assert: intersection of all filters +} + +#[test] +fn test_query_notes_filter_note_id_exact() { + // Insert 3 notes with known local IDs + // Call with note_id = Some(2) + // Assert: only the note with local id 2 returned +} + 
+#[test]
+fn test_query_notes_filter_gitlab_note_id_exact() {
+    // Insert notes with gitlab_id = 12345 and gitlab_id = 67890
+    // Call with gitlab_note_id = Some(12345)
+    // Assert: only the note with gitlab_id 12345 returned
+}
+
+#[test]
+fn test_query_notes_filter_discussion_id_exact() {
+    // Insert 2 discussions, each with 2 notes
+    // Call with discussion_id = Some(1)
+    // Assert: only notes from discussion 1 returned
+}
+
+#[test]
+fn test_note_list_row_json_conversion() {
+    // Create NoteListRow with known ms timestamps
+    // Convert to NoteListRowJson
+    // Assert: created_at_iso and updated_at_iso are correct ISO strings
+    // Assert: all fields carry over
+}
+```
+
+#### Implementation
+
+**Data structures** (add in `src/cli/commands/list.rs` after `MrListResultJson`):
+
+```rust
+// NoteListRow — raw query result, ms timestamps
+pub struct NoteListRow {
+    pub id: i64,                    // notes.id (local)
+    pub gitlab_id: i64,             // notes.gitlab_id
+    pub author_username: Option<String>,
+    pub body: Option<String>,
+    pub note_type: Option<String>,  // "DiffNote" | "DiscussionNote" | null
+    pub is_system: bool,
+    pub created_at: i64,
+    pub updated_at: i64,
+    pub position_new_path: Option<String>,
+    pub position_new_line: Option<i64>,
+    pub position_old_path: Option<String>,
+    pub position_old_line: Option<i64>,
+    pub resolvable: bool,
+    pub resolved: bool,
+    pub resolved_by: Option<String>,
+    pub noteable_type: String,      // "Issue" | "MergeRequest"
+    pub parent_iid: i64,            // parent issue/MR iid
+    pub parent_title: Option<String>,
+    pub project_path: String,
+}
+
+// NoteListRowJson — ISO timestamps, serde for JSON output
+pub struct NoteListRowJson { ... } // with created_at_iso, updated_at_iso
+
+impl From<&NoteListRow> for NoteListRowJson { ... }
+
+// NoteListResult
+pub struct NoteListResult {
+    pub notes: Vec<NoteListRow>,
+    pub total_count: usize,
+}
+
+// NoteListResultJson
+pub struct NoteListResultJson {
+    pub notes: Vec<NoteListRowJson>,
+    pub total_count: usize,
+    pub showing: usize,
+}
+
+impl From<&NoteListResult> for NoteListResultJson { ... }
+```
+
+**Filter struct:**
+
+```rust
+pub struct NoteListFilters<'a> {
+    pub limit: usize,
+    pub project: Option<&'a str>,
+    pub author: Option<&'a str>,      // display-name filter, case-insensitive via COLLATE NOCASE
+    pub author_id: Option<i64>,       // immutable identity filter (exact match)
+    pub note_type: Option<&'a str>,   // "DiffNote" | "DiscussionNote"
+    pub include_system: bool,         // default false
+    pub for_issue_iid: Option<i64>,   // filter by parent issue iid
+    pub for_mr_iid: Option<i64>,      // filter by parent MR iid
+    pub note_id: Option<i64>,         // filter by local note row id (exact)
+    pub gitlab_note_id: Option<i64>,  // filter by GitLab note id (exact)
+    pub discussion_id: Option<i64>,   // filter by local discussion id (exact)
+    pub since: Option<&'a str>,
+    pub until: Option<&'a str>,       // end of time window
+    pub path: Option<&'a str>,        // exact or prefix (trailing /)
+    pub contains: Option<&'a str>,    // case-insensitive body substring match
+    pub resolution: &'a str,          // "any" (default) | "unresolved" | "resolved"
+    pub sort: &'a str,                // "created" (default) | "updated"
+    pub order: &'a str,               // "desc" (default) | "asc"
+}
+```
+
+**Query function** `query_notes(conn, filters) -> Result<NoteListResult>`:
+
+**Time window parsing:** Parse `since` and `until` relative to a single anchored `now_ms` captured once at the start of `query_notes()`. This prevents subtle drift if parsing happens at different times (e.g., across midnight boundary). When `--until` is a date string (`YYYY-MM-DD`), interpret as end-of-day (`23:59:59.999 UTC`) so that `--until 2025-06-15` includes all events on June 15th, not just those before midnight. 
After both values are parsed, validate `since_ms <= until_ms` — if the window is inverted (e.g., `--since 30d --until 90d`, which means "30 days ago to 90 days ago" — an inverted range), return a clear error: + +```rust +let now_ms = Utc::now().timestamp_millis(); + +let since_ms = filters.since.map(|s| parse_since_with_anchor(s, now_ms)).transpose()?; +let until_ms = filters.until.map(|s| parse_until_with_anchor(s, now_ms)).transpose()?; + +if let (Some(s), Some(u)) = (since_ms, until_ms) { + if s > u { + return Err(LoreError::Usage(format!( + "Invalid time window: --since ({}) is after --until ({}). \ + Did you mean --since {} --until {}?", + format_iso(s), format_iso(u), + filters.until.unwrap(), filters.since.unwrap(), + ))); + } +} +``` + +**`parse_until_with_anchor`** differs from `parse_since_with_anchor` in one way: when the input is a `YYYY-MM-DD` date, it returns end-of-day (23:59:59.999 UTC) instead of start-of-day. For relative formats (`7d`, `2w`, `1m`), it behaves identically to `parse_since_with_anchor`. + +Core SQL shape: + +```sql +SELECT + n.id, n.gitlab_id, n.author_username, n.body, n.note_type, + n.is_system, n.created_at, n.updated_at, + n.position_new_path, n.position_new_line, + n.position_old_path, n.position_old_line, + n.resolvable, n.resolved, n.resolved_by, + d.noteable_type, + COALESCE(i.iid, m.iid) AS parent_iid, + COALESCE(i.title, m.title) AS parent_title, + p.path_with_namespace +FROM notes n +JOIN discussions d ON n.discussion_id = d.id +JOIN projects p ON n.project_id = p.id +LEFT JOIN issues i ON d.issue_id = i.id +LEFT JOIN merge_requests m ON d.merge_request_id = m.id +WHERE {dynamic_filters} +ORDER BY {sort_column} {order}, n.id {order} +LIMIT ? +``` + +**Important:** The `ORDER BY` includes `n.id` as a deterministic tiebreaker. Notes with identical timestamps will always sort in the same order. This follows SQLite best practice for reproducible result sets. 
+ +Dynamic WHERE clauses follow the same `where_clauses` + `params` vec pattern as `query_issues()` (see `list.rs:287-374`). + +Filter mappings: +- `include_system = false` (default): `n.is_system = 0` +- `author`: strip `@` prefix, `n.author_username = ? COLLATE NOCASE` +- `author_id`: `n.author_id = ?` (exact immutable identity match). If both `author` and `author_id` are provided, both are applied (AND) for precision — this lets users query "notes by user 12345 when they were known as jdefting" +- `note_type`: `n.note_type = ?` +- `project`: `resolve_project(conn, project)?` then `n.project_id = ?` +- `note_id`: `n.id = ?` (exact local row ID match — useful for debugging sync correctness) +- `gitlab_note_id`: `n.gitlab_id = ?` (exact GitLab note ID match — cross-reference with GitLab API) +- `discussion_id`: `n.discussion_id = ?` (all notes in a specific discussion thread) +- `since`: parsed via `parse_since_with_anchor(since_str, now_ms)` then `n.created_at >= ?` +- `until`: parsed via `parse_until_with_anchor(until_str, now_ms)` then `n.created_at <= ?` +- `path` with trailing `/`: `n.position_new_path LIKE ? ESCAPE '\'` (use `escape_like` from `filters.rs`) +- `path` without trailing `/`: `n.position_new_path = ?` +- `resolution = "unresolved"`: `n.resolvable = 1 AND n.resolved = 0` +- `resolution = "resolved"`: `n.resolvable = 1 AND n.resolved = 1` +- `resolution = "any"`: no filter (default) +- `for_issue_iid`: requires resolved project_id (from `--project` flag or `defaultProject` config). SQL: `d.issue_id = (SELECT id FROM issues WHERE iid = ? AND project_id = ?)` — the project_id param comes from the already-resolved project context +- `for_mr_iid`: same pattern — `d.merge_request_id = (SELECT id FROM merge_requests WHERE iid = ? AND project_id = ?)` — requires resolved project_id + +**IID scoping rule:** `for_issue_iid` and `for_mr_iid` require a project context because IIDs are only unique within a project. 
The query layer validates this: if `for_issue_iid` or `for_mr_iid` is set without a resolved project_id, return an error. The project can come from either `--project` flag or `defaultProject` in config (resolved via the existing `resolve_project()` which already handles `defaultProject` fallback). Note: the CLI does NOT use clap's `requires = "project"` constraint for these flags, because that would block `defaultProject` resolution — the validation happens at the query layer instead.
+
+COUNT query first (same pattern as issues), then SELECT with LIMIT.
+
+**Public entry point:**
+
+```rust
+pub fn run_list_notes(config: &Config, filters: NoteListFilters) -> Result<NoteListResult> {
+    let db_path = get_db_path(config.storage.db_path.as_deref());
+    let conn = create_connection(&db_path)?;
+    query_notes(&conn, &filters)
+}
+```
+
+---
+
+### Work Chunk 1B: CLI Arguments & Command Wiring
+
+**Files:** `src/cli/mod.rs`, `src/main.rs`, `src/cli/commands/mod.rs`, `src/cli/robot.rs`
+
+**Depends on:** Work Chunk 1A
+
+#### Tests to Write First
+
+No unit tests for CLI arg parsing (clap handles this). Integration-level assertions:
+
+```rust
+// In src/cli/robot.rs tests (or new test module):
+
+#[test]
+fn test_expand_fields_preset_notes() {
+    let fields = vec!["minimal".to_string()];
+    let expanded = expand_fields_preset(&fields, "notes");
+    assert_eq!(expanded, vec!["id", "author_username", "body", "created_at_iso"]);
+}
+```
+
+#### Implementation
+
+**1. Add `NotesArgs` to `src/cli/mod.rs`** (after `MrsArgs`, around line 472):
+
+```rust
+#[derive(Parser)]
+#[command(after_help = "\x1b[1mExamples:\x1b[0m
+  lore notes --author jdefting --since 365d            # All of jdefting's notes in past year
+  lore notes --author jdefting --note-type DiffNote    # Only code review comments
+  lore notes --path src/auth/ --resolution unresolved  # Unresolved comments on auth code
+  lore notes --for-mr 456 -p group/repo                # All notes on MR !456
+  lore notes --since 180d --until 90d                  # Notes from 180 to 90 days ago
+  lore notes --author jdefting --format jsonl          # Stream notes for LLM analysis
+  lore notes --contains \"unwrap\" --note-type DiffNote  # Find review comments mentioning unwrap")]
+pub struct NotesArgs {
+    /// Maximum results
+    #[arg(short = 'n', long = "limit", default_value = "50", help_heading = "Output")]
+    pub limit: usize,
+
+    /// Select output fields (comma-separated, or 'minimal' preset)
+    #[arg(long, help_heading = "Output", value_delimiter = ',')]
+    pub fields: Option<Vec<String>>,
+
+    /// Output format (table, json, jsonl, csv)
+    #[arg(long, value_parser = ["table", "json", "jsonl", "csv"], default_value = "table", help_heading = "Output")]
+    pub format: String,
+
+    /// Filter by author username (case-insensitive)
+    #[arg(short = 'a', long, help_heading = "Filters")]
+    pub author: Option<String>,
+
+    /// Filter by immutable GitLab author id (stable across username changes)
+    #[arg(long = "author-id", help_heading = "Filters")]
+    pub author_id: Option<i64>,
+
+    /// Filter by note type (DiffNote, DiscussionNote)
+    #[arg(long = "note-type", value_parser = ["DiffNote", "DiscussionNote"], help_heading = "Filters")]
+    pub note_type: Option<String>,
+
+    /// Filter by case-insensitive substring in note body
+    #[arg(long, help_heading = "Filters")]
+    pub contains: Option<String>,
+
+    /// Filter by local note row id (exact match, for debugging)
+    #[arg(long = "note-id", help_heading = "Filters")]
+    pub note_id: Option<i64>,
+
+    /// Filter by GitLab note id (exact match, for cross-referencing)
+    #[arg(long = "gitlab-note-id", help_heading = "Filters")]
+    pub gitlab_note_id: Option<i64>,
+
+    /// Filter by local discussion id (all notes in a thread)
+    #[arg(long = "discussion-id", help_heading = "Filters")]
+    pub discussion_id: Option<i64>,
+
+    /// Include system-generated notes (excluded by default)
+    #[arg(long = "include-system", help_heading = "Filters", overrides_with = "no_include_system")]
+    pub include_system: bool,
+
+    #[arg(long = "no-include-system", hide = true, overrides_with = "include_system")]
+    pub no_include_system: bool,
+
+    /// Filter to notes on a specific issue IID (requires --project or defaultProject)
+    #[arg(long = "for-issue", help_heading = "Filters", conflicts_with = "for_mr")]
+    pub for_issue: Option<i64>,
+
+    /// Filter to notes on a specific MR IID (requires --project or defaultProject)
+    #[arg(long = "for-mr", help_heading = "Filters", conflicts_with = "for_issue")]
+    pub for_mr: Option<i64>,
+
+    /// Filter by project path
+    #[arg(short = 'p', long, help_heading = "Filters")]
+    pub project: Option<String>,
+
+    /// Filter by start time (7d, 2w, 1m, or YYYY-MM-DD)
+    #[arg(long, help_heading = "Filters")]
+    pub since: Option<String>,
+
+    /// Filter by end time (7d, 2w, 1m, or YYYY-MM-DD)
+    #[arg(long, help_heading = "Filters")]
+    pub until: Option<String>,
+
+    /// Filter by file path (trailing / for prefix match)
+    #[arg(long, help_heading = "Filters")]
+    pub path: Option<String>,
+
+    /// Resolution filter: any (default), unresolved, resolved
+    #[arg(long, value_parser = ["any", "unresolved", "resolved"], default_value = "any", help_heading = "Filters")]
+    pub resolution: String,
+
+    /// Sort field (created, updated)
+    #[arg(long, value_parser = ["created", "updated"], default_value = "created", help_heading = "Sorting")]
+    pub sort: String,
+
+    /// Sort ascending (default: descending)
+    #[arg(long, help_heading = "Sorting", overrides_with = "no_asc")]
+    pub asc: bool,
+
+    #[arg(long = "no-asc", hide = true, overrides_with = "asc")]
+    pub no_asc: bool,
+}
+```
+
+**Note on `--for-issue` / `--for-mr`:** These flags do NOT use clap's `requires = "project"` constraint. The `defaultProject` config option provides the project context without the `--project` flag being explicitly passed. Validation happens at the query layer (Work Chunk 1A) — if neither `--project` nor `defaultProject` resolves a project, the query returns a clear error.
+
+**2. Add `Notes` variant to `Commands` enum** in `src/cli/mod.rs` (around line 113):
+
+```rust
+/// List discussion notes with filtering
+Notes(NotesArgs),
+```
+
+**3. Add `"notes"` minimal preset to `expand_fields_preset()`** in `src/cli/robot.rs` (around line 42):
+
+```rust
+"notes" => ["id", "author_username", "body", "created_at_iso"]
+    .iter()
+    .map(|s| (*s).to_string())
+    .collect(),
+```
+
+**4. Add handler in `src/main.rs`** (follow `handle_issues`/`handle_mrs` pattern):
+
+```rust
+fn handle_notes(config_path: Option<&str>, args: NotesArgs, robot_mode: bool) -> Result<()> {
+    let config = load_config(config_path)?;
+    let start = std::time::Instant::now();
+
+    let filters = NoteListFilters {
+        limit: args.limit,
+        project: args.project.as_deref(),
+        author: args.author.as_deref(),
+        author_id: args.author_id,
+        note_type: args.note_type.as_deref(),
+        include_system: args.include_system,
+        for_issue_iid: args.for_issue,
+        for_mr_iid: args.for_mr,
+        note_id: args.note_id,
+        gitlab_note_id: args.gitlab_note_id,
+        discussion_id: args.discussion_id,
+        since: args.since.as_deref(),
+        until: args.until.as_deref(),
+        path: args.path.as_deref(),
+        contains: args.contains.as_deref(),
+        resolution: &args.resolution,
+        sort: &args.sort,
+        order: if args.asc { "asc" } else { "desc" },
+    };
+
+    // JSONL and CSV use streaming path (no full materialization in memory)
+    // Table and JSON use buffered path (need total_count for envelope/summary)
+    match (robot_mode, args.format.as_str()) {
+        (_, "jsonl") => {
+            let conn = open_db(&config)?;
+            print_list_notes_jsonl_stream(&conn, &filters)?;
+        }
+        (_, "csv") => {
+            let conn = open_db(&config)?;
+            print_list_notes_csv_stream(&conn, &filters)?;
+        }
+        _ => {
+            let result = run_list_notes(&config, filters)?;
+            match (robot_mode, args.format.as_str()) {
+                (true, _) | (_, "json") => {
+                    print_list_notes_json(&result, start.elapsed().as_millis() as u64, args.fields.as_deref());
+                }
+                _ => {
+                    print_list_notes(&result);
+                }
+            }
+        }
+    }
+    Ok(())
+}
+```
+
+Add dispatch in main match (around line 175):
+
+```rust
+Some(Commands::Notes(args)) => handle_notes(cli.config.as_deref(), args, robot_mode),
+```
+
+**5. Re-export in `src/cli/commands/mod.rs`:**
+
+```rust
+pub use list::{run_list_notes, print_list_notes, print_list_notes_json, print_list_notes_jsonl_stream, print_list_notes_csv_stream};
+```
+
+---
+
+### Work Chunk 1C: Human & Robot Output Formatting
+
+**Files:** `src/cli/commands/list.rs`
+
+**Depends on:** Work Chunk 1A
+
+#### Tests to Write First
+
+```rust
+#[test]
+fn test_truncate_note_body() {
+    // Body with 200 chars should truncate to 80 chars total, ending with "..." 
+ let body = "x".repeat(200); + let truncated = truncate_with_ellipsis(&body, 80); + assert_eq!(truncated.len(), 80); + assert!(truncated.ends_with("...")); +} + +#[test] +fn test_csv_output_roundtrip() { + // NoteListRow with body containing commas, quotes, newlines, and multi-byte chars + // Write via print_list_notes_csv, parse back with csv::ReaderBuilder + // Assert: all fields roundtrip correctly +} + +#[test] +fn test_jsonl_output_one_per_line() { + // NoteListResult with 3 notes + // Capture stdout, split by newline + // Assert: each line parses as valid JSON + // Assert: 3 lines total +} +``` + +#### Implementation + +**`print_list_notes(result: &NoteListResult)`** — human-readable table: + +Table columns: `ID | Author | Type | Body (truncated 60) | Path:Line | Parent | Created` + +- ID: `colored_cell(note.gitlab_id, Color::Cyan)` +- Author: `colored_cell(format!("@{}", author), Color::Magenta)` +- Type: "Diff" or "Disc" or "-" (colored) +- Body: first line, truncated to 60 chars +- Path:Line: `position_new_path:position_new_line` or "-" +- Parent: `Issue #42` or `MR !456` (from noteable_type + parent_iid) +- Created: `format_relative_time(created_at)` + +**`print_list_notes_json(result, elapsed_ms, fields)`** — robot JSON: + +Follows exact envelope pattern: + +```json +{ + "ok": true, + "data": { + "notes": [...], + "total_count": N, + "showing": M + }, + "meta": { "elapsed_ms": U64 } +} +``` + +Supports `--fields` via `filter_fields(&mut output, "notes", &expanded)`. + +**`print_list_notes_jsonl` / `print_list_notes_csv`** — streaming output: + +For JSONL and CSV formats, use a **streaming path** that writes rows directly to stdout as they're read from the database, avoiding full materialization in memory. This matters for the year-long analysis use case where `--limit 10000` or higher is common, and for piped workflows where downstream consumers (jq, LLM ingestion) can begin processing before the query completes. 
+ +**`print_list_notes_jsonl_stream(conn, filters)`** — streaming JSONL: + +```rust +// Execute query, iterate over rows with a callback +query_notes_stream(&conn, &filters, |row| { + let json_row = NoteListRowJson::from(&row); + println!("{}", serde_json::to_string(&json_row).unwrap()); + Ok(()) +})?; +``` + +Each line is a complete `NoteListRowJson` object. No envelope, no metadata. This format is ideal for streaming into LLM prompts, `jq` pipelines, or notebook ingestion. + +**`print_list_notes_csv_stream(conn, filters)`** — streaming CSV: + +```rust +let mut wtr = csv::Writer::from_writer(std::io::stdout()); +wtr.write_record(&["id", "gitlab_id", "author_username", "body", "note_type", ...])?; +query_notes_stream(&conn, &filters, |row| { + let json_row = NoteListRowJson::from(&row); + wtr.write_record(&[json_row.id.to_string(), ...])?; + Ok(()) +})?; +wtr.flush()?; +``` + +Columns mirror `NoteListRowJson` field names. Uses the `csv` crate (`csv::Writer`) for RFC 4180-compliant escaping, handling commas, quotes, newlines, and multi-byte characters correctly. + +**`query_notes_stream(conn, filters, row_handler)`** — forward-only row iteration that calls `row_handler` for each row. Uses the same SQL as `query_notes()` but iterates with `rusqlite::Statement::query_map()` instead of collecting into a Vec. The table and JSON formats continue to use the buffered `query_notes()` path since they need `total_count` and `showing` metadata. + +**Note:** The streaming path skips the COUNT query since there's no envelope to report `total_count` in. For JSONL, this is expected — consumers count lines themselves. For CSV, the header row provides column names; row count is implicit. + +**Dependency:** Add `csv = "1"` to `Cargo.toml` under `[dependencies]`. The `csv` crate is well-maintained, widely adopted (~100M downloads), and has zero unsafe code. 
+ +--- + +### Work Chunk 1D: robot-docs Integration + +**Files:** Wherever `robot-docs` manifest is generated (search for `robot-docs` or `RobotDocs` command handler) + +**Depends on:** Work Chunks 1A-1C complete + +Add the `notes` command to the robot-docs manifest with: +- Command name, description, flags (including `--format`, `--until`, `--resolution`, `--contains`) +- Response schema for robot mode +- Exit codes + +Also update `--type` value_parser on `SearchArgs` (line 542 of `src/cli/mod.rs`) to include `"note"` and `"notes"` as valid values for `--source-type` (this is forward-prep for Phase 2 but doesn't break anything until Phase 2 lands). + +--- + +### Work Chunk 1E: Composite Query Index + +**Files:** `migrations/022_notes_query_index.sql`, `src/core/db.rs` + +**Depends on:** Nothing (standalone, can run in parallel with 1A) + +**Context:** The `notes` table already has single-column indexes on `author_username`, `discussion_id`, `note_type`, `position_new_path`, and a composite `idx_notes_diffnote_path_created`. However, the new `query_notes()` function's most common query patterns would benefit from composite covering indexes. + +#### Tests to Write First + +```rust +#[test] +fn test_migration_022_indexes_exist() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + run_migrations(&conn).unwrap(); + + // Verify all indexes were created + let count: i64 = conn.query_row( + "SELECT COUNT(*) FROM sqlite_master WHERE type='index' AND name IN ( + 'idx_notes_user_created', 'idx_notes_project_created', + 'idx_notes_project_path_created', + 'idx_discussions_issue_id', 'idx_discussions_mr_id' + )", + [], + |r| r.get(0), + ).unwrap(); + assert_eq!(count, 5); +} +``` + +#### Implementation + +**Migration SQL** (`migrations/022_notes_query_index.sql`): + +```sql +-- Composite index for the common "notes by author" query pattern: +-- non-system notes filtered by author, sorted by created_at DESC with id tiebreaker. 
+-- The is_system partial index condition avoids indexing system notes (which are +-- filtered out by default and typically comprise 30-50% of all notes). +-- Uses COLLATE NOCASE to match the query's case-insensitive author comparison. +CREATE INDEX IF NOT EXISTS idx_notes_user_created +ON notes(project_id, author_username COLLATE NOCASE, created_at DESC, id DESC) +WHERE is_system = 0; + +-- Composite index for the common "all notes in project by date" query pattern: +-- serves project-scoped listings without author filter. +CREATE INDEX IF NOT EXISTS idx_notes_project_created +ON notes(project_id, created_at DESC, id DESC) +WHERE is_system = 0; + +-- Composite index for path-centric note queries (--path with project/date filters). +-- DiffNote reviews on specific files are a stated hot path for the reviewer +-- profiling use case. Only indexes rows where position_new_path is populated. +CREATE INDEX IF NOT EXISTS idx_notes_project_path_created +ON notes(project_id, position_new_path, created_at DESC, id DESC) +WHERE is_system = 0 AND position_new_path IS NOT NULL; + +-- Index on discussions.issue_id for efficient JOIN when filtering by parent issue. +-- The query_notes() function JOINs discussions to reach parent entities. +CREATE INDEX IF NOT EXISTS idx_discussions_issue_id +ON discussions(issue_id); + +-- Index on discussions.merge_request_id for efficient JOIN when filtering by parent MR. +CREATE INDEX IF NOT EXISTS idx_discussions_mr_id +ON discussions(merge_request_id); +``` + +The first partial index serves the primary use case (author-scoped queries) with `COLLATE NOCASE` matching the query's case-insensitive author comparison. The second serves project-scoped date-range queries (`--since`/`--until` without `--author`). The third serves path-centric DiffNote queries (`--path src/auth/` combined with project and date filters). All three exclude system notes, which are filtered out by default. 
The discussion indexes accelerate the JOIN path used by all note queries. + +**Register in `src/core/db.rs`:** + +Add to the `MIGRATIONS` array (after migration 021): + +```rust +( + "022", + include_str!("../../migrations/022_notes_query_index.sql"), +), +``` + +**Note:** This bumps the migration number, so Work Chunk 2A's schema migration (which was originally numbered 022) becomes migration **023** instead. + +--- + +## Phase 2: Per-Note Documents + +### Work Chunk 2A: Schema Migration (023) + +**Files:** `migrations/023_note_documents.sql`, `src/core/db.rs` + +**Depends on:** Work Chunk 1E (must come after migration 022) + +**Context:** Current migration is 021 (022 after Work Chunk 1E). The `documents` and `dirty_sources` tables have CHECK constraints limiting `source_type` to `('issue','merge_request','discussion')`. SQLite doesn't support `ALTER TABLE ... ALTER CONSTRAINT`, so we use the table-rebuild pattern. + +#### Tests to Write First + +```rust +// In src/core/db.rs tests or a new migration test: + +#[test] +fn test_migration_023_allows_note_source_type() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + run_migrations(&conn).unwrap(); + + // Should NOT error — note is now a valid source_type + conn.execute( + "INSERT INTO dirty_sources (source_type, source_id, queued_at) VALUES ('note', 1, 1000)", + [], + ).unwrap(); + + conn.execute( + "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash, is_truncated) + VALUES ('note', 1, 1, 'test', 'abc123', 0)", + [], + ).unwrap(); +} + +#[test] +fn test_migration_023_preserves_existing_data() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + run_migrations(&conn).unwrap(); + + // Insert with old source types still works + conn.execute( + "INSERT INTO dirty_sources (source_type, source_id, queued_at) VALUES ('issue', 1, 1000)", + [], + ).unwrap(); + conn.execute( + "INSERT INTO dirty_sources (source_type, source_id, queued_at) VALUES 
('discussion', 2, 1000)", + [], + ).unwrap(); + + let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap(); + assert_eq!(count, 2); +} + +#[test] +fn test_migration_023_fts_triggers_intact() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + run_migrations(&conn).unwrap(); + + // Insert a note document + conn.execute( + "INSERT INTO documents (source_type, source_id, project_id, title, content_text, content_hash, is_truncated) + VALUES ('note', 1, 1, 'Test Note', 'This is the note body', 'hash123', 0)", + [], + ).unwrap(); + + // FTS should auto-sync via trigger + let count: i64 = conn.query_row( + "SELECT COUNT(*) FROM documents_fts WHERE documents_fts MATCH 'note'", + [], + |r| r.get(0), + ).unwrap(); + assert_eq!(count, 1); +} + +#[test] +fn test_migration_023_row_counts_preserved() { + // This test verifies the migration doesn't lose data during table rebuild. + // It runs all migrations up to version 22, inserts test data into documents/dirty_sources/ + // document_labels/document_paths BEFORE migration 023, then verifies + // counts are identical after migration 023 runs. + // (Implementation: create_connection_at_version(22) + insert data + run_migration(23) + assert counts) + // Note: This may require a test helper that runs migrations up to a specific version. +} +``` + +#### Implementation + +**Migration SQL** (`migrations/023_note_documents.sql`): + +The tables with CHECK constraints that need rebuilding: +1. `dirty_sources` — add `'note'` to source_type CHECK +2. `documents` — add `'note'` to source_type CHECK + +Pattern: create new table, copy data, drop old, rename. Must also recreate FTS triggers (they reference the table by name) and all indexes. + +**CRITICAL:** The `documents_fts` external content table references `documents` by rowid. Rebuilding `documents` changes rowids unless we preserve them. 
Use `INSERT INTO documents_new SELECT * FROM documents` to preserve the `id` (PRIMARY KEY = rowid).
+
+**CRITICAL:** The FTS triggers (`documents_ai`, `documents_ad`, `documents_au`) must be dropped and recreated after the table rebuild because they reference `documents` which was dropped/renamed.
+
+**Migration safety requirements:**
+- The migration executes as a single transaction (SQLite migration runner wraps each migration in a transaction).
+- After the table rebuild, verify row counts match: `SELECT COUNT(*) FROM documents` must equal the pre-rebuild count. Plain SQLite SQL has no variables and `RAISE()` is only valid inside triggers, so perform this check in the Rust migration runner: capture the count before executing the rebuild SQL, re-query after, and abort (roll back) the transaction on mismatch.
+- Run `PRAGMA foreign_key_check` after the rebuild and abort on any violation.
+- Rebuild FTS index and verify `documents_fts` row count matches `documents` row count.
+
+The migration must:
+1. Drop FTS triggers
+2. Create `documents_new` with updated CHECK (adding `'note'`)
+3. `INSERT INTO documents_new SELECT * FROM documents`
+4. Drop `documents` (cascades `document_labels`, `document_paths` due to ON DELETE CASCADE — so save those first!)
+5. Actually: disable foreign keys, copy document_labels and document_paths data, drop old tables, rename new, recreate junction tables, restore data, recreate FTS triggers, recreate all indexes. Caveat: `PRAGMA foreign_keys = OFF` is a no-op inside an open transaction — either toggle it outside the migration transaction or use `PRAGMA defer_foreign_keys = ON` within it.
+6. Same pattern for `dirty_sources` (simpler — no dependents)
+
+```sql
+-- Backfill: seed all existing non-system notes into the dirty queue
+-- so the next generate-docs run creates documents for them.
+-- Uses LEFT JOIN to skip notes that already have documents (idempotent).
+-- ON CONFLICT DO NOTHING handles notes already in the dirty queue.
+INSERT INTO dirty_sources (source_type, source_id, queued_at)
+SELECT 'note', n.id, CAST(strftime('%s', 'now') AS INTEGER) * 1000
+FROM notes n
+LEFT JOIN documents d
+  ON d.source_type = 'note' AND d.source_id = n.id
+WHERE n.is_system = 0 AND d.id IS NULL
+ON CONFLICT(source_type, source_id) DO NOTHING;
+```
+
+**Register in `src/core/db.rs`:**
+
+Add to the `MIGRATIONS` array (after migration 022):
+
+```rust
+(
+    "023",
+    include_str!("../../migrations/023_note_documents.sql"),
+),
+```
+
+**Note:** Unlike the later backfill migration (024), this is a schema migration — it rebuilds `documents` and `dirty_sources` and recreates their triggers and indexes, so the integrity gates above must pass before it commits. The backfill SQL shown here is idempotent and safe on empty databases (no notes = no-op); on databases with existing notes it queues them for document generation on the next `lore generate-docs` or `lore sync` run.
+
+---
+
+### Work Chunk 2B: SourceType Enum Extension
+
+**Files:** `src/documents/extractor.rs`
+
+**Depends on:** Work Chunk 2A (migration must exist so test DBs have the right schema)
+
+#### Tests to Write First
+
+Add to `src/documents/extractor.rs` in the existing test module:
+
+```rust
+#[test]
+fn test_source_type_parse_note() {
+    assert_eq!(SourceType::parse("note"), Some(SourceType::Note));
+    assert_eq!(SourceType::parse("notes"), Some(SourceType::Note));
+    assert_eq!(SourceType::parse("NOTE"), Some(SourceType::Note));
+}
+
+#[test]
+fn test_source_type_note_as_str() {
+    assert_eq!(SourceType::Note.as_str(), "note");
+}
+
+#[test]
+fn test_source_type_note_display() {
+    assert_eq!(format!("{}", SourceType::Note), "note");
+}
+
+#[test]
+fn test_source_type_note_serde_roundtrip() {
+    let st = SourceType::Note;
+    let json = serde_json::to_string(&st).unwrap();
+    assert_eq!(json, "\"note\"");
+    let parsed: SourceType = serde_json::from_str(&json).unwrap();
+    assert_eq!(parsed, SourceType::Note);
+}
+```
+
+#### Implementation
+
+In `src/documents/extractor.rs`:
+
+1. Add `Note` variant to `SourceType` enum (line 18):
+   ```rust
+   Note,
+   ```
+
+2. 
Add match arm to `as_str()` (line 27): + ```rust + Self::Note => "note", + ``` + +3. Add parse aliases (line 35): + ```rust + "note" | "notes" => Some(Self::Note), + ``` + +--- + +### Work Chunk 2C: Note Document Extractor + +**Files:** `src/documents/extractor.rs` + +**Depends on:** Work Chunk 2B + +**Context:** Follows the exact pattern of `extract_issue_document()` (lines 85-184) and `extract_discussion_document()` (lines 302-516). The new function extracts a single non-system note into a `DocumentData` struct. + +#### Tests to Write First + +Add to `src/documents/extractor.rs` test module. Uses `setup_discussion_test_db()` (line 1025) and `insert_note()` (line 1086) helpers that already exist. + +```rust +#[test] +fn test_note_document_basic_format() { + let conn = setup_discussion_test_db(); + insert_issue(&conn, 1, 42, Some("Auth redesign"), Some("desc"), "opened", Some("alice"), + Some("https://gitlab.example.com/group/project-one/-/issues/42")); + insert_discussion(&conn, 1, "Issue", Some(1), None); + insert_note(&conn, 1, 12345, 1, Some("jdefting"), Some("This function is too complex, consider extracting the validation logic."), 1710460800000, false, None, None); + + let doc = extract_note_document(&conn, 1).unwrap().unwrap(); + assert_eq!(doc.source_type, SourceType::Note); + assert_eq!(doc.source_id, 1); + assert_eq!(doc.project_id, 1); + assert_eq!(doc.author_username, Some("jdefting".to_string())); + assert!(doc.content_text.contains("[[Note]]")); + assert!(doc.content_text.contains("author: @jdefting")); + assert!(doc.content_text.contains("This function is too complex")); + assert!(doc.content_text.contains("Issue #42: Auth redesign")); + assert!(doc.content_text.contains("group/project-one")); + assert_eq!(doc.title, Some("Note by @jdefting on Issue #42".to_string())); + assert!(!doc.is_truncated); +} + +#[test] +fn test_note_document_diffnote_with_path() { + let conn = setup_discussion_test_db(); + insert_mr(&conn, 1, 99, Some("JWT Auth"), 
Some("desc"), Some("opened"), Some("alice"), + Some("feat/jwt"), Some("main"), Some("https://gitlab.example.com/group/project-one/-/merge_requests/99")); + insert_discussion(&conn, 1, "MergeRequest", None, Some(1)); + insert_note(&conn, 1, 54321, 1, Some("jdefting"), Some("This should use a match statement"), + 1710460800000, false, Some("src/old_auth.rs"), Some("src/auth.rs")); + + let doc = extract_note_document(&conn, 1).unwrap().unwrap(); + assert_eq!(doc.paths, vec!["src/auth.rs", "src/old_auth.rs"]); + assert!(doc.content_text.contains("path: src/auth.rs")); + assert!(doc.content_text.contains("MR !99: JWT Auth")); + assert_eq!(doc.title, Some("Note by @jdefting on MR !99".to_string())); + assert!(doc.url.unwrap().contains("#note_54321")); +} + +#[test] +fn test_note_document_inherits_parent_labels() { + let conn = setup_discussion_test_db(); + insert_issue(&conn, 1, 10, Some("Test"), Some("desc"), "opened", None, None); + insert_label(&conn, 1, "backend"); + insert_label(&conn, 2, "security"); + link_issue_label(&conn, 1, 1); + link_issue_label(&conn, 1, 2); + insert_discussion(&conn, 1, "Issue", Some(1), None); + insert_note(&conn, 1, 100, 1, Some("alice"), Some("Hello"), 1710460800000, false, None, None); + + let doc = extract_note_document(&conn, 1).unwrap().unwrap(); + assert_eq!(doc.labels, vec!["backend", "security"]); +} + +#[test] +fn test_note_document_mr_labels() { + let conn = setup_discussion_test_db(); + insert_mr(&conn, 1, 10, Some("Test"), None, Some("opened"), None, None, None, None); + insert_label(&conn, 1, "review"); + link_mr_label(&conn, 1, 1); + insert_discussion(&conn, 1, "MergeRequest", None, Some(1)); + insert_note(&conn, 1, 100, 1, Some("reviewer"), Some("LGTM"), 1710460800000, false, None, None); + + let doc = extract_note_document(&conn, 1).unwrap().unwrap(); + assert_eq!(doc.labels, vec!["review"]); +} + +#[test] +fn test_note_document_system_note_returns_none() { + let conn = setup_discussion_test_db(); + insert_issue(&conn, 1, 
10, Some("Test"), Some("desc"), "opened", None, None); + insert_discussion(&conn, 1, "Issue", Some(1), None); + insert_note(&conn, 1, 100, 1, Some("bot"), Some("assigned to @alice"), 1710460800000, true, None, None); + + let result = extract_note_document(&conn, 1).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn test_note_document_not_found() { + let conn = setup_discussion_test_db(); + let result = extract_note_document(&conn, 999).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn test_note_document_orphaned_discussion() { + // Discussion exists but parent issue was deleted + let conn = setup_discussion_test_db(); + insert_issue(&conn, 99, 10, Some("Deleted"), None, "opened", None, None); + insert_discussion(&conn, 1, "Issue", Some(99), None); + insert_note(&conn, 1, 100, 1, Some("alice"), Some("Hello"), 1710460800000, false, None, None); + conn.execute("PRAGMA foreign_keys = OFF", []).unwrap(); + conn.execute("DELETE FROM issues WHERE id = 99", []).unwrap(); + conn.execute("PRAGMA foreign_keys = ON", []).unwrap(); + + let result = extract_note_document(&conn, 1).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn test_note_document_hash_deterministic() { + let conn = setup_discussion_test_db(); + insert_issue(&conn, 1, 10, Some("Test"), Some("desc"), "opened", None, None); + insert_discussion(&conn, 1, "Issue", Some(1), None); + insert_note(&conn, 1, 100, 1, Some("alice"), Some("Comment"), 1710460800000, false, None, None); + + let doc1 = extract_note_document(&conn, 1).unwrap().unwrap(); + let doc2 = extract_note_document(&conn, 1).unwrap().unwrap(); + assert_eq!(doc1.content_hash, doc2.content_hash); + assert_eq!(doc1.labels_hash, doc2.labels_hash); + assert_eq!(doc1.paths_hash, doc2.paths_hash); +} + +#[test] +fn test_note_document_empty_body() { + let conn = setup_discussion_test_db(); + insert_issue(&conn, 1, 10, Some("Test"), Some("desc"), "opened", None, None); + insert_discussion(&conn, 1, "Issue", Some(1), None); + insert_note(&conn, 
1, 100, 1, Some("alice"), Some(""), 1710460800000, false, None, None);
+
+    // Should still produce a document (body is optional in schema)
+    let doc = extract_note_document(&conn, 1).unwrap().unwrap();
+    assert!(doc.content_text.contains("[[Note]]"));
+}
+
+#[test]
+fn test_note_document_null_body() {
+    let conn = setup_discussion_test_db();
+    insert_issue(&conn, 1, 10, Some("Test"), Some("desc"), "opened", None, None);
+    insert_discussion(&conn, 1, "Issue", Some(1), None);
+    insert_note(&conn, 1, 100, 1, Some("alice"), None, 1710460800000, false, None, None);
+
+    // Should still produce a document (body is optional in schema)
+    let doc = extract_note_document(&conn, 1).unwrap().unwrap();
+    assert!(doc.content_text.contains("[[Note]]"));
+}
+```
+
+#### Implementation
+
+Add `extract_note_document()` to `src/documents/extractor.rs` (after `extract_discussion_document`, around line 516):
+
+```rust
+pub fn extract_note_document(
+    conn: &Connection,
+    note_id: i64,
+) -> Result<Option<DocumentData>> {
+    // 1. Fetch the note
+    let note_row = conn.query_row(
+        "SELECT n.id, n.gitlab_id, n.author_username, n.body, n.note_type,
+                n.is_system, n.created_at, n.updated_at,
+                n.position_old_path, n.position_new_path,
+                n.position_new_line,
+                n.resolvable, n.resolved,
+                d.noteable_type, d.issue_id, d.merge_request_id,
+                p.path_with_namespace, p.id AS project_id
+         FROM notes n
+         JOIN discussions d ON n.discussion_id = d.id
+         JOIN projects p ON n.project_id = p.id
+         WHERE n.id = ?1",
+        rusqlite::params![note_id],
+        |row| { /* map all fields */ },
+    );
+    // Handle QueryReturnedNoRows -> Ok(None)
+
+    // 2. Skip system notes
+    if is_system { return Ok(None); }
+
+    // 3. Fetch parent entity (Issue or MR) — same pattern as extract_discussion_document lines 332-401
+    //    Get parent_iid, parent_title, parent_web_url, labels
+
+    // 4. Build paths BTreeSet from position_old_path, position_new_path
+
+    // 5. Build URL: parent_web_url + "#note_{gitlab_id}"
+
+    // 6. 
Format content with structured metadata header: + // [[Note]] + // source_type: note + // note_gitlab_id: {gitlab_id} + // project: {path_with_namespace} + // parent_type: {Issue|MergeRequest} + // parent_iid: {iid} + // parent_title: {title} + // note_type: {DiffNote|DiscussionNote|Comment} + // author: @{author} + // author_id: {author_id} (only if non-null) + // created_at: {iso8601} + // resolved: {true|false} (only if resolvable) + // path: {position_new_path}:{position_new_line} (only if DiffNote) + // labels: {comma-separated} + // url: {url} + // + // --- Body --- + // + // {body} + + // 7. Title: "Note by @{author} on {parent_type_prefix}" + + // 8. Compute hashes, apply truncate_hard_cap, return DocumentData +} +``` + +The content format uses a structured key-value header optimized for machine parsing and semantic search, followed by the raw note body. This is deliberately different from discussion documents — it's optimized for individual note semantics rather than thread context. + +**Structured header rationale:** The key-value format allows the embedding model and FTS to index structured fields (author, project, parent reference) alongside the free-text body, improving search precision for queries like "jdefting's comments on authentication issues." 
+ +--- + +### Work Chunk 2D: Regenerator & Dirty Tracking Integration + +**Files:** `src/documents/regenerator.rs`, `src/ingestion/discussions.rs`, `src/ingestion/mr_discussions.rs` + +**Depends on:** Work Chunks 0A, 2B, 2C + +#### Tests to Write First + +**In `src/documents/regenerator.rs` tests:** + +```rust +#[test] +fn test_regenerate_note_document() { + let conn = setup_db(); + // Add discussions + notes tables to setup_db() (or use a richer setup) + // Insert: project, issue, discussion, non-system note + // mark_dirty(SourceType::Note, note_id) + // regenerate_dirty_documents() + // Assert: document created with source_type = 'note' + // Assert: document content contains note body +} + +#[test] +fn test_regenerate_note_system_note_deletes() { + // Insert system note, mark dirty + // regenerate_dirty_documents() + // Assert: no document created (extract returns None -> delete path) +} + +#[test] +fn test_regenerate_note_unchanged() { + // Create note, regenerate, mark dirty again, regenerate + // Assert: second run returns unchanged = 1 +} + +#[test] +fn test_note_ingestion_idempotent_across_two_syncs() { + // Setup: project, issue, discussion, 3 non-system notes + // Run ingestion once -> verify 3 dirty notes queued + // Regenerate documents -> verify 3 note documents created + // Run ingestion again with identical data + // Assert: no new dirty entries (changed_semantics = false for all) +} +``` + +**In `src/ingestion/dirty_tracker.rs` tests:** + +```rust +#[test] +fn test_mark_dirty_note_type() { + // Update the test DB setup to include 'note' in CHECK constraint + let conn = setup_db(); // This needs the new CHECK + mark_dirty(&conn, SourceType::Note, 1).unwrap(); + let results = get_dirty_sources(&conn).unwrap(); + assert_eq!(results.len(), 1); + assert_eq!(results[0].0, SourceType::Note); +} +``` + +#### Implementation + +**1. 
Update `regenerate_one()` in `src/documents/regenerator.rs`** (line 90):
+
+```rust
+SourceType::Note => extract_note_document(conn, source_id)?,
+```
+
+And add the import at line 8:
+
+```rust
+use crate::documents::{
+    DocumentData, SourceType, extract_discussion_document, extract_issue_document,
+    extract_mr_document, extract_note_document,
+};
+```
+
+**2. Add change-aware dirty marking in `src/ingestion/discussions.rs`** (in the new upsert loop from Phase 0):
+
+```rust
+for note in &normalized_notes {
+    let outcome = upsert_note_for_issue(&tx, local_discussion_id, &note, last_seen_at)?;
+    if !note.is_system && outcome.changed_semantics {
+        dirty_tracker::mark_dirty_tx(&tx, SourceType::Note, outcome.local_note_id)?;
+    }
+}
+sweep_stale_issue_notes(&tx, local_discussion_id, last_seen_at)?;
+```
+
+**3. Same change-aware dirty marking in `src/ingestion/mr_discussions.rs`** (update the existing upsert loop):
+
+```rust
+let outcome = upsert_note(&tx, local_discussion_id, &note, last_seen_at, None)?;
+if !note.is_system && outcome.changed_semantics {
+    dirty_tracker::mark_dirty_tx(&tx, SourceType::Note, outcome.local_note_id)?;
+}
+```
+
+**4. Update `dirty_tracker.rs` test `setup_db()`** to include `'note'` in the CHECK constraint (line 134).
+
+**5. Update `regenerator.rs` test `setup_db()`** to include the discussions + notes tables so note-type regeneration tests can run.
+
+---
+
+### Work Chunk 2E: Generate-Docs Full Rebuild Support
+
+**Files:** The `generate-docs` command handler — search for where `--full` mode seeds `dirty_sources` (likely the generate-docs handler or a dedicated seeder function)
+
+**Depends on:** Work Chunk 2D
+
+**Context:** When `lore generate-docs --full` runs, it seeds ALL issues, MRs, and discussions into the dirty queue. Notes must be seeded too. 
+ +#### Tests to Write First + +```rust +#[test] +fn test_full_seed_includes_notes() { + // Setup DB with project, issue, discussion, 3 non-system notes, 1 system note + // Call seed_all_dirty(conn) or whatever the full-rebuild seeder is named + // Assert: dirty_sources contains 3 entries with source_type = 'note' + // Assert: system note is NOT in dirty_sources +} + +#[test] +fn test_note_document_count_stable_after_second_generate_docs_full() { + // Setup DB with project, issue, discussion, 5 non-system notes + // Run generate-docs --full equivalent (seed + regenerate) + // Record document count + // Run generate-docs --full again + // Assert: document count unchanged (idempotent) + // Assert: dirty queue is empty after second run +} +``` + +#### Implementation + +Find the function that seeds dirty_sources for `--full` mode (likely in the generate-docs handler or a dedicated seeder function). Add: + +```sql +INSERT INTO dirty_sources (source_type, source_id, queued_at) +SELECT 'note', n.id, ?1 +FROM notes n +WHERE n.is_system = 0 +ON CONFLICT(source_type, source_id) DO UPDATE SET + queued_at = excluded.queued_at, + attempt_count = 0, + last_attempt_at = NULL, + last_error = NULL, + next_attempt_at = NULL +``` + +--- + +### Work Chunk 2F: Search CLI `--type note` Support + +**Files:** `src/cli/mod.rs`, `src/cli/commands/search.rs` (display code) + +**Depends on:** Work Chunks 2A-2E (documents must exist to be searched) + +#### Tests to Write First + +Integration/smoke test: + +```rust +#[test] +fn test_search_source_type_note_filter() { + // This is essentially testing that SourceType::Note flows through + // the existing search pipeline correctly. Since the search filter + // code is generic (filters.rs:70-73), the main test is that + // SourceType::parse("note") works — already covered in 2B. + // Add a smoke test that the CLI accepts --type note. +} +``` + +#### Implementation + +1. 
Update `SearchArgs.source_type` value_parser in `src/cli/mod.rs` (line 542):
+   ```rust
+   #[arg(long = "type", value_name = "TYPE",
+         value_parser = ["issue", "mr", "discussion", "note", "notes"],
+         help_heading = "Filters")]
+   pub source_type: Option<String>,
+   ```
+
+2. Update the search results display to show `"Note"` prefix for note-type results (check `print_search_results` in `src/cli/commands/search.rs`).
+
+---
+
+### Work Chunk 2G: Parent Metadata Change Propagation
+
+**Files:** `src/ingestion/orchestrator.rs` (or wherever parent entity updates trigger dirty marking), `src/documents/regenerator.rs`
+
+**Depends on:** Work Chunk 2D
+
+**Context:** Note documents inherit metadata from their parent issue/MR — specifically labels and title. When a parent's title or labels change, the note documents derived from that parent become stale. The existing ingestion pipeline already marks discussion documents dirty when parent metadata changes. Note documents need the same treatment.
+
+#### Problem
+
+If issue #42's title changes from "Auth redesign" to "Auth overhaul", all note documents under that issue still say "Issue #42: Auth redesign" until their content is regenerated. Similarly, label changes on the parent propagate into the note document's `labels` field and `label_names` text. 
+ +#### Tests to Write First + +```rust +#[test] +fn test_parent_title_change_marks_notes_dirty() { + // Setup: project, issue, discussion, 2 non-system notes + // Generate note documents (verify they exist) + // Change the issue title + // Trigger the parent-change propagation + // Assert: both note documents are in dirty_sources +} + +#[test] +fn test_parent_label_change_marks_notes_dirty() { + // Setup: project, issue with label "backend", discussion, note + // Generate note document (verify labels = ["backend"]) + // Add label "security" to the issue + // Trigger the parent-change propagation + // Assert: note document is in dirty_sources + // Regenerate and verify labels = ["backend", "security"] +} +``` + +#### Implementation + +Find where the ingestion pipeline detects parent entity changes and marks discussion documents dirty. Add the same logic for note documents: + +```sql +-- When an issue's title or labels change, mark all its non-system notes dirty +INSERT INTO dirty_sources (source_type, source_id, queued_at) +SELECT 'note', n.id, ?1 +FROM notes n +JOIN discussions d ON n.discussion_id = d.id +WHERE d.issue_id = ?2 AND n.is_system = 0 +ON CONFLICT(source_type, source_id) DO UPDATE SET + queued_at = excluded.queued_at, + attempt_count = 0 +``` + +Same pattern for MR parent changes. The exact integration point depends on how the existing discussion dirty-marking works — it should be adjacent to that code. + +**Note on deletion handling:** Note deletion is handled by two complementary mechanisms: +1. **Immediate propagation (Work Chunk 0B):** When sweep deletes stale notes, documents and dirty_sources entries are cleaned up in the same transaction. No stale search results. +2. **Eventual consistency (generate-docs --full):** For edge cases where a note was deleted outside the normal sweep path, the full rebuild catches orphaned documents since the note row no longer exists and `extract_note_document()` returns `None` -> document deleted. 
+ +No additional deletion logic is needed beyond Work Chunk 0B + the existing regenerator orphan cleanup. + +--- + +### Work Chunk 2H: Backfill Existing Notes After Upgrade (Migration 024) + +**Files:** `migrations/024_note_dirty_backfill.sql`, `src/core/db.rs` + +**Depends on:** Work Chunk 2A (migration 023 must exist so `dirty_sources` accepts `source_type='note'`) + +**Context:** When a user upgrades to a version with note document support, existing notes in the database have no corresponding documents. Without a backfill, only notes that change after the upgrade would get documents — historical notes remain invisible to search. This migration seeds all existing non-system notes into the dirty queue so the next `generate-docs` run creates documents for them. + +#### Tests to Write First + +```rust +#[test] +fn test_migration_024_backfills_existing_notes() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + // Run migrations up through 023 + // Insert: project, issue, discussion, 5 non-system notes, 2 system notes + // Run migration 024 + // Assert: dirty_sources contains 5 entries with source_type = 'note' + // Assert: system notes are NOT in dirty_sources +} + +#[test] +fn test_migration_024_idempotent_with_existing_documents() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + // Run all migrations including 024 + // Insert: project, issue, discussion, 3 non-system notes + // Create note documents for 2 of 3 notes (simulate partial state) + // Re-run the backfill SQL manually + // Assert: only the 1 note without a document is in dirty_sources + // Assert: ON CONFLICT DO NOTHING prevents duplicates +} + +#[test] +fn test_migration_024_skips_notes_already_in_dirty_queue() { + let conn = create_connection(Path::new(":memory:")).unwrap(); + // Run all migrations + // Insert note and manually add to dirty_sources + // Re-run backfill SQL + // Assert: no duplicate entries (ON CONFLICT DO NOTHING) +} +``` + +#### Implementation + 
+**Migration SQL** (`migrations/024_note_dirty_backfill.sql`): + +```sql +-- Backfill: seed all existing non-system notes into the dirty queue +-- so the next generate-docs run creates documents for them. +-- Uses LEFT JOIN to skip notes that already have documents (idempotent). +-- ON CONFLICT DO NOTHING handles notes already in the dirty queue. +INSERT INTO dirty_sources (source_type, source_id, queued_at) +SELECT 'note', n.id, CAST(strftime('%s', 'now') AS INTEGER) * 1000 +FROM notes n +LEFT JOIN documents d + ON d.source_type = 'note' AND d.source_id = n.id +WHERE n.is_system = 0 AND d.id IS NULL +ON CONFLICT(source_type, source_id) DO NOTHING; +``` + +**Register in `src/core/db.rs`:** + +Add to the `MIGRATIONS` array (after migration 023): + +```rust +( + "024", + include_str!("../../migrations/024_note_dirty_backfill.sql"), +), +``` + +**Note:** This is a data-only migration — no schema changes. It's safe to run on empty databases (no notes = no-op). On databases with existing notes, it queues them for document generation on the next `lore generate-docs` or `lore sync` run. + +--- + +### Work Chunk 2I: Batch Parent Metadata Cache for Note Regeneration + +**Files:** `src/documents/regenerator.rs`, `src/documents/extractor.rs` + +**Depends on:** Work Chunk 2C (extractor function must exist) + +**Context:** The `extract_note_document()` function fetches parent entity metadata (issue/MR title, labels, project path) via individual SQL queries per note. During the initial backfill of ~8,000 existing notes, this creates N+1 query amplification: each note triggers its own parent metadata lookup, even though many notes share the same parent entity. For example, 50 notes on the same MR would execute 50 identical parent metadata queries. + +This is a performance optimization for batch regeneration, not a correctness change. 
Individual note regeneration (dirty tracking during incremental sync) is unaffected — the N+1 cost is negligible for the typical 1-10 dirty notes per sync. + +#### Tests to Write First + +```rust +#[test] +fn test_note_regeneration_batch_uses_cache() { + // Setup: project, issue with 10 non-system notes + // Mark all 10 as dirty + // Run regenerate_dirty_documents() + // Assert: all 10 documents created correctly + // Assert: parent metadata query count == 1 (not 10) + // (Use a query counter or verify via cache hit metrics) +} + +#[test] +fn test_note_regeneration_cache_consistent_with_direct_extraction() { + // Setup: project, issue with labels, discussion, 3 notes + // Extract note document directly (no cache) + // Extract via cached batch path + // Assert: content_hash is identical for both paths + // Assert: labels_hash is identical for both paths +} + +#[test] +fn test_note_regeneration_cache_invalidates_across_parents() { + // Setup: 2 issues, each with notes + // Regenerate notes from both issues in one batch + // Assert: each issue's notes get correct parent metadata + // (cache keyed by (noteable_type, parent_id), not globally shared) +} +``` + +#### Implementation + +**1. Add `ParentMetadataCache` struct** in `src/documents/extractor.rs`: + +```rust +use std::collections::HashMap; + +/// Cache for parent entity metadata during batch note document extraction. +/// Keyed by (noteable_type, parent_local_id) to avoid repeated lookups +/// when multiple notes share the same parent issue/MR. 
+pub struct ParentMetadataCache {
+    cache: HashMap<(String, i64), ParentMetadata>,
+}
+
+pub struct ParentMetadata {
+    pub iid: i64,
+    pub title: Option<String>,
+    pub web_url: Option<String>,
+    pub labels: Vec<String>,
+    pub project_path: String,
+}
+
+impl ParentMetadataCache {
+    pub fn new() -> Self { Self { cache: HashMap::new() } }
+
+    pub fn get_or_fetch(
+        &mut self,
+        conn: &Connection,
+        noteable_type: &str,
+        parent_id: i64,
+    ) -> Result<&ParentMetadata> {
+        // HashMap entry API: fetch from DB on miss, return cached on hit
+    }
+}
+```
+
+**2. Add `extract_note_document_cached()` variant** that accepts `&mut ParentMetadataCache` and uses it instead of inline parent metadata queries. The uncached `extract_note_document()` remains for single-note regeneration.
+
+**3. Update batch regeneration loop** in `src/documents/regenerator.rs`:
+
+```rust
+// In the regeneration loop, when processing a batch of dirty sources:
+let mut parent_cache = ParentMetadataCache::new();
+
+for (source_type, source_id) in dirty_batch {
+    match source_type {
+        SourceType::Note => extract_note_document_cached(conn, source_id, &mut parent_cache)?,
+        // Other source types use existing extraction functions (no cache needed)
+        _ => regenerate_one(conn, source_type, source_id)?,
+    };
+}
+```
+
+**Scope limit:** The cache is created fresh per regeneration batch and discarded after. No cross-batch persistence, no invalidation complexity. The cache is purely an optimization for batch processing where many notes share parents. 
+ +--- + +## Verification Checklist + +After all chunks are complete, run the full quality gate: + +```bash +cargo test +cargo clippy --all-targets -- -D warnings +cargo fmt --check +``` + +Then functional smoke tests: + +```bash +# Phase 0 verification +# Sync twice and verify note IDs are stable: +lore sync +lore -J notes --limit 5 # Record gitlab_ids and local ids +lore sync +lore -J notes --limit 5 # Verify same local ids for same gitlab_ids + +# Phase 1 verification +lore -J notes --author jdefting --since 365d --limit 5 +lore -J notes --note-type DiffNote --path src/ --limit 10 +lore notes --for-mr 456 -p group/repo +lore notes --for-issue 42 -p group/repo # Verify project-scoping works +lore notes --since 180d --until 90d # Bounded window (180 days ago to 90 days ago) +lore notes --resolution unresolved # Tri-state resolution filter +lore notes --contains "unwrap" --note-type DiffNote # Body substring + type filter +lore notes --author jdefting --format jsonl | wc -l # JSONL streaming +lore -J notes --gitlab-note-id 12345 # Precision filter: exact GitLab note +lore -J notes --discussion-id 42 # Precision filter: all notes in thread + +# Phase 2 verification +lore sync # Should generate note documents +lore -J stats # Should show note document count in source_type breakdown +lore -J search "code complexity" --type note --author jdefting +lore -J search "error handling" --type note --since 180d +``` + +Idempotence checks: + +```bash +# Verify generate-docs --full is idempotent +lore generate-docs --full +lore -J stats > /tmp/stats1.json +lore generate-docs --full +lore -J stats > /tmp/stats2.json +diff /tmp/stats1.json /tmp/stats2.json # Should be identical (modulo timing metadata) +``` + +Deletion propagation checks: + +```bash +# Verify that deleted notes don't leave stale documents +# (Manual test: delete a note on GitLab, sync, verify document is gone) +lore sync +lore -J search "specific phrase from deleted note" --type note +# Should return no results 
+``` + +Performance and query plan verification: + +```bash +# Verify indexes are used for common query patterns +# Run EXPLAIN QUERY PLAN for the hot paths: +sqlite3 ~/.local/share/lore/lore.db "EXPLAIN QUERY PLAN +SELECT n.id, n.gitlab_id, n.author_username, n.body, n.note_type, + n.is_system, n.created_at, n.updated_at, + n.position_new_path, n.position_new_line, + n.position_old_path, n.position_old_line, + n.resolvable, n.resolved, n.resolved_by, + d.noteable_type, + COALESCE(i.iid, m.iid) AS parent_iid, + COALESCE(i.title, m.title) AS parent_title, + p.path_with_namespace +FROM notes n +JOIN discussions d ON n.discussion_id = d.id +JOIN projects p ON n.project_id = p.id +LEFT JOIN issues i ON d.issue_id = i.id +LEFT JOIN merge_requests m ON d.merge_request_id = m.id +WHERE n.is_system = 0 AND n.author_username = 'jdefting' COLLATE NOCASE AND n.created_at >= 1704067200000 +ORDER BY n.created_at DESC, n.id DESC +LIMIT 50;" +# Should show SEARCH using idx_notes_user_created + +sqlite3 ~/.local/share/lore/lore.db "EXPLAIN QUERY PLAN +SELECT n.id FROM notes n +JOIN discussions d ON n.discussion_id = d.id +WHERE n.is_system = 0 AND n.project_id = 1 AND n.created_at >= 1704067200000 +ORDER BY n.created_at DESC, n.id DESC +LIMIT 50;" +# Should show SEARCH using idx_notes_project_created + +sqlite3 ~/.local/share/lore/lore.db "EXPLAIN QUERY PLAN +SELECT n.id FROM notes n +JOIN discussions d ON n.discussion_id = d.id +WHERE n.is_system = 0 AND d.issue_id = (SELECT id FROM issues WHERE iid = 42 AND project_id = 1) +ORDER BY n.created_at DESC, n.id DESC +LIMIT 50;" +# Should show SEARCH using idx_discussions_issue_id for the join +``` + +Operational checks: +- `lore -J stats` output includes `documents.notes` count (the stats command queries by hardcoded source_type strings — verify `'note'` is added) +- Verify `lore -J count notes` still reports user vs system breakdown correctly after the changes +- After a full `lore generate-docs --full`, verify note document count 
approximately matches non-system note count from `lore count notes` + +--- + +## Work Chunk Dependency Graph + +``` +0A (stable note identity) ──┬────────────────────────────────────┐ + │ │ + ├── 0B (deletion propagation) ◄──────┤── 2A (migration 023, + cleanup triggers) + │ │ + ├── 0C (sweep safety guard) │ + │ │ + ├── 0D (author_id capture) │ + │ │ +1A (data types + query) ──┐ │ │ + ├── 1B (CLI args + wiring) ──┐ │ + ├── 1C (output formatting) ├── 1D (robot-docs) +1E (query index, mig 022 │ │ │ + + author_id column) ──┘ │ │ + │ │ +2A (migration 023) ───────┐ │ │ + ├── 2B (SourceType enum) │ │ + │ │ │ │ + │ ├── 2C (extractor fn) │ │ + │ │ │ │ │ + │ │ ├── 2D (regenerator + dirty tracking) ◄─┘ + │ │ │ │ + │ │ │ ├── 2E (generate-docs --full) + │ │ │ │ + │ │ │ ├── 2F (search --type note) + │ │ │ │ + │ │ │ ├── 2G (parent change propagation) + │ │ │ │ + │ │ │ ├── 2H (backfill migration 024) + │ │ │ │ + │ │ ├── 2I (batch parent metadata cache) +``` + +**Parallelizable pairs:** +- 0A, 1A, 1E, and 2A can all run simultaneously (no code overlap) +- 0C and 0D can run immediately after 0A (both modify upsert functions from 0A) +- 1C and 2B can run simultaneously +- 2E, 2F, 2G, 2H, and 2I can run simultaneously after 2D (2I only needs 2C) +- 0B depends on both 0A and 2A (needs sweep functions from 0A and documents table accepting 'note' from 2A) + +**Critical path:** 0A -> 0C -> 2D -> 2G (Phase 0 must land before dirty tracking integrates with upsert outcomes) + +**Secondary critical path:** 2A -> 2B -> 2C -> 2D (document pipeline chain) + +--- + +## Estimated Document Volume Impact + +| Entity | Typical Count | Documents Before | Documents After | +|--------|--------------|-----------------|-----------------| +| Issues | 500 | 500 | 500 | +| MRs | 300 | 300 | 300 | +| Discussions | 2,000 | 2,000 | 2,000 | +| Notes (non-system) | ~8,000 | 0 | **+8,000** | +| **Total** | | **2,800** | **10,800** | + +FTS5 handles this comfortably. 
Embedding generation time scales linearly (~4x increase). The three-hash dedup means incremental syncs remain fast. With Phase 0's change-aware dirty marking, only genuinely modified notes trigger regeneration — typical incremental syncs will dirty a small fraction of the 8k total. + +--- + +## Rejected Recommendations + +These recommendations were proposed during review and deliberately rejected. Documenting here to prevent re-proposal. + +- **Feature flag gating / gated rollout** — rejected because this is a single-user CLI tool in early development with no external users. Adding runtime feature gates (`feature.notes_cli`, `feature.note_documents`) for a feature we're building from scratch adds complexity with no benefit. Both phases ship together; there's no "blast radius" to manage. + +- **Keyset pagination / cursor support** — rejected because no existing list command (`lore issues`, `lore mrs`) has pagination. Adding it just for `notes` would be inconsistent. The year-long analysis use case works fine with `--limit 10000`. If pagination becomes needed across all list commands, that's a separate horizontal feature. + +- **Path filtering upgrade (`--path-mode exact|prefix|glob`, `--match-old-path`)** — rejected because the trailing-slash prefix convention is already established across the codebase (issues/MRs use the same pattern). Adding glob mode and old-path matching adds multiple CLI flags for a niche use case. Can be added later if users request it. + +- **Embedding policy knobs (`documents.note_embeddings.min_chars`, `documents.note_embeddings.enabled`, prioritize unresolved DiffNotes)** — rejected because the embedding pipeline already handles volume scaling. Adding per-source-type enable flags and minimum character thresholds is premature optimization. Short notes (e.g., "LGTM", "nit: use `expect()` here") are still semantically valuable for reviewer profiling. The existing embedding batch system handles the volume. 
+
+- **Structured reviewer profiling command (`lore notes profile --author <username>`)** — rejected because this is explicitly a non-goal in the PRD. The reviewer profile report is a downstream consumer of the infrastructure we're building. Adding it here is scope creep. It belongs in a separate PRD after this infrastructure lands.
+
+- **Operational SLOs / queue lag metrics** — rejected because this is a local CLI tool, not a service. "Oldest dirty note age" and "retry backlog size" are service-oriented metrics that don't apply. The existing `lore stats` and `lore -J count` commands provide sufficient observability. If the dirty queue becomes problematic, we add diagnostics then.
+
+- **Replace CHECK constraints with source_types registry table + FK** — rejected because the table rebuild for adding a new source type to a CHECK constraint is a rare, one-time cost (done 4 times across 23 migrations). A registry table adds per-insert FK lookup overhead, complicates the migration (still requires a table rebuild to change from CHECK to FK), and optimizes for a hypothetical future where we frequently add source types. The current CHECK approach is simpler, self-documenting, and sufficient.
+
+- **Unresolved-specific partial index (`idx_notes_unresolved_project_created`)** — rejected because the selectivity is too narrow. Unresolved notes are a small, unpredictable subset. The `idx_notes_project_created` index already covers the project+date scan; adding `WHERE resolvable = 1 AND resolved = 0` provides marginal benefit at the cost of index maintenance overhead. SQLite can filter the small remaining set in-memory efficiently.
+
+- **Previous note excerpt in document content (`previous_note_excerpt`)** — rejected because it adds a query per note extraction (fetch the preceding note in the same discussion), increases document size, creates coupling between note documents (changing one note's body would stale the next note's document), and the semantic benefit is marginal.
The parent title and labels provide sufficient context. Full thread context is available via the existing discussion documents. + +- **Compact/slim metadata header for note documents** — rejected because the verbose key-value header is intentional. The structured fields (`source_type`, `note_gitlab_id`, `project`, `parent_type`, `parent_iid`, etc.) are what enable precise FTS and embedding search for queries like "jdefting's comments on authentication issues in project-one." The compact format (`@author on Issue#42 in project`) loses machine-parseable structure and reduces search precision. Metadata stored in document columns/labels/paths is not searchable via FTS — only `content_text` is FTS-indexed. The token cost of the header (~50 tokens) is negligible compared to typical note body length. + +- **Replace `fetch_complete: bool` with `FetchState` enum (`Complete`/`Partial`/`Failed`) and run_seen_at monotonicity checks** (feedback-5, rec #2) — rejected because the boolean captures the one bit of information that matters: did the fetch complete? `FetchState::Failed` is redundant with not reaching the sweep call site — if the fetch fails, we don't call sweep at all. The monotonicity check on `run_seen_at` adds complexity for a condition that can't occur in practice: `run_seen_at` is generated once per sync run and passed unchanged through all upserts. The boolean is sufficient and self-documenting. + +- **Embedding dedup cache keyed by semantic text hash** (feedback-5, rec #5) — rejected because the existing `content_hash` dedup already prevents re-embedding unchanged documents. A semantic-text-only hash that ignores metadata would conflate genuinely different review contexts: two "LGTM" notes from different authors on different MRs are semantically distinct for the reviewer profiling use case (who said it, where, and when matters). The embedding pipeline handles ~8k notes comfortably without dedup optimization. 
+ +- **Derived review signal labels (`signal:nit`, `signal:blocking`, `signal:security`)** (feedback-5, rec #6) — rejected because (a) it encroaches on the explicitly excluded reviewer profiling scope, (b) heuristic signal derivation (regex for "nit:", keyword matching for "security") is inherently fragile and would require ongoing maintenance as review vocabulary evolves, and (c) the raw note text already supports downstream LLM-based analysis that produces far more accurate signal classification than static keyword matching. This belongs in the downstream profiling PRD where LLM-based classification can be done properly. diff --git a/docs/user-journeys.md b/docs/user-journeys.md new file mode 100644 index 0000000..9763fb9 --- /dev/null +++ b/docs/user-journeys.md @@ -0,0 +1,541 @@ +# Lore CLI User Journeys + +## Purpose + +Map realistic workflows for both human users and AI agents to identify gaps in the command surface and optimization opportunities. Each journey starts with a **problem** and traces the commands needed to reach a **resolution**. + +--- + +## Part 1: Human User Flows + +### H1. Morning Standup Prep + +**Problem:** "What happened since yesterday? I need to know what moved before standup." + +**Flow:** +``` +lore sync -q # Refresh data (quiet, no noise) +lore issues -s opened --since 1d # Issues that changed overnight +lore mrs -s opened --since 1d # MRs that moved +lore who @me # My current workload snapshot +``` + +**Gap identified:** No single "activity feed" command. User runs 3 queries to get what should be one view. No `--since 1d` shorthand for "since yesterday." No `@me` alias for the authenticated user. + +--- + +### H2. Sprint Planning: What's Ready to Pick Up? + +**Problem:** "We're planning the next sprint. What's open, unassigned, and actionable?" 
+
+**Flow:**
+```
+lore issues -s opened -p myproject # All open issues
+lore issues -s opened -l "ready" # Issues labeled ready
+lore issues -s opened --has-due # Issues with deadlines approaching
+lore count issues -p myproject # How many total?
+```
+
+**Gap identified:** No way to filter by "unassigned" issues (missing `--no-assignee` flag). No way to sort by due date. No way to see priority/weight. Can't combine filters like "opened AND no assignee AND has due date."
+
+---
+
+### H3. Investigating a Production Incident
+
+**Problem:** "Deploy broke prod. I need the full timeline of what changed around the deploy."
+
+**Flow:**
+```
+lore sync -q # Get latest
+lore timeline "deploy" --since 7d # What happened around deploys
+lore search "deploy" --type mr # MRs mentioning deploy
+lore mrs 456 # Inspect the suspicious MR
+lore who --overlap src/deploy/ # Who else touches deploy code
+```
+
+**Gap identified:** Timeline is keyword-based, not event-based. Can't filter by "MRs merged in the last 24 hours" directly. No way to see which MRs were merged between two dates (release diff). Would benefit from `lore mrs -s merged --since 1d`.
+
+---
+
+### H4. Preparing to Review Someone's MR
+
+**Problem:** "I was assigned to review MR !789. I need context before diving in."
+
+**Flow:**
+```
+lore mrs 789 # Read the MR description + discussions
+lore mrs 789 -o # Open in browser for the actual diff
+lore who src/features/auth/ # Who are the experts in this area?
+lore search "auth refactor" --type issue # Related issues for background
+lore timeline "authentication" # History of auth changes
+```
+
+**Gap identified:** No way to see the file list touched by an MR from the CLI (data is stored in `mr_file_changes` but not surfaced). No way to link an MR back to its closing issue(s) from the MR detail view. The cross-reference data exists in `entity_references` but isn't shown in `mrs <iid>` output.
+
+---
+
+### H5.
Onboarding to an Unfamiliar Code Area + +**Problem:** "I'm new to the team and need to understand how the billing module works." + +**Flow:** +``` +lore search "billing" -n 20 # What exists about billing? +lore who src/billing/ # Who knows billing best? +lore timeline "billing" --depth 2 # History of billing changes +lore mrs -s merged -l billing --since 6m # Recent merged billing work +lore issues -s opened -l billing # Outstanding billing issues +``` + +**Gap identified:** No way to get a "module overview" in one command. The search spans issues, MRs, and discussions but doesn't summarize by category. No way to see the most-discussed or most-referenced entities (high-signal items for understanding). + +--- + +### H6. Finding the Right Reviewer for My PR + +**Problem:** "I'm about to submit a PR touching auth and payments. Who should review?" + +**Flow:** +``` +lore who src/features/auth/ # Auth experts +lore who src/features/payments/ # Payment experts +lore who @candidate1 # Check candidate1's workload +lore who @candidate2 # Check candidate2's workload +``` + +**Gap identified:** No way to query multiple paths at once (`lore who src/auth/ src/payments/`). No way to find the intersection of expertise. No workload-aware recommendation ("who knows this AND has bandwidth"). Four separate commands for what should be one decision. + +--- + +### H7. Understanding Why a Feature Was Built This Way + +**Problem:** "This code is weird. Why was it implemented like this? What was the original discussion?" + +**Flow:** +``` +lore search "feature-name rationale" # Search for decision context +lore timeline "feature-name" --depth 2 # Full history with cross-refs +lore issues 234 # Read the original issue +lore mrs 567 # Read the implementation MR +``` + +**Gap identified:** No way to search within a specific issue's or MR's discussion notes. The search covers documents (titles + descriptions) but per-note search isn't available yet (PRD exists). 
No way to navigate "issue 234 was closed by MR 567" without manually knowing both IDs. + +--- + +### H8. Checking Team Workload Before Assigning Work + +**Problem:** "I need to assign this urgent bug. Who has the least on their plate?" + +**Flow:** +``` +lore who @alice # Alice's workload +lore who @bob # Bob's workload +lore who @carol # Carol's workload +lore who @dave # Dave's workload +``` + +**Gap identified:** No team-level workload view. Must query each person individually. No way to list "all assignees and their open issue counts." No concept of a team roster. Would benefit from `lore who --team` or `lore workload`. + +--- + +### H9. Preparing Release Notes + +**Problem:** "We're cutting a release. I need to summarize what's in this version." + +**Flow:** +``` +lore mrs -s merged --since 2w -p myproject # MRs merged since last release +lore issues -s closed --since 2w -p myproject # Issues closed since last release +lore mrs -s merged -l feature --since 2w # Feature MRs specifically +lore mrs -s merged -l bugfix --since 2w # Bugfix MRs +``` + +**Gap identified:** No way to filter by milestone (for version-based releases). Wait -- `issues` has `-m` for milestone but `mrs` does not. No changelog generation. No "what closed between tag A and tag B." No grouping by label for release note categories. + +--- + +### H10. Finding and Closing Stale Issues + +**Problem:** "Our backlog is bloated. Which issues haven't been touched in months?" + +**Flow:** +``` +lore issues -s opened --sort updated --asc -n 50 # Oldest-updated first +# Then manually inspect each one... +lore issues 42 # Is this still relevant? +``` + +**Gap identified:** No `--before` or `--updated-before` filter (only `--since` exists). Can sort ascending but can't filter "not updated in 90 days." No staleness indicator. No bulk operations concept. + +--- + +### H11. Understanding a Bug's Full History + +**Problem:** "Bug #321 keeps getting reopened. I need to understand its entire lifecycle." 
+ +**Flow:** +``` +lore issues 321 # Read the issue +lore timeline "bug-keyword" -p myproject # Try to find timeline events +# But timeline is keyword-based, not entity-based... +``` + +**Gap identified:** No way to get a timeline for a specific entity by IID. `lore timeline` requires a keyword query, not an entity reference. Would benefit from `lore timeline --issue 321` or `lore timeline --mr 456` to get the event history of a specific entity directly. + +--- + +### H12. Identifying Who to Ask About Failing Tests + +**Problem:** "CI tests are failing in `src/lib/parser.rs`. Who last touched this?" + +**Flow:** +``` +lore who src/lib/parser.rs # Expert lookup +lore who --overlap src/lib/parser.rs # Who else has touched it +lore search "parser" --type mr --since 2w # Recent MRs touching parser +``` + +**Gap identified:** Expert mode uses DiffNote analysis (code review comments), not actual file change tracking. The `mr_file_changes` table has the real data but `who` doesn't use it for attribution. Could be much more accurate with file-change-based expertise. + +--- + +### H13. Tracking a Feature Across Multiple MRs + +**Problem:** "The 'dark mode' feature spans 5 MRs. I need to see them all together." + +**Flow:** +``` +lore mrs -l dark-mode # MRs with the label +lore issues -l dark-mode # Related issues +lore timeline "dark mode" --depth 2 # Cross-referenced events +``` + +**Gap identified:** Works reasonably well with labels as the grouping mechanism. But if the team didn't label consistently, there's no way to discover related MRs by content similarity. No "related items" view that combines issues + MRs + discussions for a topic. + +--- + +### H14. Checking if a Similar Fix Was Already Attempted + +**Problem:** "Before I implement this fix, was something similar tried before?" 
+ +**Flow:** +``` +lore search "memory leak connection pool" # Semantic search +lore search "connection pool" --type mr -s all # Wait, no state filter on search +lore mrs -s closed -l bugfix # Closed bugfix MRs (coarse) +lore timeline "connection pool" # Historical context +``` + +**Gap identified:** Search doesn't have a `--state` filter. Can't search only closed/merged items. The semantic search is powerful but can't be combined with entity state. Would benefit from `--state merged` on search to find past attempts. + +--- + +### H15. Reviewing Discussions That Need My Attention + +**Problem:** "Which discussion threads am I involved in that are still unresolved?" + +**Flow:** +``` +lore who --active # All active unresolved discussions +lore who --active --since 30d # Wider window +# But can't filter to "discussions I'm in"... +``` + +**Gap identified:** `--active` shows all unresolved discussions, not filtered by participant. No way to say "show me discussions where @me participated." No notification/mention tracking. No "my unresolved threads" view. + +--- + +## Part 2: AI Agent Flows + +### A1. Context Gathering Before Code Modification + +**Problem:** Agent is about to modify `src/features/auth/session.rs` and needs full context. + +**Flow:** +``` +lore -J health # Pre-flight check +lore -J who src/features/auth/ # Who knows this area +lore -J search "auth session" -n 10 # Related issues/MRs +lore -J mrs -s merged --since 3m -l auth # Recent auth changes +lore -J who --overlap src/features/auth/session.rs # Concurrent work risk +``` + +**Gap identified:** No way to check "are there open MRs touching this file right now?" The overlap mode shows historical touches, not active branches. An agent needs to know about in-flight changes to avoid conflicts. + +--- + +### A2. Auto-Triaging an Incoming Issue + +**Problem:** Agent receives a new issue and needs to categorize it, find related work, and suggest assignees. 
+
+**Flow:**
+```
+lore -J issues 999 # Read the new issue
+lore -J search "$(extract_keywords)" --explain # Find similar past issues
+lore -J who src/affected/path/ # Suggest experts as assignees
+lore -J issues -s opened -l same-label # Check for duplicates
+```
+
+**Gap identified:** No way to get just the description text for programmatic keyword extraction. `issues <iid>` returns full detail including discussions. Agent must parse the full response to extract the description for a secondary search. Would benefit from `--fields description` on detail view. No duplicate detection built in.
+
+---
+
+### A3. Generating Sprint Status Report
+
+**Problem:** Agent needs to produce a weekly status report for the team.
+
+**Flow:**
+```
+lore -J issues -s closed --since 1w --fields minimal # Completed work
+lore -J issues -s opened --status "In progress" # In-flight work
+lore -J mrs -s merged --since 1w --fields minimal # Merged PRs
+lore -J mrs -s opened -D --fields minimal # Open non-draft MRs
+lore -J count issues # Totals
+lore -J count mrs # MR totals
+lore -J who --active --since 1w # Discussions needing attention
+```
+
+**Gap identified:** Seven separate queries for one report. No `lore summary` or `lore report` command. No way to get "issues transitioned from X to Y this week" (state change history exists in events but isn't queryable). No velocity metric (issues closed per week trend).
+
+---
+
+### A4. Finding Relevant Prior Art Before Implementing
+
+**Problem:** Agent is implementing a caching layer and wants to find if similar patterns exist in the codebase's GitLab history.
+
+**Flow:**
+```
+lore -J search "caching" --mode hybrid -n 20 --explain
+lore -J search "cache invalidation" --mode hybrid -n 10
+lore -J search "redis" --mode lexical --type discussion # Exact term in discussions
+lore -J timeline "cache" --since 1y # Wait, max is 1y? Let's try 12m
+```
+
+**Gap identified:** No way to search discussion notes individually (per-note search).
Discussions are aggregated into documents, so individual note-level matches are lost. The `--explain` flag helps but doesn't show which specific note matched. No `--since 1y` or `--since 12m` duration format.
+
+---
+
+### A5. Building Context for PR Description
+
+**Problem:** Agent wrote code and needs to generate a PR description that references relevant issues.
+
+**Flow:**
+```
+lore -J search "feature description keywords" --type issue
+lore -J issues -s opened -l feature-label --fields iid,title,web_url
+# Cross-reference: which issues does this MR close?
+# No command for this -- must manually scan search results
+```
+
+**Gap identified:** No way to query the `entity_references` table directly. Agent can't ask "which issues reference MR !456" or "which issues contain 'closes #123' in their text." The data exists but isn't exposed as a query surface. Would benefit from `lore refs --mr 456` or `lore refs --issue 123`.
+
+---
+
+### A6. Identifying Affected Experts for Review Assignment
+
+**Problem:** Agent needs to automatically assign reviewers based on the files changed in an MR.
+
+**Flow:**
+```
+lore -J mrs 456 # Get MR details
+# Parse file paths from response... but file changes aren't in the output
+lore -J who src/path/from/mr/ # Query each path
+lore -J who src/another/path/ # One at a time...
+lore -J who @candidate --fields minimal # Check workload
+```
+
+**Gap identified:** MR detail view (`mrs <iid>`) doesn't include the file change list from `mr_file_changes`. Agent can't programmatically extract which files an MR touches. Must fall back to GitLab API or guess from description. The `who` command doesn't accept multiple paths. No "auto-reviewer" suggestion combining expertise + availability.
+
+---
+
+### A7. Incident Investigation and Timeline Reconstruction
+
+**Problem:** Agent needs to reconstruct what happened during an outage for a postmortem.
+
+**Flow:**
+```
+lore -J timeline "outage" --since 3d --depth 2 --expand-mentions
+lore -J search "error 500" --since 3d
+lore -J mrs -s merged --since 3d -p production-service
+lore -J issues --status "In progress" -p production-service
+```
+
+**Gap identified:** Timeline is keyword-seeded, which means if the outage wasn't described with that exact term, seeds may miss it. No way to seed a timeline from an entity ID (e.g., "start from issue #321 and expand outward"). No severity/priority filter. No way to correlate with merge times.
+
+---
+
+### A8. Cross-Project Impact Assessment
+
+**Problem:** Agent needs to understand how a breaking API change in project A affects projects B and C.
+
+**Flow:**
+```
+lore -J search "api-endpoint-name" -p project-a
+lore -J search "api-endpoint-name" -p project-b
+lore -J search "api-endpoint-name" -p project-c
+# Or without project filter to search everywhere:
+lore -J search "api-endpoint-name" -n 50
+lore -J timeline "api-endpoint-name" --depth 2
+```
+
+**Gap identified:** Cross-project references in entity_references are tracked but the timeline shows unresolved references for entities not synced locally. No way to see a cross-project dependency map. Search works across projects but doesn't group results by project.
+
+---
+
+### A9. Automated Stale Issue Recommendations
+
+**Problem:** Agent runs weekly to identify issues that should be closed or re-prioritized.
+
+**Flow:**
+```
+lore -J issues -s opened --sort updated --asc -n 100 # Oldest first
+# For each issue, check:
+lore -J issues <iid> # Read details
+lore -J search "<issue title keywords>" # Any recent activity?
+```
+
+**Gap identified:** No `--updated-before` filter, so agent must fetch all and filter client-side. No way to detect "issue has no assignee AND no activity in 90 days." The 100-issue limit means pagination is needed for large backlogs, but there's no cursor/offset pagination -- only `--limit`. Agent must do N+1 queries to inspect each candidate.
+
+---
+
+### A10.
Code Review Preparation (File-Level Context) + +**Problem:** Agent is reviewing MR !789 and needs to understand the history of each changed file. + +**Flow:** +``` +lore -J mrs 789 # Get MR details +# Can't get file list from output... +# Fall back to search by MR title keywords +lore -J search "feature-from-mr" --type mr +lore -J who src/guessed/path/ # Expertise for each file +lore -J who --overlap src/guessed/path/ # Concurrent changes +``` + +**Gap identified:** Same as A6 -- `mr_file_changes` data isn't exposed. Agent is blind to the actual files in the MR unless it parses the description or uses the GitLab API directly. This is the single biggest gap for automated code review workflows. + +--- + +### A11. Building a Knowledge Graph of Entity Relationships + +**Problem:** Agent wants to map how issues, MRs, and discussions are connected for a feature. + +**Flow:** +``` +lore -J search "feature-name" -n 30 +lore -J timeline "feature-name" --depth 2 --max-entities 100 +# Timeline shows expanded entities and cross-refs, but... +# No way to query entity_references directly +# No way to get "all entities that reference issue #123" +``` + +**Gap identified:** The `entity_references` table (closes, related, mentioned) is used internally by timeline but isn't queryable as a standalone command. Agent can't ask "what closes issue #123?" or "what does MR !456 reference?" No graph export. Would enable powerful dependency mapping. + +--- + +### A12. Release Readiness Assessment + +**Problem:** Agent needs to verify all issues in milestone "v2.0" are closed and MRs are merged. + +**Flow:** +``` +lore -J issues -m "v2.0" -s opened # Any open issues in milestone? +lore -J issues -m "v2.0" -s closed # Closed issues +# MRs don't have milestone filter... +lore -J mrs -s opened -l "v2.0" # Try label as proxy +lore -J who --active -p myproject # Unresolved discussions +``` + +**Gap identified:** MRs don't have a `--milestone` filter (issues do). 
No way to check "all MRs linked to issues in milestone v2.0" -- would require joining `entity_references` with issue milestone. No release checklist concept. No way to verify "every issue in this milestone has a closing MR." + +--- + +### A13. Answering "What Changed?" Between Two Points + +**Problem:** Agent needs to diff project state between two dates for a stakeholder report. + +**Flow:** +``` +lore -J issues -s closed --since 2w --fields minimal # Recently closed +lore -J issues -s opened --since 2w --fields minimal # Recently opened +lore -J mrs -s merged --since 2w --fields minimal # Recently merged +# But no way to get "issues that CHANGED STATE" in a window +# An issue opened 3 months ago but closed yesterday won't appear in --since 2w for issues -s opened +``` + +**Gap identified:** `--since` filters by `updated_at`, not by "state changed at." An issue closed yesterday but created 6 months ago would appear in `issues -s closed --since 1d` (because updated_at changed), but the semantics are subtle. No explicit "state transitions in time window" query. The resource_state_events table has this data but it's not exposed as a filter. + +--- + +### A14. Meeting Prep: Summarize Recent Activity for a Stakeholder + +**Problem:** Agent needs to prepare a 2-minute summary for a project sponsor meeting. + +**Flow:** +``` +lore -J count issues -p project # Current totals +lore -J count mrs -p project # MR totals +lore -J issues -s closed --since 1w -p project --fields minimal +lore -J mrs -s merged --since 1w -p project --fields minimal +lore -J issues -s opened --status "In progress" -p project +lore -J who --active -p project --since 1w +``` + +**Gap identified:** Six queries, same as A3. No summary/dashboard command. Agent must synthesize all responses. No trend data (is the open issue count growing or shrinking?). No "highlights" extraction. + +--- + +### A15. 
Determining If Work Is Safe to Start (Conflict Detection) + +**Problem:** Agent is about to start work on an issue and needs to check nobody else is already working on it. + +**Flow:** +``` +lore -J issues 123 # Read the issue +# Check assignees from response +lore -J mrs -s opened -A other-person # Are they working on related MRs? +lore -J who --overlap src/target/path/ # Anyone actively touching these files? +lore -J search "issue-123-keywords" --type mr -s opened # Wait, search has no --state +``` + +**Gap identified:** No way to check "is there an open MR that closes issue #123?" -- the entity_references data exists but isn't queryable. Search doesn't support `--state` filter. No "conflict detection" or "in-flight work" check. Agent must do multiple queries and manually correlate. + +--- + +## Part 3: Gap Summary + +### Critical Gaps (high impact, blocks common workflows) + +| # | Gap | Affected Flows | Suggested Command/Flag | +|---|-----|----------------|----------------------| +| 1 | **MR file changes not surfaced** | H4, A6, A10 | `lore mrs --files` or include in detail view | +| 2 | **Entity references not queryable** | H7, A5, A11, A15 | `lore refs --issue 123` / `lore refs --mr 456` | +| 3 | **Per-note search missing** | H7, A4 | `lore search --granularity note` (PRD exists) | +| 4 | **No entity-based timeline** | H11, A7 | `lore timeline --issue 321` / `lore timeline --mr 456` | +| 5 | **No @me / current-user alias** | H1, H15 | Resolve from auth token automatically | + +### Important Gaps (significant friction, multiple workarounds needed) + +| # | Gap | Affected Flows | Suggested Command/Flag | +|---|-----|----------------|----------------------| +| 6 | **No activity feed / summary** | H1, A3, A14 | `lore activity --since 1d` or `lore summary` | +| 7 | **No multi-path who query** | H6, A6 | `lore who src/path1/ src/path2/` | +| 8 | **No --state filter on search** | H14, A15 | `lore search --state merged` | +| 9 | **MRs missing --milestone filter** | 
H9, A12 | `lore mrs -m "v2.0"` | +| 10 | **No --no-assignee / --unassigned** | H2 | `lore issues --no-assignee` | +| 11 | **No --updated-before filter** | H10, A9 | `lore issues --before 90d` or `--stale 90d` | +| 12 | **No team workload view** | H8 | `lore who --team` or `lore workload` | + +### Nice-to-Have Gaps (would improve agent efficiency) + +| # | Gap | Affected Flows | Suggested Command/Flag | +|---|-----|----------------|----------------------| +| 13 | **No pagination/offset** | A9 | `--offset 100` for large result sets | +| 14 | **No detail --fields on show** | A2 | `lore issues 999 --fields description` | +| 15 | **No cross-project grouping** | A8 | `lore search --group-by project` | +| 16 | **No trend/velocity metrics** | A3, A14 | `lore trends issues --period week` | +| 17 | **No --for-issue on mrs** | A12, A15 | `lore mrs --closes 123` (query entity_refs) | +| 18 | **1y/12m duration not supported** | A4 | Support `1y`, `12m`, `365d` in --since | +| 19 | **No discussion participant filter** | H15 | `lore who --active --participant @me` | +| 20 | **No sort by due date** | H2 | `lore issues --sort due` | diff --git a/plans/tui-prd-v2-frankentui.feedback-10.md b/plans/tui-prd-v2-frankentui.feedback-10.md new file mode 100644 index 0000000..f56ec26 --- /dev/null +++ b/plans/tui-prd-v2-frankentui.feedback-10.md @@ -0,0 +1,214 @@ +I found 9 high-impact revisions that materially improve correctness, robustness, and usability without reintroducing anything in `## Rejected Recommendations`. + +### 1. Prevent stale async overwrites on **all** screens (not just search) +Right now, only `SearchExecuted` is generation-guarded. `IssueListLoaded`, `MrListLoaded`, `IssueDetailLoaded`, etc. can still race and overwrite newer state after rapid navigation/filtering. This is the biggest correctness risk in the current design. 
+ +```diff +diff --git a/PRD.md b/PRD.md +@@ message.rs +- IssueListLoaded(Vec), ++ IssueListLoaded { generation: u64, rows: Vec }, +@@ +- MrListLoaded(Vec), ++ MrListLoaded { generation: u64, rows: Vec }, +@@ +- IssueDetailLoaded { key: EntityKey, detail: IssueDetail }, +- MrDetailLoaded { key: EntityKey, detail: MrDetail }, ++ IssueDetailLoaded { generation: u64, key: EntityKey, detail: IssueDetail }, ++ MrDetailLoaded { generation: u64, key: EntityKey, detail: MrDetail }, + +@@ update() +- Msg::IssueListLoaded(result) => { ++ Msg::IssueListLoaded { generation, rows } => { ++ if !self.task_supervisor.is_current(&TaskKey::LoadScreen(Screen::IssueList), generation) { ++ return Cmd::none(); ++ } + self.state.set_loading(false); +- self.state.issue_list.set_result(result); ++ self.state.issue_list.set_result(rows); + Cmd::none() + } +``` + +### 2. Make cancellation safe with task-owned SQLite interrupt handles +The plan mentions `sqlite3_interrupt()` but uses pooled shared reader connections. Interrupting a shared connection can cancel unrelated work. Use per-task reader leases and store `InterruptHandle` in `TaskHandle`. + +```diff +diff --git a/PRD.md b/PRD.md +@@ DbManager +- readers: Vec>, ++ readers: Vec>, ++ // task-scoped interrupt handles prevent cross-task cancellation bleed ++ // each dispatched query receives an owned ReaderLease + ++pub struct ReaderLease { ++ conn: Connection, ++ interrupt: rusqlite::InterruptHandle, ++} ++ ++impl DbManager { ++ pub fn lease_reader(&self) -> Result { ... } ++} + +@@ TaskHandle + pub struct TaskHandle { + pub key: TaskKey, + pub generation: u64, + pub cancel: Arc, ++ pub interrupt: Option, + } + +@@ cancellation +-Query interruption: ... fires sqlite3_interrupt() on the connection. ++Query interruption: cancel triggers the task's owned InterruptHandle only. ++No shared-connection interrupt is permitted. +``` + +### 3. 
Harden keyset pagination for multi-project and sort changes
+`updated_at + iid` cursor is not enough when rows share timestamps across projects or sort mode changes. This can duplicate/skip rows.
+
+```diff
+diff --git a/PRD.md b/PRD.md
+@@ issue_list.rs
+-pub struct IssueCursor {
+- pub updated_at: i64,
+- pub iid: i64,
+-}
++pub struct IssueCursor {
++ pub sort_field: SortField,
++ pub sort_order: SortOrder,
++ pub updated_at: Option<i64>,
++ pub created_at: Option<i64>,
++ pub iid: i64,
++ pub project_id: i64, // deterministic tie-breaker
++ pub filter_hash: u64, // invalidates stale cursors on filter mutation
++}
+
+@@ pagination section
+-Windowed keyset pagination ...
++Windowed keyset pagination uses deterministic tuple ordering:
++`ORDER BY <sort_field> <sort_order>, project_id, iid`.
++Cursor is rejected if `filter_hash` or sort tuple mismatches current query.
+```
+
+### 4. Replace ad-hoc filter parsing with a small typed DSL
+Current `split_whitespace()` parser is brittle and silently lossy. Add quoted values, negation, and strict parse errors.
+
+```diff
+diff --git a/PRD.md b/PRD.md
+@@ filter_bar.rs
+- fn parse_tokens(&mut self) {
+- let text = self.input.value().to_string();
+- self.tokens = text.split_whitespace().map(|chunk| { ... }).collect();
+- }
++ fn parse_tokens(&mut self) {
++ // grammar (v1):
++ // term := [ "-" ] (field ":" value | quoted_text | bare_text)
++ // value := quoted | unquoted
++ // examples:
++ // state:opened label:"P1 blocker" -author:bot since:14d
++ self.tokens = filter_dsl::parse(self.input.value())?;
++ }
+
+@@ section 8 / keybindings-help
++Filter parser surfaces actionable inline diagnostics with cursor position,
++and never silently drops unknown fields.
+```
+
+### 5. Add render caches for markdown/tree shaping
+Markdown and tree shaping are currently recomputed on every frame in several snippets. Cache render artifacts by `(entity, width, theme, content_hash)` to protect frame time. 
+ +```diff +diff --git a/PRD.md b/PRD.md +@@ module structure ++ render_cache.rs # Width/theme/content-hash keyed cache for markdown + tree layouts + +@@ Assumptions / Performance ++Detail and search preview rendering uses memoized render artifacts. ++Cache invalidation triggers: content hash change, terminal width change, theme change. +``` + +### 6. Use one-shot timers for debounce/prefix timeout +`Every` is periodic; it wakes repeatedly and can produce edge-case repeated firings. One-shot subscriptions are cleaner and cheaper. + +```diff +diff --git a/PRD.md b/PRD.md +@@ subscriptions() +- if self.state.search.debounce_pending() { +- subs.push(Box::new( +- Every::with_id(3, Duration::from_millis(200), move || { +- Msg::SearchDebounceFired { generation } +- }) +- )); +- } ++ if self.state.search.debounce_pending() { ++ subs.push(Box::new( ++ After::with_id(3, Duration::from_millis(200), move || { ++ Msg::SearchDebounceFired { generation } ++ }) ++ )); ++ } + +@@ InputMode GoPrefix timeout +-The tick subscription compares clock instant... ++GoPrefix timeout is a one-shot `After(500ms)` tied to prefix generation. +``` + +### 7. New feature: list “Quick Peek” panel (`Space`) for triage speed +This adds immediate value without v2-level scope. Users can inspect selected issue/MR metadata/snippet without entering detail and coming back. + +```diff +diff --git a/PRD.md b/PRD.md +@@ 5.2 Issue List +-Interaction: Enter detail ++Interaction: Enter detail, Space quick-peek (toggle right preview pane) + +@@ 5.4 MR List ++Quick Peek mode mirrors Issue List: metadata + first discussion snippet + cross-refs. + +@@ 8.2 List Screens + | `Enter` | Open selected item | ++| `Space` | Toggle Quick Peek panel for selected row | +``` + +### 8. Upgrade compatibility handshake from integer to machine-readable contract +Single integer compat is too coarse for real drift detection. Keep it simple but structured. 
+ +```diff +diff --git a/PRD.md b/PRD.md +@@ Nightly Rust Strategy / Compatibility contract +- 1. Binary compat version (`lore-tui --compat-version`) — integer check ... ++ 1. Binary compat contract (`lore-tui --compat-json`) — JSON: ++ `{ "protocol": 1, "compat_version": 2, "min_schema": 14, "max_schema": 16, "build": "..." }` ++ `lore` validates protocol + compat + schema range before spawn. + +@@ CLI integration +-fn validate_tui_compat(...) { ... --compat-version ... } ++fn validate_tui_compat(...) { ... --compat-json ... } +``` + +### 9. Fix sync stream bug and formalize progress coalescing +The current snippet calls `try_send` for progress twice in one callback path and depth math is wrong. Also progress spam should be coalesced by lane. + +```diff +diff --git a/PRD.md b/PRD.md +@@ start_sync_task() +- let current_depth = 2048 - tx.try_send(Msg::SyncProgress(event.clone())) +- .err().map_or(0, |_| 1); +- max_queue_depth = max_queue_depth.max(current_depth); +- if tx.try_send(Msg::SyncProgress(event.clone())).is_err() { ++ // coalesce by lane key at <=30Hz; one send attempt per flush ++ coalescer.update(event.clone()); ++ if let Some(batch) = coalescer.flush_ready() { ++ if tx.try_send(Msg::SyncProgressBatch(batch)).is_err() { + dropped_count += 1; + let _ = tx.try_send(Msg::SyncBackpressureDrop); ++ } else { ++ max_queue_depth = max_queue_depth.max(observed_queue_depth()); ++ } + } +``` + +--- + +If you want, I can produce a single consolidated patch-style rewrite of Sections `4.x`, `5.2/5.4`, `8.2`, `9.3`, and `10.x` so you can drop it directly into iteration 10. \ No newline at end of file diff --git a/plans/tui-prd-v2-frankentui.feedback-11.md b/plans/tui-prd-v2-frankentui.feedback-11.md new file mode 100644 index 0000000..dd0f381 --- /dev/null +++ b/plans/tui-prd-v2-frankentui.feedback-11.md @@ -0,0 +1,177 @@ +I reviewed the full PRD and avoided everything listed under `## Rejected Recommendations`. +These are the highest-impact revisions I’d make. + +1. 
Stable list pagination via snapshot fences +Why this improves the plan: your keyset cursor is deterministic for sort/filter, but still vulnerable to duplicates/skips if sync writes land between page fetches. Add a per-browse snapshot fence so one browse session sees a stable dataset. +Tradeoff: newest rows are hidden until refresh, which is correct for deterministic triage. + +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 5.2 Issue List +- **Pagination:** Windowed keyset pagination with explicit cursor state. ++ **Pagination:** Windowed keyset pagination with explicit cursor state. ++ **Snapshot fence:** On list entry, capture `snapshot_upper_updated_at` (ms) and pin all ++ list-page queries to `updated_at <= snapshot_upper_updated_at`. This guarantees no duplicate ++ or skipped rows during scrolling even if sync writes occur concurrently. ++ A "new data available" badge appears when a newer sync completes; `r` refreshes the fence. + +@@ 5.4 MR List +- **Pagination:** Same windowed keyset pagination strategy as Issue List. ++ **Pagination:** Same strategy plus snapshot fence (`updated_at <= snapshot_upper_updated_at`) ++ for deterministic cross-page traversal under concurrent sync writes. + +@@ 4.7 Navigation Stack Implementation ++ Browsing sessions carry a per-screen `BrowseSnapshot` token to preserve stable ordering ++ until explicit refresh or screen re-entry. +``` + +2. Query budgets and soft deadlines +Why this improves the plan: currently “slow query” is handled mostly by cancellation and stale-drop. Add explicit latency budgets so UI responsiveness stays predictable under worst-case filters. +Tradeoff: sometimes user gets partial/truncated results first, followed by full results on retry/refine. 
+ +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 4.5 Async Action System ++ #### 4.5.2 Query Budgets and Soft Deadlines ++ Each query type gets a budget: ++ - list window fetch: 120ms target, 250ms hard deadline ++ - detail phase-1 metadata: 75ms target, 150ms hard deadline ++ - search lexical/hybrid: 250ms hard deadline ++ On hard deadline breach, return `QueryDegraded { truncated: true }` and show inline badge: ++ "results truncated; refine filter or press r to retry full". ++ Implementation uses SQLite progress handler + per-task interrupt deadline. + +@@ 9.3 Phase 0 — Toolchain Gate ++ 26. Query deadline behavior validated: hard deadline cancels query and renders degraded badge ++ without blocking input loop. +``` + +3. Targeted cache invalidation and prewarm after sync +Why this improves the plan: `invalidate_all()` after sync throws away hot detail cache and hurts the exact post-sync workflow you optimized for. Invalidate only changed keys and prewarm likely-next entities. +Tradeoff: slightly more bookkeeping in sync result handling. + +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 4.1 Module Structure +- entity_cache.rs # Bounded LRU cache ... Invalidated on sync completion. ++ entity_cache.rs # Bounded LRU cache with selective invalidation by changed EntityKey ++ # and optional post-sync prewarm of top changed entities. + +@@ 4.4 App — Implementing the Model Trait (Msg::SyncCompleted) +- // Invalidate entity cache — synced data may have changed. +- self.entity_cache.invalidate_all(); ++ // Selective invalidation: evict only changed entities from sync delta. ++ self.entity_cache.invalidate_keys(&result.changed_entity_keys); ++ // Prewarm top N changed/new entities for immediate post-sync triage. ++ self.enqueue_cache_prewarm(&result.changed_entity_keys); +``` + +4. 
Exact “what changed” navigation without new DB tables +Why this improves the plan: your summary currently uses timestamp filter; this can include unrelated updates and miss edge cases. Keep an in-memory delta ledger per sync run and navigate by exact IDs. +Tradeoff: small memory overhead per run; no schema migration required. + +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 5.9 Sync (Summary mode) +-- `i` navigates to Issue List pre-filtered to "since last sync" (using `sync_status.last_completed_at` timestamp comparison) +-- `m` navigates to MR List pre-filtered to "since last sync" (using `sync_status.last_completed_at` timestamp comparison) ++- `i` navigates to Issue List filtered by exact issue IDs changed in this sync run ++- `m` navigates to MR List filtered by exact MR IDs changed in this sync run ++ (fallback to timestamp filter only if run delta not available) + +@@ 10.1 New Files ++crates/lore-tui/src/sync_delta_ledger.rs # In-memory per-run exact changed/new IDs (issues/MRs/discussions) +``` + +5. Adaptive render governor (runtime performance safety) +Why this improves the plan: capability detection is static; you also need dynamic adaptation when frame time/backpressure worsens (SSH, tmux nesting, huge logs). +Tradeoff: visual richness may step down automatically under load. + +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 3.4.1 Capability-Adaptive Rendering ++#### 3.4.2 Adaptive Render Governor ++Runtime monitors frame time and stream pressure: ++- if frame p95 > 40ms or sync drops spike, switch to lighter profile: ++ plain markdown, reduced tree guides, slower spinner tick, less frequent repaint. ++- when stable for N seconds, restore previous profile. ++CLI override: ++`lore tui --render-profile=auto|quality|balanced|speed` + +@@ 9.3 Phase 0 — Toolchain Gate ++27. 
Frame-time governor validated: under induced load, UI remains responsive and input latency
++stays within p95 < 75ms while auto-downgrading render profile.
+```
+
+6. First-run/data-not-ready screen (not an init wizard)
+Why this improves the plan: empty DB or missing indexes will otherwise feel broken. A dedicated read-only readiness screen improves first impression and self-recovery.
+Tradeoff: one extra lightweight screen/state.
+
+```diff
+diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md
+@@ 4.3 Core Types (Screen enum)
+ Sync,
+ Stats,
+ Doctor,
++ Bootstrap,
+
+@@ 5.11 Doctor / Stats (Info Screens)
++### 5.12 Bootstrap (Data Readiness)
++Shown when no synced projects/documents are present or required indexes are missing.
++Displays concise readiness checks and exact CLI commands to recover:
++`lore sync`, `lore migrate`, `lore --robot doctor`.
++Read-only; no auto-execution.
+```
+
+7. Global project scope pinning across screens
+Why this improves the plan: users repeatedly apply the same project filter across dashboard/list/search/timeline/who. Add a global scope pin to reduce repetitive filtering and speed triage.
+Tradeoff: must show clear “scope active” indicator to avoid confusion.
+
+```diff
+diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md
+@@ 4.1 Module Structure
++ scope.rs # Global project scope context (all-projects or pinned project set)
+
+@@ 8.1 Global (Available Everywhere)
++| `P` | Open project scope picker / toggle global scope pin |
+
+@@ 4.10 State Module — Complete
++pub global_scope: ScopeContext, // Applies to dashboard/list/search/timeline/who queries
+
+@@ 10.11 Action Module — Query Bridge
+- pub fn fetch_issues(conn: &Connection, filter: &IssueFilter) -> Result<Vec<IssueRow>, LoreError>
++ pub fn fetch_issues(conn: &Connection, scope: &ScopeContext, filter: &IssueFilter) -> Result<Vec<IssueRow>, LoreError>
+```
+
+8. 
Concurrency correctness tests for pagination and cancellation races +Why this improves the plan: current reliability tests are good, but missing a direct test for duplicate/skip behavior under concurrent sync writes while paginating. +Tradeoff: additional integration test complexity. + +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 9.2 Phases (Phase 5.5 — Reliability Test Pack) ++ Concurrent pagination/write race tests :p55j, after p55h, 1d ++ Query deadline cancellation race tests :p55k, after p55j, 0.5d + +@@ 9.3 Phase 0 — Toolchain Gate ++28. Concurrent pagination/write test proves no duplicates/skips within a pinned browse snapshot. ++29. Cancellation race test proves no cross-task interrupt bleed and no stuck loading state. +``` + +9. URL opening policy v2: allowlisted GitLab entity paths +Why this improves the plan: host validation is necessary but not always sufficient. Restrict default browser opens to known GitLab entity paths and require confirmation for unusual paths on same host. +Tradeoff: occasional extra prompt for uncommon but valid URLs. + +```diff +diff --git a/docs/plans/gitlore-tui-prd-v2.md b/docs/plans/gitlore-tui-prd-v2.md +@@ 3.1 Risk Matrix +-| Malicious URL in entity data opened in browser | Medium | Low | URL host validated against configured GitLab instance before `open`/`xdg-open` | ++| Malicious URL in entity data opened in browser | Medium | Low | Validate scheme+host+port and path pattern allowlist (`/-/issues/`, `/-/merge_requests/`, project issue/MR routes). Unknown same-host paths require explicit confirm modal. 
| + +@@ 10.4.1 Terminal Safety — Untrusted Text Sanitization +- pub fn is_safe_url(url: &str, allowed_origins: &[AllowedOrigin]) -> bool ++ pub fn classify_safe_url(url: &str, policy: &UrlPolicy) -> UrlSafety ++ // UrlSafety::{AllowedEntityPath, AllowedButUnrecognizedPath, Blocked} +``` + +These 9 changes are additive, avoid previously rejected ideas, and materially improve determinism, responsiveness, post-sync usefulness, and safety without forcing a big architecture reset. \ No newline at end of file diff --git a/plans/tui-prd-v2-frankentui.md b/plans/tui-prd-v2-frankentui.md index bded6e3..25a3c74 100644 --- a/plans/tui-prd-v2-frankentui.md +++ b/plans/tui-prd-v2-frankentui.md @@ -2,12 +2,12 @@ plan: true title: "Gitlore TUI PRD v2 - FrankenTUI" status: iterating -iteration: 9 +iteration: 10 target_iterations: 10 beads_revision: 0 related_plans: [] created: 2026-02-11 -updated: 2026-02-11 +updated: 2026-02-12 --- # Gitlore TUI — Product Requirements Document @@ -288,7 +288,9 @@ crates/lore-tui/src/ safety.rs # sanitize_for_terminal(), safe_url_policy() redact.rs # redact_sensitive(): strip tokens, Authorization headers, and credential patterns from logs and crash reports before persisting to disk session.rs # Versioned session state persistence + corruption quarantine - entity_cache.rs # Bounded LRU cache for detail payloads (IssueDetail, MrDetail). Keyed by EntityKey. Invalidated on sync completion. Enables near-instant reopen during Enter/Esc drill-in/out workflows without re-querying. + scope.rs # Global project scope context: all-projects or pinned project set. Applied to dashboard/list/search/timeline/who queries. Persisted in session state. + entity_cache.rs # Bounded LRU cache for detail payloads (IssueDetail, MrDetail). Keyed by EntityKey. Selective invalidation by changed EntityKey set on sync completion (not blanket invalidate_all). Optional post-sync prewarm of top changed entities for immediate triage. 
Enables near-instant reopen during Enter/Esc drill-in/out workflows without re-querying. + render_cache.rs # Width/theme/content-hash keyed cache for expensive render artifacts (markdown → styled text, discussion tree shaping). Invalidation triggers: content hash change, terminal width change, theme change. Prevents per-frame recomputation of markdown parsing and tree layout. crash_context.rs # Ring buffer of last 2000 normalized events + current screen/task snapshot for crash diagnostics. Captured by panic hook for post-mortem debugging. ``` @@ -359,20 +361,24 @@ pub enum Msg { CommandPaletteSelect(usize), // Issue list - IssueListLoaded(Vec), + /// Generation-guarded: stale results from superseded filter/nav are dropped. + IssueListLoaded { generation: u64, rows: Vec }, IssueListFilterChanged(IssueFilter), IssueListSortChanged(SortField, SortOrder), IssueSelected(EntityKey), // MR list - MrListLoaded(Vec), + /// Generation-guarded: stale results from superseded filter/nav are dropped. + MrListLoaded { generation: u64, rows: Vec }, MrListFilterChanged(MrFilter), MrSelected(EntityKey), // Detail views - IssueDetailLoaded { key: EntityKey, detail: IssueDetail }, - MrDetailLoaded { key: EntityKey, detail: MrDetail }, - DiscussionsLoaded(Vec), + /// Generation-guarded: prevents stale detail overwrites after rapid navigation. + IssueDetailLoaded { generation: u64, key: EntityKey, detail: IssueDetail }, + /// Generation-guarded: prevents stale detail overwrites after rapid navigation. + MrDetailLoaded { generation: u64, key: EntityKey, detail: MrDetail }, + DiscussionsLoaded { generation: u64, discussions: Vec }, // Search SearchQueryChanged(String), @@ -395,6 +401,9 @@ pub enum Msg { // Sync SyncStarted, SyncProgress(ProgressEvent), + /// Coalesced batch of progress events (one per lane key). + /// Reduces render pressure by batching at <=30Hz per lane. 
+ SyncProgressBatch(Vec), SyncLogLine(String), SyncBackpressureDrop, SyncCompleted(SyncResult), @@ -454,6 +463,7 @@ pub enum Screen { Sync, Stats, Doctor, + Bootstrap, } /// Composite key for entity identity across multi-project datasets. @@ -553,7 +563,7 @@ impl Default for InputMode { // crates/lore-tui/src/app.rs use ftui_runtime::program::{Model, Cmd, TaskSpec}; -use ftui_runtime::subscription::{Subscription, Every}; +use ftui_runtime::subscription::{Subscription, Every, After}; use ftui_core::event::{Event, KeyEvent, KeyCode, KeyEventKind, Modifiers}; use ftui_render::frame::Frame; use rusqlite::Connection; @@ -626,6 +636,20 @@ pub struct DbManager { next_reader: AtomicUsize, } +/// A task-scoped reader lease that owns an interrupt handle for safe cancellation. +/// Unlike interrupting a shared pooled connection (which can cancel unrelated work), +/// each dispatched query receives its own ReaderLease. The InterruptHandle stored in +/// TaskHandle targets only this lease's connection, preventing cross-task cancellation bleed. +pub struct ReaderLease<'a> { + conn: std::sync::MutexGuard<'a, Connection>, + /// Owned interrupt handle — safe to fire without affecting other tasks. + pub interrupt: rusqlite::InterruptHandle, +} + +impl<'a> ReaderLease<'a> { + pub fn conn(&self) -> &Connection { &self.conn } +} + impl DbManager { pub fn new(db_path: &Path, reader_count: usize) -> Result { let mut readers = Vec::with_capacity(reader_count); @@ -663,6 +687,19 @@ impl DbManager { .map_err(|e| LoreError::Internal(format!("writer lock poisoned: {e}")))?; f(&conn) } + + /// Lease a reader connection with a task-owned interrupt handle. + /// The returned `ReaderLease` holds the mutex guard and provides + /// an `InterruptHandle` that can be stored in `TaskHandle` for + /// safe per-task cancellation. This prevents cross-task interrupt bleed + /// that would occur with shared-connection `sqlite3_interrupt()`. 
+ pub fn lease_reader(&self) -> Result, LoreError> { + let idx = self.next_reader.fetch_add(1, Ordering::Relaxed) % self.readers.len(); + let conn = self.readers[idx].lock() + .map_err(|e| LoreError::Internal(format!("reader lock poisoned: {e}")))?; + let interrupt = conn.get_interrupt_handle(); + Ok(ReaderLease { conn, interrupt }) + } } impl LoreApp { @@ -786,9 +823,11 @@ impl LoreApp { }), Screen::IssueList => { let filter = self.state.issue_list.current_filter(); + let handle = self.task_supervisor.submit(TaskKey::LoadScreen(Screen::IssueList)); + let generation = handle.generation; Cmd::task(move || { match db.with_reader(|conn| crate::tui::action::fetch_issues(conn, &filter)) { - Ok(result) => Msg::IssueListLoaded(result), + Ok(rows) => Msg::IssueListLoaded { generation, rows }, Err(e) => Msg::Error(AppError::Internal(e.to_string())), } }) @@ -797,21 +836,26 @@ impl LoreApp { // Check entity cache first — enables near-instant reopen // during Enter/Esc drill-in/out workflows. if let Some(cached) = self.entity_cache.get_issue(key) { - return Cmd::msg(Msg::IssueDetailLoaded { key: key.clone(), detail: cached.clone() }); + let handle = self.task_supervisor.submit(TaskKey::LoadScreen(Screen::IssueDetail(key.clone()))); + return Cmd::msg(Msg::IssueDetailLoaded { generation: handle.generation, key: key.clone(), detail: cached.clone() }); } + let handle = self.task_supervisor.submit(TaskKey::LoadScreen(Screen::IssueDetail(key.clone()))); + let generation = handle.generation; let key = key.clone(); Cmd::task(move || { match db.with_reader(|conn| crate::tui::action::fetch_issue_detail(conn, &key)) { - Ok(detail) => Msg::IssueDetailLoaded { key, detail }, + Ok(detail) => Msg::IssueDetailLoaded { generation, key, detail }, Err(e) => Msg::Error(AppError::Internal(e.to_string())), } }) } Screen::MrList => { let filter = self.state.mr_list.current_filter(); + let handle = self.task_supervisor.submit(TaskKey::LoadScreen(Screen::MrList)); + let generation = 
handle.generation; Cmd::task(move || { match db.with_reader(|conn| crate::tui::action::fetch_mrs(conn, &filter)) { - Ok(result) => Msg::MrListLoaded(result), + Ok(rows) => Msg::MrListLoaded { generation, rows }, Err(e) => Msg::Error(AppError::Internal(e.to_string())), } }) @@ -819,12 +863,15 @@ impl LoreApp { Screen::MrDetail(key) => { // Check entity cache first if let Some(cached) = self.entity_cache.get_mr(key) { - return Cmd::msg(Msg::MrDetailLoaded { key: key.clone(), detail: cached.clone() }); + let handle = self.task_supervisor.submit(TaskKey::LoadScreen(Screen::MrDetail(key.clone()))); + return Cmd::msg(Msg::MrDetailLoaded { generation: handle.generation, key: key.clone(), detail: cached.clone() }); } + let handle = self.task_supervisor.submit(TaskKey::LoadScreen(Screen::MrDetail(key.clone()))); + let generation = handle.generation; let key = key.clone(); Cmd::task(move || { match db.with_reader(|conn| crate::tui::action::fetch_mr_detail(conn, &key)) { - Ok(detail) => Msg::MrDetailLoaded { key, detail }, + Ok(detail) => Msg::MrDetailLoaded { generation, key, detail }, Err(e) => Msg::Error(AppError::Internal(e.to_string())), } }) @@ -895,9 +942,11 @@ impl LoreApp { Screen::IssueList => { let filter = self.state.issue_list.current_filter(); let db = Arc::clone(&self.db); + let handle = self.task_supervisor.submit(TaskKey::FilterRequery(Screen::IssueList)); + let generation = handle.generation; Cmd::task(move || { match db.with_reader(|conn| crate::tui::action::fetch_issues(conn, &filter)) { - Ok(result) => Msg::IssueListLoaded(result), + Ok(rows) => Msg::IssueListLoaded { generation, rows }, Err(e) => Msg::Error(AppError::Internal(e.to_string())), } }) @@ -905,9 +954,11 @@ impl LoreApp { Screen::MrList => { let filter = self.state.mr_list.current_filter(); let db = Arc::clone(&self.db); + let handle = self.task_supervisor.submit(TaskKey::FilterRequery(Screen::MrList)); + let generation = handle.generation; Cmd::task(move || { match db.with_reader(|conn| 
crate::tui::action::fetch_mrs(conn, &filter)) { - Ok(result) => Msg::MrListLoaded(result), + Ok(rows) => Msg::MrListLoaded { generation, rows }, Err(e) => Msg::Error(AppError::Internal(e.to_string())), } }) @@ -961,15 +1012,18 @@ impl LoreApp { if cancel_token.load(std::sync::atomic::Ordering::Relaxed) { return; // Early exit — orchestrator handles partial state } - // Track queue depth for stream stats - let current_depth = 2048 - tx.try_send(Msg::SyncProgress(event.clone())) - .err().map_or(0, |_| 1); - max_queue_depth = max_queue_depth.max(current_depth); - if tx.try_send(Msg::SyncProgress(event.clone())).is_err() { - // Channel full — drop this progress update rather than - // blocking the sync thread. Track for stats. - dropped_count += 1; - let _ = tx.try_send(Msg::SyncBackpressureDrop); + // Coalesce progress events by lane key at <=30Hz to reduce + // render pressure. Each lane (project x resource_type) keeps + // only its latest progress snapshot. The coalescer flushes + // a batch when 33ms have elapsed since last flush. + coalescer.update(event.clone()); + if let Some(batch) = coalescer.flush_ready() { + if tx.try_send(Msg::SyncProgressBatch(batch)).is_err() { + // Channel full — drop this batch rather than + // blocking the sync thread. Track for stats. 
+ dropped_count += 1; + let _ = tx.try_send(Msg::SyncBackpressureDrop); + } } let _ = tx.try_send(Msg::SyncLogLine(format!("{event:?}"))); }, @@ -1143,23 +1197,35 @@ impl Model for LoreApp { self.state.dashboard.update(data); Cmd::none() } - Msg::IssueListLoaded(result) => { + Msg::IssueListLoaded { generation, rows } => { + if !self.task_supervisor.is_current(&TaskKey::LoadScreen(Screen::IssueList), generation) { + return Cmd::none(); // Stale — superseded by newer nav/filter + } self.state.set_loading(false); - self.state.issue_list.set_result(result); + self.state.issue_list.set_result(rows); Cmd::none() } - Msg::IssueDetailLoaded { key, detail } => { + Msg::IssueDetailLoaded { generation, key, detail } => { + if !self.task_supervisor.is_current(&TaskKey::LoadScreen(Screen::IssueDetail(key.clone())), generation) { + return Cmd::none(); // Stale — user navigated away + } self.state.set_loading(false); self.entity_cache.put_issue(key, detail.clone()); self.state.issue_detail.set(detail); Cmd::none() } - Msg::MrListLoaded(result) => { + Msg::MrListLoaded { generation, rows } => { + if !self.task_supervisor.is_current(&TaskKey::LoadScreen(Screen::MrList), generation) { + return Cmd::none(); // Stale — superseded by newer nav/filter + } self.state.set_loading(false); - self.state.mr_list.set_result(result); + self.state.mr_list.set_result(rows); Cmd::none() } - Msg::MrDetailLoaded { key, detail } => { + Msg::MrDetailLoaded { generation, key, detail } => { + if !self.task_supervisor.is_current(&TaskKey::LoadScreen(Screen::MrDetail(key.clone())), generation) { + return Cmd::none(); // Stale — user navigated away + } self.state.set_loading(false); self.entity_cache.put_mr(key, detail.clone()); self.state.mr_detail.set(detail); @@ -1219,6 +1285,12 @@ impl Model for LoreApp { self.state.sync.update_progress(event); Cmd::none() } + Msg::SyncProgressBatch(events) => { + for event in events { + self.state.sync.update_progress(event); + } + Cmd::none() + } 
Msg::SyncLogLine(line) => { self.state.sync.push_log(line); Cmd::none() @@ -1234,10 +1306,15 @@ impl Model for LoreApp { Cmd::none() } Msg::SyncCompleted(result) => { - self.state.sync.complete(result); - // Invalidate entity cache — synced data may have changed. - self.entity_cache.invalidate_all(); - Cmd::none() + self.state.sync.complete(&result); + // Selective invalidation: evict only changed entities from sync delta. + self.entity_cache.invalidate_keys(&result.changed_entity_keys); + // Prewarm top N changed/new entities for immediate post-sync triage. + // This is lazy — enqueues Cmd::task fetches, doesn't block the event loop. + let prewarm_cmds = self.enqueue_cache_prewarm(&result.changed_entity_keys); + // Notify list screens that new data is available (snapshot fence refresh badge). + self.state.notify_data_changed(); + prewarm_cmds } Msg::SyncFailed(err) => { self.state.sync.fail(err); @@ -1416,21 +1493,23 @@ impl Model for LoreApp { )); } - // Go-prefix timeout enforcement: tick even when nothing is loading. - // Without this, GoPrefix mode can get "stuck" when idle (no other - // events to drive the Tick that checks the 500ms timeout). + // Go-prefix timeout: one-shot After(500ms) tied to the prefix start. + // Uses After (one-shot) instead of Every (periodic) — the prefix + // either completes with a valid key or times out exactly once. if matches!(self.input_mode, InputMode::GoPrefix { .. }) { subs.push(Box::new( - Every::with_id(2, Duration::from_millis(50), || Msg::Tick) + After::with_id(2, Duration::from_millis(500), || Msg::Tick) )); } - // Search debounce timer: fires SearchDebounceFired after 200ms. + // Search debounce timer: one-shot fires SearchDebounceFired after 200ms. // Only active when a debounce is pending (armed by keystroke). + // Uses After (one-shot) instead of Every (periodic) to avoid repeated + // firings from a periodic timer — one debounce = one fire. 
if self.state.search.debounce_pending() { let generation = self.state.search.debounce_generation(); subs.push(Box::new( - Every::with_id(3, Duration::from_millis(200), move || { + After::with_id(3, Duration::from_millis(200), move || { Msg::SearchDebounceFired { generation } }) )); @@ -1485,7 +1564,7 @@ pub fn with_read_snapshot( } ``` -**Query interruption:** Long-running queries register interrupt checks tied to `CancelToken` to avoid >1s uninterruptible stalls during rapid navigation/filtering. When the user navigates away from a detail screen before queries complete, the cancel token fires `sqlite3_interrupt()` on the connection. +**Query interruption:** Long-running queries use task-owned `ReaderLease` interrupt handles (from `DbManager::lease_reader()`) to avoid >1s uninterruptible stalls during rapid navigation/filtering. When the user navigates away from a detail screen before queries complete, the `TaskHandle`'s owned `InterruptHandle` fires `sqlite3_interrupt()` on that specific leased connection — never on a shared pool connection. This prevents cross-task cancellation bleed where interrupting one query accidentally cancels an unrelated query on the same pooled connection. #### 4.5.1 Task Supervisor (Dedup + Cancellation + Priority) @@ -1549,6 +1628,10 @@ pub struct TaskHandle { pub key: TaskKey, pub generation: u64, pub cancel: Arc, + /// Per-task SQLite interrupt handle. When set, cancellation fires + /// this handle instead of interrupting shared pool connections. + /// Prevents cross-task cancellation bleed. + pub interrupt: Option, } /// The TaskSupervisor manages active tasks, deduplicates by key, and tracks @@ -1756,6 +1839,11 @@ pub struct NavigationStack { /// This mirrors vim's jump list behavior. jump_list: Vec, jump_index: usize, + /// Browse snapshot token: each list/search screen carries a per-screen + /// `BrowseSnapshot` that preserves stable ordering until explicit refresh + /// or screen re-entry. 
This works with the snapshot fence to ensure + /// deterministic pagination during concurrent sync writes. + browse_snapshots: HashMap, } impl NavigationStack { @@ -1979,9 +2067,21 @@ Insights are computed from local data during dashboard load. Each insight row is **Data source:** `lore issues` query against SQLite **Columns:** Configurable — iid, title, state, author, labels, milestone, updated_at **Sorting:** Click column header or Tab to cycle (iid, updated, created) -**Filtering:** Interactive filter bar with field:value syntax +**Filtering:** Interactive filter bar with typed DSL parser. Grammar (v1): +- `term := [ "-" ] (field ":" value | quoted_text | bare_text)` +- `value := quoted | unquoted` +- Examples: `state:opened label:"P1 blocker" -author:bot since:14d` +- Negation prefix (`-`) excludes matches for that term +- Quoted values allow spaces in filter values +- Parser surfaces inline diagnostics with cursor position for parse errors — never silently drops unknown fields **Pagination:** Windowed keyset pagination with explicit cursor state. The list state maintains `window` (current visible rows), `next_cursor` / `prev_cursor` (keyset boundary values for forward/back navigation), `prefetching` flag (background fetch of next window in progress), and a fixed `window_size` (default 200 rows). First paint uses current window only; no full-result materialization. Virtual scrolling within the window for smooth UX. When the user scrolls past ~80% of the window, the next window is prefetched in the background. +**Snapshot fence:** On list entry, capture `snapshot_upper_updated_at` (current max `updated_at` in the result set) and pin all list-page queries to `updated_at <= snapshot_upper_updated_at`. This guarantees no duplicate or skipped rows during scrolling even if sync writes occur concurrently. A "new data available" badge appears when a newer sync completes; `r` refreshes the fence and re-queries from the top. 
+ +**Quick Peek (`Space`):** Toggle a right-side preview pane showing the selected item's metadata, first discussion snippet, and cross-references without entering the full detail view. This enables rapid triage scanning — the user can evaluate issues at a glance without the Enter/Esc cycle. The peek pane uses the same progressive hydration as detail views (metadata first, discussions lazy). The pane width adapts to terminal breakpoints (hidden at Xs/Sm, 40% width at Md+). + +**Cursor determinism:** Keyset pagination uses deterministic tuple ordering: `ORDER BY <sort_field> <sort_order>, project_id, iid`. The cursor struct includes the current `sort_field`, `sort_order`, `project_id` (tie-breaker for multi-project datasets where rows share timestamps), and a `filter_hash: u64` (hash of the active filter state). On cursor resume, the cursor is rejected if `filter_hash` or sort tuple mismatches the current query — this prevents stale cursors from producing duplicate/skipped rows after the user changes sort mode or filters mid-browse. + ### 5.3 Issue Detail ``` @@ -2052,7 +2152,9 @@ Identical structure to Issue List with MR-specific columns: | Author | MR author | | Updated | Relative time | -**Pagination:** Same windowed keyset pagination strategy as Issue List (window=200, background prefetch). +**Pagination:** Same windowed keyset pagination strategy as Issue List (window=200, background prefetch, deterministic cursor with `project_id` tie-breaker and `filter_hash` invalidation). Same snapshot fence (`updated_at <= snapshot_upper_updated_at`) for deterministic cross-page traversal under concurrent sync writes. + +**Quick Peek (`Space`):** Same as Issue List — toggle right preview pane showing MR metadata, first discussion snippet, and cross-references for rapid triage without entering detail view.
**Additional filters:** `--draft`, `--no-draft`, `--target-branch`, `--source-branch`, `--reviewer` @@ -2294,8 +2396,8 @@ The Sync screen has two modes: **running** (progress + log) and **summary** (pos **Summary mode:** - Shows delta counts (new, updated) for each entity type -- `i` navigates to Issue List pre-filtered to "since last sync" (using `sync_status.last_completed_at` timestamp comparison) -- `m` navigates to MR List pre-filtered to "since last sync" (using `sync_status.last_completed_at` timestamp comparison) +- `i` navigates to Issue List filtered by exact issue IDs changed in this sync run (from in-memory `SyncDeltaLedger`). Falls back to timestamp filter via `sync_status.last_completed_at` only if run delta is not available (e.g., after app restart). +- `m` navigates to MR List filtered by exact MR IDs changed in this sync run (from in-memory `SyncDeltaLedger`). Falls back to timestamp filter only if run delta is not available. - `r` restarts sync ### 5.10 Command Palette (Overlay) @@ -2349,6 +2451,21 @@ The Sync screen has two modes: **running** (progress + log) and **summary** (pos - Does NOT auto-execute commands — the user always runs them manually for safety - Scrollable with j/k, Esc to go back +### 5.12 Bootstrap (Data Readiness) + +Shown automatically when the TUI detects no synced projects/documents or required indexes are missing. This is a read-only screen — it never auto-executes commands. + +Displays concise readiness checks with pass/fail indicators: +- Synced projects present? +- Issues/MRs populated? +- FTS index built? +- Embedding index built? (optional — warns but doesn't block) +- Required migration version met? + +For each failing check, shows the exact CLI command to recover (e.g., `lore sync`, `lore migrate`, `lore --robot doctor`). The user exits the TUI and runs the commands manually. 
+ +This prevents the "blank screen" first-run experience where a user launches `lore tui` before syncing data and sees an empty dashboard with no indication of what to do next. + --- ## 6. User Flows @@ -2483,8 +2600,8 @@ graph TD style F fill:#51cf66,stroke:#333,color:#fff ``` -**Keystrokes:** `i` → `j/k` to scan → `Enter` to peek → `Esc` to return → continue scanning -**State preservation:** After pressing Esc from Issue Detail, the cursor returns to exactly the same row in the list. Filter state and scroll offset are preserved. This tight Enter/Esc loop is the most common daily workflow. +**Keystrokes:** `i` → `j/k` to scan → `Space` to Quick Peek (or `Enter` for full detail) → `Esc` to return → continue scanning +**State preservation:** After pressing Esc from Issue Detail, the cursor returns to exactly the same row in the list. Filter state and scroll offset are preserved. This tight Enter/Esc loop is the most common daily workflow. Quick Peek (`Space`) makes triage even faster — preview metadata and first discussion snippet without leaving the list. ### 6.8 Flow: "Jump between screens without returning to Dashboard" @@ -2591,6 +2708,7 @@ graph TD | `Ctrl+O` | Jump backward in jump list (entity hops) | | `Alt+o` | Jump forward in jump list (entity hops) | | `Ctrl+R` | Reset session state for current screen (clear filters, scroll to top) | +| `P` | Open project scope picker / toggle global scope pin. When a scope is pinned, all list/search/timeline/who queries are filtered to that project set. A visible `[scope: project/path]` indicator appears in the status bar. | | `Ctrl+C` | Quit (force) | ### 8.2 List Screens (Issues, MRs, Search Results) @@ -2600,6 +2718,7 @@ graph TD | `j` / `↓` | Move selection down | | `k` / `↑` | Move selection up | | `Enter` | Open selected item | +| `Space` | Toggle Quick Peek panel for selected row | | `G` | Jump to bottom | | `g` `g` | Jump to top | | `Tab` / `f` | Focus filter bar | @@ -2614,7 +2733,7 @@ graph TD 3. 
Global shortcuts — `q`, `H`, `?`, `o`, `Ctrl+C`, `Ctrl+P`, `Esc`, `g` prefix 4. Screen-local shortcuts — per-screen key handlers (the table above) -**Go-prefix timeout:** 500ms from first `g` press, enforced by `InputMode::GoPrefix { started_at }` state checked on each tick via `clock.now_instant()`. If no valid continuation key arrives within 500ms, the prefix cancels and a brief "g--" flash clears from the status bar. The tick subscription compares the injected Clock's current instant against `started_at` — no separate timer task needed. Using `InputMode` instead of ad-hoc boolean flags makes the state machine explicit and deterministic. Feedback is immediate — the status bar shows "g--" within the same frame as the keypress. +**Go-prefix timeout:** 500ms from first `g` press, enforced by a one-shot `After(500ms)` subscription tied to the prefix generation. If no valid continuation key arrives within 500ms, the timer fires a single `Msg::Tick` which checks `InputMode::GoPrefix { started_at }` via `clock.now_instant()` and cancels the prefix. A brief "g--" flash clears from the status bar. Using `After` (one-shot) instead of `Every` (periodic) avoids unnecessary repeated ticks. Using `InputMode` instead of ad-hoc boolean flags makes the state machine explicit and deterministic. Feedback is immediate — the status bar shows "g--" within the same frame as the keypress. **Terminal keybinding safety notes:** - `Ctrl+I` is NOT used — it is indistinguishable from `Tab` in most terminals (both send `\x09`). Jump-forward uses `Alt+o` instead. 
@@ -2783,6 +2902,8 @@ gantt Event fuzz tests (key/resize/paste, deterministic seed replay):p55g, after p55e, 1d Deterministic clock/render tests:p55i, after p55g, 0.5d 30-minute soak test (no panic/leak):p55h, after p55i, 1d + Concurrent pagination/write race tests :p55j, after p55h, 1d + Query cancellation race tests :p55k, after p55j, 0.5d section Phase 5.6 — CLI/TUI Parity Pack Dashboard count parity tests :p56a, after p55h, 0.5d @@ -2802,7 +2923,7 @@ Ensures the TUI displays the same data as the CLI robot mode, preventing drift b **Success criterion:** Parity suite passes on CI fixtures (S and M tiers). Parity is asserted by field-level comparison, not string formatting comparison — the TUI and CLI may format differently but must present the same underlying data. -**Total estimated scope:** ~47 implementation days across 9 phases (increased from ~43 to account for Phase 2.5 vertical slice gate, entity cache, crash context ring buffer, timer-based debounce, and expanded success criteria 24-25). +**Total estimated scope:** ~49 implementation days across 9 phases (increased from ~47 to account for filter DSL parser, render cache, progress coalescer, Quick Peek panel, ReaderLease interrupt handles, and generation-guarding all async Msg variants). ### 9.3 Phase 0 — Toolchain Gate @@ -2912,7 +3033,12 @@ crates/lore-tui/src/theme.rs # ftui Theme config crates/lore-tui/src/action.rs # Query bridge functions (uses lore core) crates/lore-tui/src/db_manager.rs # DbManager: closure-based read pool (with_reader) + dedicated writer (with_writer). Prevents lock-poison panics and accidental long-held guards. crates/lore-tui/src/task_supervisor.rs # TaskSupervisor: unified submit() → TaskHandle API with dedup, cancellation, generation IDs, and priority lanes -crates/lore-tui/src/entity_cache.rs # Bounded LRU cache for IssueDetail/MrDetail payloads. Keyed by EntityKey. Invalidated on sync completion. Enables near-instant reopen during Enter/Esc drill-in/out workflows. 
+crates/lore-tui/src/entity_cache.rs # Bounded LRU cache for IssueDetail/MrDetail payloads. Keyed by EntityKey. Selective invalidation by changed EntityKey set (not blanket invalidate_all). Optional post-sync prewarm of top changed entities. Enables near-instant reopen during Enter/Esc drill-in/out workflows. +crates/lore-tui/src/render_cache.rs # Width/theme/content-hash keyed cache for expensive render artifacts (markdown → styled text, discussion tree shaping). Prevents per-frame recomputation. +crates/lore-tui/src/filter_dsl.rs # Typed filter bar DSL parser: quoted values, negation prefix, field:value syntax, inline diagnostics with cursor position. Replaces brittle split_whitespace() parsing. +crates/lore-tui/src/progress_coalescer.rs # Per-lane progress event coalescer. Batches progress updates at <=30Hz per lane key (project x resource_type) to reduce render pressure during sync. +crates/lore-tui/src/sync_delta_ledger.rs # In-memory per-run exact changed/new entity IDs (issues, MRs, discussions). Populated from SyncCompleted result. Used by Sync Summary mode for exact "what changed" navigation without new DB tables. Cleared on next sync run start. +crates/lore-tui/src/scope.rs # Global project scope context (AllProjects or pinned project set). Flows through all query bridge functions. Persisted in session state. `P` keybinding opens scope picker overlay. crates/lore-tui/src/crash_context.rs # Ring buffer of last 2000 normalized events + current screen/task/build snapshot. Captured by panic hook for post-mortem crash diagnostics with retention policy (latest 20 files). 
crates/lore-tui/src/safety.rs # sanitize_for_terminal(), safe_url_policy() crates/lore-tui/src/redact.rs # redact_sensitive(): strip tokens, Authorization headers, and credential patterns from logs and crash reports before persisting @@ -4285,6 +4411,7 @@ pub struct AppState { pub command_palette: CommandPaletteState, // Cross-cutting state + pub global_scope: ScopeContext, // Applies to dashboard/list/search/timeline/who queries. Default: AllProjects. pub load_state: ScreenLoadStateMap, pub error_toast: Option, pub show_help: bool, @@ -5445,15 +5572,20 @@ pub fn fetch_dashboard(conn: &Connection) -> Result { } /// Fetch issues, converting TUI IssueFilter → CLI ListFilters. +/// The `scope` parameter applies global project pinning — when a scope is active, +/// it overrides any per-filter project selection, ensuring cross-screen consistency. pub fn fetch_issues( conn: &Connection, + scope: &ScopeContext, filter: &IssueFilter, ) -> Result, LoreError> { // Convert TUI filter to CLI filter format. // The CLI already has query_issues() — we just need to bridge the types. + // Global scope overrides per-filter project when active. + let effective_project = scope.effective_project(filter.project.as_deref()); let cli_filter = ListFilters { limit: filter.limit, - project: filter.project.as_deref(), + project: effective_project.as_deref(), state: filter.state.as_deref(), author: filter.author.as_deref(), assignee: filter.assignee.as_deref(), @@ -7806,3 +7938,7 @@ Recommendations from external review (feedback-8, ChatGPT) that were evaluated a Recommendations from external review (feedback-9, ChatGPT) that were evaluated and declined: - **Search Facets panel (entity type counts, top labels/projects/authors with one-key apply)** — rejected as feature scope expansion for v1. The concept (three-pane layout with facet counts and quick-apply shortcuts like `1/2/3` for type facets, `l` for label cycling) is compelling and would make search more actionable for triage workflows. 
However, it requires: new aggregate queries for facet counting that must perform well across all three data tiers, a third layout pane that breaks the current two-pane split design, new keybinding slots (`1/2/3/l`) that could conflict with future list navigation, and per-query facet recalculation that adds latency. The existing search with explicit field-based filters is sufficient for v1. Facets are a strong v2 candidate — once search has production mileage and users report wanting faster triage filtering, the aggregate query patterns and UI layout can be designed with real usage data. + +Recommendations from external review (feedback-10, ChatGPT) that were evaluated and declined: + +- **Structured compat handshake (`--compat-json` replacing `--compat-version` integer)** — rejected because the current two-step contract (integer compat version + separate schema version check) is intentionally minimal and robust. Adding JSON parsing (`{ "protocol": 1, "compat_version": 2, "min_schema": 14, "max_schema": 16, "build": "..." }`) to a preflight binary validation introduces a new failure mode (malformed JSON, missing fields, version parsing) for zero user-visible benefit. The integer check detects "too old to work" — the only case that matters before spawning the TUI. Schema range is already validated separately via `--check-schema`. Combining both into a single JSON response couples concerns that are better kept independent (binary compat vs schema compat). The current approach is more resilient: if `--compat-version` is missing (old binary), we warn and proceed; JSON parsing failure would be a hard error. KISS principle applies. 
diff --git a/src/cli/commands/stats.rs b/src/cli/commands/stats.rs index 9c4115b..e0841ea 100644 --- a/src/cli/commands/stats.rs +++ b/src/cli/commands/stats.rs @@ -79,33 +79,43 @@ pub fn run_stats(config: &Config, check: bool, repair: bool, dry_run: bool) -> R let mut result = StatsResult::default(); - result.documents.total = count_query(&conn, "SELECT COUNT(*) FROM documents")?; - result.documents.issues = count_query( - &conn, - "SELECT COUNT(*) FROM documents WHERE source_type = 'issue'", - )?; - result.documents.merge_requests = count_query( - &conn, - "SELECT COUNT(*) FROM documents WHERE source_type = 'merge_request'", - )?; - result.documents.discussions = count_query( - &conn, - "SELECT COUNT(*) FROM documents WHERE source_type = 'discussion'", - )?; - result.documents.truncated = count_query( - &conn, - "SELECT COUNT(*) FROM documents WHERE is_truncated = 1", - )?; + // Single-scan conditional aggregate: 5 sequential COUNT(*) → 1 table scan + let (total, issues, mrs, discussions, truncated) = conn + .query_row( + "SELECT COUNT(*), + COALESCE(SUM(CASE WHEN source_type = 'issue' THEN 1 END), 0), + COALESCE(SUM(CASE WHEN source_type = 'merge_request' THEN 1 END), 0), + COALESCE(SUM(CASE WHEN source_type = 'discussion' THEN 1 END), 0), + COALESCE(SUM(CASE WHEN is_truncated = 1 THEN 1 END), 0) + FROM documents", + [], + |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, i64>(1)?, + row.get::<_, i64>(2)?, + row.get::<_, i64>(3)?, + row.get::<_, i64>(4)?, + )) + }, + ) + .unwrap_or((0, 0, 0, 0, 0)); + result.documents.total = total; + result.documents.issues = issues; + result.documents.merge_requests = mrs; + result.documents.discussions = discussions; + result.documents.truncated = truncated; if table_exists(&conn, "embedding_metadata") { - let embedded = count_query( - &conn, - "SELECT COUNT(DISTINCT document_id) FROM embedding_metadata WHERE last_error IS NULL", - )?; - let chunks = count_query( - &conn, - "SELECT COUNT(*) FROM embedding_metadata WHERE 
last_error IS NULL", - )?; + // Single scan: COUNT(DISTINCT) + COUNT(*) in one pass + let (embedded, chunks) = conn + .query_row( + "SELECT COUNT(DISTINCT document_id), COUNT(*) + FROM embedding_metadata WHERE last_error IS NULL", + [], + |row| Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)?)), + ) + .unwrap_or((0, 0)); result.embeddings.embedded_documents = embedded; result.embeddings.total_chunks = chunks; result.embeddings.coverage_pct = if result.documents.total > 0 { @@ -115,41 +125,57 @@ pub fn run_stats(config: &Config, check: bool, repair: bool, dry_run: bool) -> R }; } - result.fts.indexed = count_query(&conn, "SELECT COUNT(*) FROM documents_fts")?; + // FTS5 shadow table is a regular B-tree with one row per document — + // 19x faster than scanning the virtual table for COUNT(*) + result.fts.indexed = count_query(&conn, "SELECT COUNT(*) FROM documents_fts_docsize")?; - result.queues.dirty_sources = count_query( - &conn, - "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NULL", - )?; - result.queues.dirty_sources_failed = count_query( - &conn, - "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NOT NULL", - )?; + // Single scan: 2 conditional counts on dirty_sources + let (ds_pending, ds_failed) = conn + .query_row( + "SELECT COALESCE(SUM(CASE WHEN last_error IS NULL THEN 1 END), 0), + COALESCE(SUM(CASE WHEN last_error IS NOT NULL THEN 1 END), 0) + FROM dirty_sources", + [], + |row| Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)?)), + ) + .unwrap_or((0, 0)); + result.queues.dirty_sources = ds_pending; + result.queues.dirty_sources_failed = ds_failed; if table_exists(&conn, "pending_discussion_fetches") { - result.queues.pending_discussion_fetches = count_query( - &conn, - "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NULL", - )?; - result.queues.pending_discussion_fetches_failed = count_query( - &conn, - "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NOT NULL", - )?; + let (pdf_pending, pdf_failed) 
= conn + .query_row( + "SELECT COALESCE(SUM(CASE WHEN last_error IS NULL THEN 1 END), 0), + COALESCE(SUM(CASE WHEN last_error IS NOT NULL THEN 1 END), 0) + FROM pending_discussion_fetches", + [], + |row| Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)?)), + ) + .unwrap_or((0, 0)); + result.queues.pending_discussion_fetches = pdf_pending; + result.queues.pending_discussion_fetches_failed = pdf_failed; } if table_exists(&conn, "pending_dependent_fetches") { - result.queues.pending_dependent_fetches = count_query( - &conn, - "SELECT COUNT(*) FROM pending_dependent_fetches WHERE last_error IS NULL", - )?; - result.queues.pending_dependent_fetches_failed = count_query( - &conn, - "SELECT COUNT(*) FROM pending_dependent_fetches WHERE last_error IS NOT NULL", - )?; - result.queues.pending_dependent_fetches_stuck = count_query( - &conn, - "SELECT COUNT(*) FROM pending_dependent_fetches WHERE locked_at IS NOT NULL", - )?; + let (pf_pending, pf_failed, pf_stuck) = conn + .query_row( + "SELECT COALESCE(SUM(CASE WHEN last_error IS NULL THEN 1 END), 0), + COALESCE(SUM(CASE WHEN last_error IS NOT NULL THEN 1 END), 0), + COALESCE(SUM(CASE WHEN locked_at IS NOT NULL THEN 1 END), 0) + FROM pending_dependent_fetches", + [], + |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, i64>(1)?, + row.get::<_, i64>(2)?, + )) + }, + ) + .unwrap_or((0, 0, 0)); + result.queues.pending_dependent_fetches = pf_pending; + result.queues.pending_dependent_fetches_failed = pf_failed; + result.queues.pending_dependent_fetches_stuck = pf_stuck; } #[allow(clippy::field_reassign_with_default)] diff --git a/src/cli/commands/who.rs b/src/cli/commands/who.rs index 0f2ef8b..1506b92 100644 --- a/src/cli/commands/who.rs +++ b/src/cli/commands/who.rs @@ -473,9 +473,11 @@ fn build_path_query(conn: &Connection, path: &str, project_id: Option) -> R let looks_like_file = !forced_dir && (is_root || last_segment.contains('.')); // Probe 1: exact file exists in DiffNotes OR mr_file_changes (project-scoped) + // 
Exact-match probes already use the partial index, but LIKE probes below + // benefit from the INDEXED BY hint (same planner issue as expert query). let exact_exists = conn .query_row( - "SELECT 1 FROM notes + "SELECT 1 FROM notes INDEXED BY idx_notes_diffnote_path_created WHERE note_type = 'DiffNote' AND is_system = 0 AND position_new_path = ?1 @@ -501,7 +503,7 @@ fn build_path_query(conn: &Connection, path: &str, project_id: Option) -> R let escaped = escape_like(trimmed); let pat = format!("{escaped}/%"); conn.query_row( - "SELECT 1 FROM notes + "SELECT 1 FROM notes INDEXED BY idx_notes_diffnote_path_created WHERE note_type = 'DiffNote' AND is_system = 0 AND position_new_path LIKE ?1 ESCAPE '\\' @@ -597,7 +599,8 @@ fn suffix_probe(conn: &Connection, suffix: &str, project_id: Option) -> Res let mut stmt = conn.prepare_cached( "SELECT DISTINCT full_path FROM ( - SELECT position_new_path AS full_path FROM notes + SELECT position_new_path AS full_path + FROM notes INDEXED BY idx_notes_diffnote_path_created WHERE note_type = 'DiffNote' AND is_system = 0 AND (position_new_path LIKE ?1 ESCAPE '\\' OR position_new_path = ?2) @@ -658,6 +661,13 @@ fn query_expert( } else { "= ?1" }; + // When scanning DiffNotes with a LIKE prefix, SQLite's planner picks the + // low-selectivity idx_notes_system (38% of rows) instead of the much more + // selective partial index idx_notes_diffnote_path_created (9.3% of rows). + // INDEXED BY forces the correct index: measured 64x speedup (1.22s → 0.019s). + // For exact matches SQLite already picks the partial index, but the hint + // is harmless and keeps behavior consistent. + let notes_indexed_by = "INDEXED BY idx_notes_diffnote_path_created"; let author_w = scoring.author_weight; let reviewer_w = scoring.reviewer_weight; let note_b = scoring.note_bonus; @@ -672,7 +682,7 @@ fn query_expert( n.id AS note_id, n.created_at AS seen_at, (p.path_with_namespace || '!' 
|| CAST(m.iid AS TEXT)) AS mr_ref - FROM notes n + FROM notes n {notes_indexed_by} JOIN discussions d ON n.discussion_id = d.id JOIN merge_requests m ON d.merge_request_id = m.id JOIN projects p ON m.project_id = p.id @@ -697,7 +707,7 @@ fn query_expert( (p.path_with_namespace || '!' || CAST(m.iid AS TEXT)) AS mr_ref FROM merge_requests m JOIN discussions d ON d.merge_request_id = m.id - JOIN notes n ON n.discussion_id = d.id + JOIN notes n {notes_indexed_by} ON n.discussion_id = d.id JOIN projects p ON m.project_id = p.id WHERE n.note_type = 'DiffNote' AND n.is_system = 0 @@ -851,6 +861,7 @@ fn query_expert_details( .collect(); let in_clause = placeholders.join(","); + let notes_indexed_by = "INDEXED BY idx_notes_diffnote_path_created"; let sql = format!( " WITH signals AS ( @@ -863,7 +874,7 @@ fn query_expert_details( m.title AS title, COUNT(*) AS note_count, MAX(n.created_at) AS last_activity - FROM notes n + FROM notes n {notes_indexed_by} JOIN discussions d ON n.discussion_id = d.id JOIN merge_requests m ON d.merge_request_id = m.id JOIN projects p ON m.project_id = p.id @@ -891,7 +902,7 @@ fn query_expert_details( MAX(n.created_at) AS last_activity FROM merge_requests m JOIN discussions d ON d.merge_request_id = m.id - JOIN notes n ON n.discussion_id = d.id + JOIN notes n {notes_indexed_by} ON n.discussion_id = d.id JOIN projects p ON m.project_id = p.id WHERE n.note_type = 'DiffNote' AND n.is_system = 0 @@ -1194,8 +1205,11 @@ fn query_reviews( project_id: Option, since_ms: i64, ) -> Result { - // Count total DiffNotes by this user on MRs they didn't author + // Force the partial index on DiffNote queries (same rationale as expert mode). + // COUNT + COUNT(DISTINCT) + category extraction all benefit from 26K DiffNote + // scan vs 282K notes full scan: measured 25x speedup. 
let total_sql = "SELECT COUNT(*) FROM notes n + INDEXED BY idx_notes_diffnote_path_created JOIN discussions d ON n.discussion_id = d.id JOIN merge_requests m ON d.merge_request_id = m.id WHERE n.author_username = ?1 @@ -1213,6 +1227,7 @@ fn query_reviews( // Count distinct MRs reviewed let mrs_sql = "SELECT COUNT(DISTINCT m.id) FROM notes n + INDEXED BY idx_notes_diffnote_path_created JOIN discussions d ON n.discussion_id = d.id JOIN merge_requests m ON d.merge_request_id = m.id WHERE n.author_username = ?1 @@ -1232,7 +1247,7 @@ fn query_reviews( let cat_sql = "SELECT SUBSTR(ltrim(n.body), 3, INSTR(SUBSTR(ltrim(n.body), 3), '**') - 1) AS raw_prefix, COUNT(*) AS cnt - FROM notes n + FROM notes n INDEXED BY idx_notes_diffnote_path_created JOIN discussions d ON n.discussion_id = d.id JOIN merge_requests m ON d.merge_request_id = m.id WHERE n.author_username = ?1