From e846a39ce6ffb2814bff530b2c0f3c4782e0d4f3 Mon Sep 17 00:00:00 2001 From: teernisse Date: Fri, 23 Jan 2026 10:03:40 -0500 Subject: [PATCH] More planning --- SPEC.md | 473 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 323 insertions(+), 150 deletions(-) diff --git a/SPEC.md b/SPEC.md index 752d7bc..f5a1e14 100644 --- a/SPEC.md +++ b/SPEC.md @@ -115,7 +115,8 @@ npm link # Makes `gi` available globally │ - Normalize artifacts to unified schema │ │ - Extract searchable documents (canonical text + metadata) │ │ - Content hashing for change detection │ -│ - Build relationship graph (issue↔MR↔note↔file) │ +│ - MVP relationships: parent-child FKs + label/path associations│ +│ (full cross-entity "decision graph" is post-MVP scope) │ └─────────────────────────────────────────────────────────────────┘ │ ▼ @@ -159,10 +160,16 @@ npm link # Makes `gi` available globally Issues and MRs support efficient bulk fetching with incremental sync: ``` -GET /projects/:id/issues?updated_after=X&order_by=updated_at&sort=asc&per_page=100 -GET /projects/:id/merge_requests?updated_after=X&order_by=updated_at&sort=asc&per_page=100 +GET /projects/:id/issues?scope=all&state=all&updated_after=X&order_by=updated_at&sort=asc&per_page=100 +GET /projects/:id/merge_requests?scope=all&state=all&updated_after=X&order_by=updated_at&sort=asc&per_page=100 ``` +**Required query params for completeness:** +- `scope=all` - include all issues/MRs, not just authored by current user +- `state=all` - include closed items (GitLab defaults may exclude them) + +Without these params, the 2+ years of historical data would be incomplete. + ### Dependent Resources (Per-Parent Fetch) Discussions must be fetched per-issue and per-MR. There is no bulk endpoint: @@ -178,10 +185,25 @@ GET /projects/:id/merge_requests/:iid/discussions?per_page=100&page=N **Initial sync:** 1. Fetch all issues (paginated, ~60 calls for 6K issues at 100/page) -2. For EACH issue → fetch all discussions (~3K calls) +2. 
For EACH issue → fetch all discussions (≥ issues_count calls + pagination overhead) 3. Fetch all MRs (paginated, ~60 calls) -4. For EACH MR → fetch all discussions (~3K calls) -5. Total: ~6,100+ API calls for initial sync +4. For EACH MR → fetch all discussions (≥ mrs_count calls + pagination overhead) +5. Total: thousands of API calls for initial sync + +**API Call Estimation Formula:** +``` +total_calls ≈ ceil(issues/100) + issues × avg_discussion_pages_per_issue + + ceil(mrs/100) + mrs × avg_discussion_pages_per_mr +``` + +Example: 3K issues, 3K MRs, average 1.2 discussion pages per parent: +- Issue list: 30 calls +- Issue discussions: 3,000 × 1.2 = 3,600 calls +- MR list: 30 calls +- MR discussions: 3,000 × 1.2 = 3,600 calls +- **Total: ~7,260 calls** + +This matters for rate limit planning and setting realistic "10-20 minutes" expectations. **Incremental sync:** 1. Fetch issues where `updated_after=cursor` (bulk) @@ -235,10 +257,15 @@ tests/unit/db.test.ts ✓ enables foreign keys tests/integration/gitlab-client.test.ts - ✓ authenticates with valid PAT - ✓ returns 401 for invalid PAT - ✓ fetches project by path - ✓ handles rate limiting (429) with retry + ✓ (mocked) authenticates with valid PAT + ✓ (mocked) returns 401 for invalid PAT + ✓ (mocked) fetches project by path + ✓ (mocked) handles rate limiting (429) with retry + +tests/live/gitlab-client.live.test.ts (optional, gated by GITLAB_LIVE_TESTS=1, not in CI) + ✓ authenticates with real PAT against configured baseUrl + ✓ fetches real project by path + ✓ handles actual rate limiting behavior tests/integration/app-lock.test.ts ✓ acquires lock successfully @@ -256,6 +283,7 @@ tests/integration/init.test.ts ✓ fails if any project path not found ✓ prompts before overwriting existing config ✓ respects --force to skip confirmation + ✓ generates gi.config.json with sensible defaults ``` **Manual CLI Smoke Tests:** @@ -269,6 +297,7 @@ tests/integration/init.test.ts | `gi init` (config exists) | Confirmation prompt | 
Warns before overwriting | | `gi --help` | Command list | Shows all available commands | | `gi version` | Version number | Shows installed version | +| `gi sync-status` | Last sync time, cursor positions | Shows successful last run | **Data Integrity Checks:** - [ ] `projects` table contains rows for each configured project path @@ -332,6 +361,13 @@ tests/integration/init.test.ts } ``` +**Raw Payload Compression:** +- When `storage.compressRawPayloads: true` (default), raw JSON payloads are gzip-compressed before storage +- `raw_payloads.content_encoding` indicates `'identity'` (uncompressed) or `'gzip'` (compressed) +- Compression typically reduces storage by 70-80% for JSON payloads +- Decompression is handled transparently when reading payloads +- Tradeoff: Slightly higher CPU on write/read, significantly lower disk usage + **DB Runtime Defaults (Checkpoint 0):** - On every connection: - `PRAGMA journal_mode=WAL;` @@ -387,13 +423,20 @@ CREATE TABLE raw_payloads ( source TEXT NOT NULL, -- 'gitlab' project_id INTEGER REFERENCES projects(id), -- nullable for instance-level resources resource_type TEXT NOT NULL, -- 'project' | 'issue' | 'mr' | 'note' | 'discussion' - gitlab_id INTEGER NOT NULL, + gitlab_id TEXT NOT NULL, -- TEXT because discussion IDs are strings; numeric IDs stored as strings fetched_at INTEGER NOT NULL, content_encoding TEXT NOT NULL DEFAULT 'identity', -- 'identity' | 'gzip' payload BLOB NOT NULL -- raw JSON or gzip-compressed JSON ); CREATE INDEX idx_raw_payloads_lookup ON raw_payloads(project_id, resource_type, gitlab_id); CREATE INDEX idx_raw_payloads_history ON raw_payloads(project_id, resource_type, gitlab_id, fetched_at); + +-- Schema version tracking for migrations +CREATE TABLE schema_version ( + version INTEGER PRIMARY KEY, + applied_at INTEGER NOT NULL, + description TEXT +); ``` --- @@ -411,7 +454,8 @@ tests/unit/issue-transformer.test.ts tests/unit/pagination.test.ts ✓ fetches all pages when multiple exist ✓ respects per_page 
parameter - ✓ stops when empty page returned + ✓ follows X-Next-Page header until empty/absent + ✓ falls back to empty-page stop if headers missing (robustness) tests/unit/discussion-transformer.test.ts ✓ transforms discussion payload to normalized schema @@ -450,7 +494,7 @@ tests/integration/sync-runs.test.ts | `gi list issues --limit=10` | Table of 10 issues | Shows iid, title, state, author | | `gi list issues --project=group/project-one` | Filtered list | Only shows issues from that project | | `gi count issues` | `Issues: 1,234` (example) | Count matches GitLab UI | -| `gi show issue 123` | Issue detail view | Shows title, description, labels, discussions, URL | +| `gi show issue 123` | Issue detail view | Shows title, description, labels, discussions, URL. If multiple projects have issue #123, prompts for clarification or use `--project=PATH` | | `gi count discussions --type=issue` | `Issue Discussions: 5,678` | Non-zero count | | `gi count notes --type=issue` | `Issue Notes: 12,345 (excluding 2,345 system)` | Non-zero count | | `gi sync-status` | Last sync time, cursor positions | Shows successful last run | @@ -543,7 +587,7 @@ CREATE TABLE discussions ( gitlab_discussion_id TEXT NOT NULL, -- GitLab's string ID (e.g. 
"6a9c1750b37d...") project_id INTEGER NOT NULL REFERENCES projects(id), issue_id INTEGER REFERENCES issues(id), - merge_request_id INTEGER REFERENCES merge_requests(id), + merge_request_id INTEGER, -- FK added in CP2 via ALTER TABLE noteable_type TEXT NOT NULL, -- 'Issue' | 'MergeRequest' individual_note BOOLEAN NOT NULL, -- standalone comment vs threaded discussion first_note_at INTEGER, -- for ordering discussions @@ -686,6 +730,15 @@ CREATE INDEX idx_mr_labels_label ON mr_labels(label_id); -- Additional indexes for DiffNote queries (tables created in CP1) CREATE INDEX idx_notes_type ON notes(type); CREATE INDEX idx_notes_new_path ON notes(position_new_path); + +-- Migration: Add FK constraint to discussions table (was deferred from CP1) +-- SQLite doesn't support ADD CONSTRAINT, so we recreate the table with FK +-- This is handled by the migration system; pseudocode for clarity: +-- 1. CREATE TABLE discussions_new with REFERENCES merge_requests(id) +-- 2. INSERT INTO discussions_new SELECT * FROM discussions +-- 3. DROP TABLE discussions +-- 4. ALTER TABLE discussions_new RENAME TO discussions +-- 5. 
Recreate indexes ``` **MR Discussion Processing Rules:** @@ -696,8 +749,8 @@ CREATE INDEX idx_notes_new_path ON notes(position_new_path); --- -### Checkpoint 3: Document + Embedding Generation with Lexical Search -**Deliverable:** Documents and embeddings generated; `gi search --mode=lexical` works end-to-end +### Checkpoint 3A: Document Generation + FTS (Lexical Search) +**Deliverable:** Documents generated + FTS5 index; `gi search --mode=lexical` works end-to-end (no Ollama required) **Automated Tests (Vitest):** ``` @@ -707,76 +760,67 @@ tests/unit/document-extractor.test.ts ✓ extracts discussion document with full thread context ✓ includes parent issue/MR title in discussion header ✓ formats notes with author and timestamp - ✓ truncates content exceeding 8000 tokens + ✓ excludes system notes from discussion documents by default + ✓ includes system notes only when --include-system-notes enabled (debug) + ✓ truncates content exceeding 8000 tokens at note boundaries ✓ preserves first and last notes when truncating middle ✓ computes SHA-256 content hash consistently -tests/unit/embedding-client.test.ts - ✓ connects to Ollama API - ✓ generates embedding for text input - ✓ returns 768-dimension vector - ✓ handles Ollama connection failure gracefully - ✓ batches requests (32 documents per batch) - tests/integration/document-creation.test.ts ✓ creates document for each issue ✓ creates document for each MR ✓ creates document for each discussion ✓ populates document_labels junction table ✓ computes content_hash for each document + ✓ excludes system notes from discussion content -tests/integration/embedding-storage.test.ts - ✓ stores embedding in sqlite-vss - ✓ embedding rowid matches document id - ✓ creates embedding_metadata record - ✓ skips re-embedding when content_hash unchanged - ✓ re-embeds when content_hash changes +tests/integration/fts-index.test.ts + ✓ documents_fts row count matches documents + ✓ FTS triggers fire on insert/update/delete + ✓ updates propagate 
via triggers + +tests/integration/fts-search.test.ts + ✓ returns exact keyword matches + ✓ porter stemming works (search/searching) + ✓ returns empty for non-matching query ``` **Manual CLI Smoke Tests:** | Command | Expected Output | Pass Criteria | |---------|-----------------|---------------| -| `gi embed --all` | Progress bar with ETA | Completes without error | -| `gi embed --all` (re-run) | `0 documents to embed` | Skips already-embedded docs | -| `gi stats` | Embedding coverage stats | Shows 100% coverage | -| `gi stats --json` | JSON stats object | Valid JSON with document/embedding counts | -| `gi embed --all` (Ollama stopped) | Clear error message | Non-zero exit, actionable error | -| `gi search "authentication" --mode=lexical` | FTS results | Returns matching documents, no embeddings required | +| `gi generate-docs` | Progress bar, final count | Completes without error | +| `gi generate-docs` (re-run) | `0 documents to regenerate` | Skips unchanged docs | +| `gi search "authentication" --mode=lexical` | FTS results | Returns matching documents, works without Ollama | +| `gi stats` | Document count stats | Shows document coverage | **Data Integrity Checks:** - [ ] `SELECT COUNT(*) FROM documents` = issues + MRs + discussions -- [ ] `SELECT COUNT(*) FROM embeddings` = `SELECT COUNT(*) FROM documents` -- [ ] `SELECT COUNT(*) FROM embedding_metadata` = `SELECT COUNT(*) FROM documents` -- [ ] All `embedding_metadata.content_hash` matches corresponding `documents.content_hash` +- [ ] `SELECT COUNT(*) FROM documents_fts` = `SELECT COUNT(*) FROM documents` (via FTS triggers) - [ ] `SELECT COUNT(*) FROM documents WHERE LENGTH(content_text) > 32000` logs truncation warnings - [ ] Discussion documents include parent title in content_text +- [ ] Discussion documents exclude system notes **Scope:** -- Ollama integration (nomic-embed-text model) -- Embedding generation pipeline: - - Batch size: 32 documents per batch - - Concurrency: configurable (default 4 workers) 
- - Retry with exponential backoff for transient failures (max 3 attempts) - - Per-document failure recording to enable targeted re-runs -- Vector storage in SQLite (sqlite-vss extension) -- Progress tracking and resumability - Document extraction layer: - Canonical "search documents" derived from issues/MRs/discussions - Stable content hashing for change detection (SHA-256 of content_text) - - Single embedding per document (chunking deferred to post-MVP) - - Truncation: content_text capped at 8000 tokens (nomic-embed-text limit is 8192) + - Truncation: content_text capped at 8000 tokens at NOTE boundaries - **Implementation:** Use character budget, not exact token count - `maxChars = 32000` (conservative 4 chars/token estimate) + - Drop whole notes from middle, never cut mid-note - `approxTokens = ceil(charCount / 4)` for reporting/logging only - - This avoids tokenizer dependency while preventing embedding failures +- System notes excluded from discussion documents (stored in DB for audit, but not in embeddings/search) - Denormalized metadata for fast filtering (author, labels, dates) - Fast label filtering via `document_labels` join table -- FTS5 index for lexical search (enables `gi search --mode=lexical` without Ollama) +- FTS5 index for lexical search - `gi search --mode=lexical` CLI command (works without Ollama) -**Schema Additions:** +This checkpoint delivers a working search experience before introducing embedding infrastructure risk. 
+ +**Schema Additions (CP3A):** ```sql -- Unified searchable documents (derived from issues/MRs/discussions) +-- Note: Full documents table schema is in CP3B section for continuity with embeddings CREATE TABLE documents ( id INTEGER PRIMARY KEY, source_type TEXT NOT NULL, -- 'issue' | 'merge_request' | 'discussion' @@ -806,7 +850,122 @@ CREATE TABLE document_labels ( ); CREATE INDEX idx_document_labels_label ON document_labels(label_name); --- sqlite-vss virtual table +-- Fast path filtering for documents (extracted from DiffNote positions) +CREATE TABLE document_paths ( + document_id INTEGER NOT NULL REFERENCES documents(id), + path TEXT NOT NULL, + PRIMARY KEY(document_id, path) +); +CREATE INDEX idx_document_paths_path ON document_paths(path); + +-- Track sources that require document regeneration (populated during ingestion) +CREATE TABLE dirty_sources ( + source_type TEXT NOT NULL, -- 'issue' | 'merge_request' | 'discussion' + source_id INTEGER NOT NULL, -- local DB id + queued_at INTEGER NOT NULL, + PRIMARY KEY(source_type, source_id) +); + +-- Resumable dependent fetches (discussions are per-parent resources) +CREATE TABLE pending_discussion_fetches ( + project_id INTEGER NOT NULL REFERENCES projects(id), + noteable_type TEXT NOT NULL, -- 'Issue' | 'MergeRequest' + noteable_iid INTEGER NOT NULL, -- parent iid (stable human identifier) + queued_at INTEGER NOT NULL, + attempt_count INTEGER NOT NULL DEFAULT 0, + last_attempt_at INTEGER, + last_error TEXT, + PRIMARY KEY(project_id, noteable_type, noteable_iid) +); +CREATE INDEX idx_pending_discussions_retry + ON pending_discussion_fetches(attempt_count, last_attempt_at) + WHERE last_error IS NOT NULL; + +-- Full-text search for lexical retrieval +-- Using porter stemmer for better matching of word variants +CREATE VIRTUAL TABLE documents_fts USING fts5( + title, + content_text, + content='documents', + content_rowid='id', + tokenize='porter unicode61' +); + +-- Triggers to keep FTS in sync +CREATE TRIGGER 
documents_ai AFTER INSERT ON documents BEGIN + INSERT INTO documents_fts(rowid, title, content_text) + VALUES (new.id, new.title, new.content_text); +END; + +CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN + INSERT INTO documents_fts(documents_fts, rowid, title, content_text) + VALUES('delete', old.id, old.title, old.content_text); +END; + +CREATE TRIGGER documents_au AFTER UPDATE ON documents BEGIN + INSERT INTO documents_fts(documents_fts, rowid, title, content_text) + VALUES('delete', old.id, old.title, old.content_text); + INSERT INTO documents_fts(rowid, title, content_text) + VALUES (new.id, new.title, new.content_text); +END; +``` + +**FTS5 Tokenizer Notes:** +- `porter` enables stemming (searching "authentication" matches "authenticating", "authenticated") +- `unicode61` handles Unicode properly +- Code identifiers (snake_case, camelCase, file paths) may not tokenize ideally; post-MVP consideration for custom tokenizer + +--- + +### Checkpoint 3B: Embedding Generation (Semantic Search) +**Deliverable:** Embeddings generated + `gi search --mode=semantic` works; graceful fallback if Ollama unavailable + +**Automated Tests (Vitest):** +``` +tests/unit/embedding-client.test.ts + ✓ connects to Ollama API + ✓ generates embedding for text input + ✓ returns 768-dimension vector + ✓ handles Ollama connection failure gracefully + ✓ batches requests (32 documents per batch) + +tests/integration/embedding-storage.test.ts + ✓ stores embedding in sqlite-vss + ✓ embedding rowid matches document id + ✓ creates embedding_metadata record + ✓ skips re-embedding when content_hash unchanged + ✓ re-embeds when content_hash changes +``` + +**Manual CLI Smoke Tests:** +| Command | Expected Output | Pass Criteria | +|---------|-----------------|---------------| +| `gi embed --all` | Progress bar with ETA | Completes without error | +| `gi embed --all` (re-run) | `0 documents to embed` | Skips already-embedded docs | +| `gi stats` | Embedding coverage stats | Shows 100% 
coverage | +| `gi stats --json` | JSON stats object | Valid JSON with document/embedding counts | +| `gi embed --all` (Ollama stopped) | Clear error message | Non-zero exit, actionable error | +| `gi search "authentication" --mode=semantic` | Vector results | Returns semantically similar documents | + +**Data Integrity Checks:** +- [ ] `SELECT COUNT(*) FROM embeddings` = `SELECT COUNT(*) FROM documents` +- [ ] `SELECT COUNT(*) FROM embedding_metadata` = `SELECT COUNT(*) FROM documents` +- [ ] All `embedding_metadata.content_hash` matches corresponding `documents.content_hash` + +**Scope:** +- Ollama integration (nomic-embed-text model) +- Embedding generation pipeline: + - Batch size: 32 documents per batch + - Concurrency: configurable (default 4 workers) + - Retry with exponential backoff for transient failures (max 3 attempts) + - Per-document failure recording to enable targeted re-runs +- Vector storage in SQLite (sqlite-vss extension) +- Progress tracking and resumability +- `gi search --mode=semantic` CLI command + +**Schema Additions (CP3B):** +```sql +-- sqlite-vss virtual table for vector search -- Storage rule: embeddings.rowid = documents.id CREATE VIRTUAL TABLE embeddings USING vss0( embedding(768) @@ -828,22 +987,6 @@ CREATE TABLE embedding_metadata ( -- Index for finding failed embeddings to retry CREATE INDEX idx_embedding_metadata_errors ON embedding_metadata(last_error) WHERE last_error IS NOT NULL; - --- Track sources that require document regeneration (populated during ingestion) -CREATE TABLE dirty_sources ( - source_type TEXT NOT NULL, -- 'issue' | 'merge_request' | 'discussion' - source_id INTEGER NOT NULL, -- local DB id - queued_at INTEGER NOT NULL, - PRIMARY KEY(source_type, source_id) -); - --- Fast path filtering for documents (extracted from DiffNote positions) -CREATE TABLE document_paths ( - document_id INTEGER NOT NULL REFERENCES documents(id), - path TEXT NOT NULL, - PRIMARY KEY(document_id, path) -); -CREATE INDEX 
idx_document_paths_path ON document_paths(path); ``` **Storage Rule (MVP):** @@ -879,6 +1022,12 @@ Agreed. What about refresh token strategy? Short-lived access tokens (15min), longer refresh (7 days). Here's why... ``` +**System Notes Exclusion Rule:** +- System notes (is_system=1) are stored in the DB for audit purposes +- System notes are EXCLUDED from discussion documents by default +- This prevents semantic noise ("changed assignee", "added label", "mentioned in") from polluting embeddings +- Debug flag `--include-system-notes` available for troubleshooting + This format preserves: - Parent context (issue/MR title and number) - Project path for scoped search @@ -889,14 +1038,28 @@ This format preserves: - Temporal ordering of the conversation - Full thread semantics for decision traceability -**Truncation:** -If content exceeds 8000 tokens: -**Note:** Token count is approximate (`ceil(charCount / 4)`). Enforce `maxChars = 32000`. +**Truncation (Note-Boundary Aware):** +If content exceeds 8000 tokens (~32000 chars): -1. Truncate from the middle (preserve first + last notes for context) -2. Set `documents.is_truncated = 1` -3. Set `documents.truncated_reason = 'token_limit_middle_drop'` -4. Log a warning with document ID and original token count +**Algorithm:** +1. Count non-system notes in the discussion +2. If total chars ≤ maxChars, no truncation needed +3. Otherwise, drop whole notes from the MIDDLE: + - Preserve first N notes and last M notes + - Never cut mid-note (produces unreadable snippets and worse embeddings) + - Continue dropping middle notes until under maxChars +4. Insert marker: `\n\n[... N notes omitted for length ...]\n\n` +5. Set `documents.is_truncated = 1` +6. Set `documents.truncated_reason = 'token_limit_middle_drop'` +7. 
Log a warning with document ID and original/truncated token count + +**Why note-boundary truncation:** +- Cutting mid-note produces unreadable snippets ("...the authentication flow because--") +- Keeping whole notes preserves semantic coherence for embeddings +- First notes contain context/problem statement; last notes contain conclusions +- Middle notes are often back-and-forth that's less critical + +**Token estimation:** `approxTokens = ceil(charCount / 4)`. No tokenizer dependency. This metadata enables: - Monitoring truncation frequency in production @@ -954,14 +1117,14 @@ tests/e2e/golden-queries.test.ts | `gi search "authentication" --author=johndoe` | Filtered by author | All results have @johndoe | | `gi search "authentication" --after=2024-01-01` | Date filtered | All results after date | | `gi search "authentication" --label=bug` | Label filtered | All results have bug label | -| `gi search "redis" --mode=lexical` | FTS results only | Works without Ollama | +| `gi search "redis" --mode=lexical` | FTS results only | Shows FTS results, no embeddings | | `gi search "auth" --path=src/auth/` | Path-filtered results | Only results referencing files in src/auth/ | | `gi search "authentication" --json` | JSON output | Valid JSON matching stable schema | | `gi search "authentication" --explain` | Rank breakdown | Shows vector/FTS/RRF contributions | | `gi search "authentication" --limit=5` | 5 results max | Returns at most 5 results | | `gi search "xyznonexistent123"` | No results message | Graceful empty state | -| `gi search "auth"` (no data synced) | No data message | Shows "Run gi sync first" | -| `gi search "auth"` (Ollama stopped) | FTS results + warning | Shows warning, still returns results | +| `gi search "authentication"` (no data synced) | No data message | Shows "Run gi sync first" | +| `gi search "authentication"` (Ollama stopped) | FTS results + warning | Shows warning, still returns results | **Golden Query Test Suite:** Create 
`tests/fixtures/golden-queries.json` with 10 queries and expected URLs: @@ -991,8 +1154,12 @@ Each query must have at least one expected URL appear in top 10 results. - Result ranking and scoring (document-level) - Search filters: `--type=issue|mr|discussion`, `--author=username`, `--after=date`, `--label=name`, `--project=path`, `--path=file`, `--limit=N` - `--limit=N` controls result count (default: 20, max: 100) - - `--path` filters documents by referenced file paths (from DiffNote positions) - - MVP: substring/exact match; glob patterns deferred + - `--path` filters documents by referenced file paths (from DiffNote positions): + - If `--path` ends with `/`: prefix match (`path LIKE 'src/auth/%'`) + - Otherwise: exact match OR prefix on directory boundary + - Examples: `--path=src/auth/` matches `src/auth/login.ts`, `src/auth/utils/helpers.ts` + - Examples: `--path=src/auth/login.ts` matches only that exact file + - Glob patterns deferred to post-MVP - Label filtering operates on `document_labels` (indexed, exact-match) - Filters work identically in hybrid and lexical modes - Debug: `--explain` returns rank contributions from vector + FTS + RRF @@ -1005,51 +1172,25 @@ Each query must have at least one expected URL appear in top 10 results. 
- Filters exclude all results: `No results match the specified filters.` - Helpful hints shown in non-JSON mode (e.g., "Try broadening your search") -**Schema Additions:** -```sql --- Full-text search for hybrid retrieval --- Using porter stemmer for better matching of word variants -CREATE VIRTUAL TABLE documents_fts USING fts5( - title, - content_text, - content='documents', - content_rowid='id', - tokenize='porter unicode61' -); - --- Triggers to keep FTS in sync -CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN - INSERT INTO documents_fts(rowid, title, content_text) - VALUES (new.id, new.title, new.content_text); -END; - -CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN - INSERT INTO documents_fts(documents_fts, rowid, title, content_text) - VALUES('delete', old.id, old.title, old.content_text); -END; - -CREATE TRIGGER documents_au AFTER UPDATE ON documents BEGIN - INSERT INTO documents_fts(documents_fts, rowid, title, content_text) - VALUES('delete', old.id, old.title, old.content_text); - INSERT INTO documents_fts(rowid, title, content_text) - VALUES (new.id, new.title, new.content_text); -END; -``` - -**FTS5 Tokenizer Notes:** -- `porter` enables stemming (searching "authentication" matches "authenticating", "authenticated") -- `unicode61` handles Unicode properly -- Code identifiers (snake_case, camelCase, file paths) may not tokenize ideally; post-MVP consideration for custom tokenizer - **Hybrid Search Algorithm (MVP) - Reciprocal Rank Fusion:** -1. Query both vector index (top 50) and FTS5 (top 50) -2. Merge results by document_id -3. Combine with Reciprocal Rank Fusion (RRF): +1. Determine recall size (adaptive based on filters): + - `baseTopK = 50` + - If any filters present (--project, --type, --author, --label, --path, --after): `topK = 200` + - This prevents "no results" when relevant docs exist outside top-50 unfiltered recall +2. 
Query both vector index (top topK) and FTS5 (top topK) + - Apply SQL-expressible filters during retrieval when possible (project_id, author_username, source_type) +3. Merge results by document_id +4. Combine with Reciprocal Rank Fusion (RRF): - For each retriever list, assign ranks (1..N) - - `rrf_score = Σ 1 / (k + rank)` with k=60 (tunable) + - `rrfScore = Σ 1 / (k + rank)` with k=60 (tunable) - RRF is simpler than weighted sums and doesn't require score normalization -4. Apply filters (type, author, date, label) -5. Return top K +5. Apply remaining filters (date ranges, labels, paths that weren't applied in SQL) +6. Return top K results + +**Why Adaptive Recall:** +- Fixed top-50 + filter can easily return 0 results even when relevant docs exist +- Increasing recall when filters are present catches more candidates before filtering +- SQL-level filtering is preferred (faster, uses indexes) but not always possible **Why RRF over Weighted Sums:** - FTS5 BM25 scores and vector distances use different scales @@ -1125,17 +1266,23 @@ interface SearchResult { author: string | null; createdAt: string; // ISO 8601 updatedAt: string; // ISO 8601 - score: number; // 0-1 normalized RRF score + score: number; // normalized 0-1 (rrfScore / maxRrfScore in this result set) snippet: string; // truncated content_text labels: string[]; // Only present with --explain flag explain?: { vectorRank?: number; // null if not in vector results ftsRank?: number; // null if not in FTS results - rrfScore: number; + rrfScore: number; // raw RRF score (rank-based, comparable within a query) }; } +// Note on score normalization: +// - `score` is normalized 0-1 for UI display convenience +// - Normalization is per-query (score = rrfScore / max(rrfScore) in this result set) +// - Use `explain.rrfScore` for raw scores when comparing across queries +// - Scores are NOT comparable across different queries + interface SearchResponse { query: string; mode: "hybrid" | "lexical" | "semantic"; @@ -1186,7 
+1333,7 @@ tests/integration/sync-recovery.test.ts | `gi sync` (no changes) | `0 issues, 0 MRs updated` | Fast completion, no API calls beyond cursor check | | `gi sync` (after GitLab change) | `1 issue updated, 3 discussions refetched` | Detects and syncs the change | | `gi sync --full` | Full sync progress | Resets cursors, fetches everything | -| `gi sync-status` | Cursor positions, last sync time | Shows current state | +| `gi sync-status` | Last sync time, cursor positions | Shows current state | | `gi sync` (with rate limit) | Backoff messages | Respects rate limits, completes eventually | | `gi search "new content"` (after sync) | Returns new content | New content is searchable | @@ -1262,13 +1409,25 @@ gi sync-status **Orchestration steps (in order):** 1. Acquire app lock with heartbeat -2. Ingest delta (issues, MRs, discussions) based on cursors - - During ingestion, INSERT into `dirty_sources` for each upserted entity -3. Apply rolling backfill window -4. Regenerate documents for entities in `dirty_sources` (process + delete from queue) -5. Embed documents with changed content_hash -6. FTS triggers auto-sync (no explicit step needed) -7. Release lock, record sync_run as succeeded +2. Ingest delta (issues, MRs) based on cursors + - For each upserted issue/MR, enqueue into `pending_discussion_fetches` + - INSERT into `dirty_sources` for each upserted issue/MR +3. Process `pending_discussion_fetches` queue (bounded per run, retryable): + - Fetch discussions for each queued parent + - On success: upsert discussions/notes, INSERT into `dirty_sources`, DELETE from queue + - On failure: increment `attempt_count`, record `last_error`, leave in queue for retry + - Bound processing: max N parents per sync run to avoid unbounded API calls +4. Apply rolling backfill window +5. Regenerate documents for entities in `dirty_sources` (process + delete from queue) +6. Embed documents with changed content_hash +7. FTS triggers auto-sync (no explicit step needed) +8. 
Release lock, record sync_run as succeeded + +**Why queue-based discussion fetching:** +- One pathological MR thread (huge pagination, 5xx errors, permission issues) shouldn't block the entire sync +- Primary resource cursors can advance independently +- Discussions can be retried without re-fetching all issues/MRs +- Bounded processing prevents unbounded API calls per sync run Individual commands remain available for checkpoint testing and debugging: - `gi ingest --type=issues` @@ -1298,7 +1457,8 @@ All commands support `--help` for detailed usage information. |---------|-----|-------------| | `gi ingest --type=issues` | 1 | Fetch issues from GitLab | | `gi ingest --type=merge_requests` | 2 | Fetch MRs and discussions | -| `gi embed --all` | 3 | Generate embeddings for all documents | +| `gi generate-docs` | 3A | Extract documents from issues/MRs/discussions | +| `gi embed --all` | 3B | Generate embeddings for all documents | | `gi embed --retry-failed` | 3 | Retry failed embeddings | | `gi sync` | 5 | Full sync orchestration (ingest + docs + embed) | | `gi sync --full` | 5 | Force complete re-sync (reset cursors) | @@ -1310,7 +1470,7 @@ All commands support `--help` for detailed usage information. | Command | CP | Description | |---------|-----|-------------| | `gi list issues [--limit=N] [--project=PATH]` | 1 | List issues | -| `gi list mrs [--limit=N]` | 2 | List merge requests | +| `gi list mrs --limit=N` | 2 | List merge requests | | `gi count issues` | 1 | Count issues | | `gi count mrs` | 2 | Count merge requests | | `gi count discussions --type=issue` | 1 | Count issue discussions | @@ -1318,8 +1478,8 @@ All commands support `--help` for detailed usage information. 
| `gi count discussions --type=mr` | 2 | Count MR discussions |
 | `gi count notes --type=issue` | 1 | Count issue notes (excluding system) |
 | `gi count notes` | 2 | Count all notes (excluding system) |
-| `gi show issue <iid>` | 1 | Show issue details |
-| `gi show mr <iid>` | 2 | Show MR details with discussions |
+| `gi show issue <iid> [--project=PATH]` | 1 | Show issue details (prompts if iid ambiguous across projects) |
+| `gi show mr <iid> [--project=PATH]` | 2 | Show MR details with discussions |
 | `gi stats` | 3B | Embedding coverage statistics |
 | `gi stats --json` | 3B | JSON stats for scripting |
 | `gi sync-status` | 1 | Show cursor positions and last sync |
@@ -1401,6 +1561,8 @@ Common errors and their resolutions:
 | **Disk full during write** | Fails with clear error. Cursor preserved at last successful commit. Free space and resume. |
 | **Stale lock detected** | Lock held > 10 minutes without heartbeat is considered stale. Next sync auto-recovers. |
 | **Network interruption** | Retries with exponential backoff. After max retries, sync fails but cursor is preserved. |
+| **Embedding permanent failure** | After 3 retries, document stays in `embedding_metadata` with `last_error` populated. Use `gi embed --retry-failed` to retry later, or `gi stats` to see failed count. Documents with failed embeddings are excluded from vector search but included in FTS. |
+| **Orphaned records** | MVP: No automatic cleanup. `last_seen_at` field enables future detection of items deleted in GitLab. Post-MVP: `gi gc --dry-run` to identify orphans, `gi gc --confirm` to remove. |
 
 ---
 
@@ -1506,7 +1668,7 @@ CREATE TABLE note_positions (
   new_line INTEGER,
   position_type TEXT -- 'text' | 'image' | etc. 
); -CREATE INDEX idx_note_positions_new_path ON note_positions(position_new_path); +CREATE INDEX idx_note_positions_new_path ON note_positions(new_path); ``` --- @@ -1535,6 +1697,8 @@ Each checkpoint includes: | Search quality | Hybrid (vector + FTS5) retrieval with RRF, golden query test suite | | Concurrent sync corruption | DB lock + heartbeat + rolling backfill, automatic stale lock recovery | | Embedding failures | Per-document error tracking, retry with backoff, targeted re-runs | +| Pathological discussions | Queue-based discussion fetching; one bad thread doesn't block entire sync | +| Empty search results with filters | Adaptive recall (topK 50→200 when filtered) | **SQLite Performance Defaults (MVP):** - Enable `PRAGMA journal_mode=WAL;` on every connection @@ -1552,23 +1716,24 @@ Each checkpoint includes: | sync_runs | 0 | Audit trail of sync operations (with heartbeat) | | app_locks | 0 | Crash-safe single-flight lock | | sync_cursors | 0 | Resumable sync state per primary resource | -| raw_payloads | 0 | Decoupled raw JSON storage (with project_id) | +| raw_payloads | 0 | Decoupled raw JSON storage (gitlab_id as TEXT) | | schema_version | 0 | Database migration version tracking | | issues | 1 | Normalized issues (unique by project+iid) | | labels | 1 | Label definitions (unique by project + name) | | issue_labels | 1 | Issue-label junction | -| merge_requests | 2 | Normalized MRs (unique by project+iid) | | discussions | 1 | Discussion threads (issue discussions in CP1, MR discussions in CP2) | | notes | 1 | Individual comments with is_system flag (DiffNote paths added in CP2) | +| merge_requests | 2 | Normalized MRs (unique by project+iid) | | mr_labels | 2 | MR-label junction | -| documents | 3 | Unified searchable documents with truncation metadata | -| document_labels | 3 | Document-label junction for fast filtering | -| document_paths | 3 | Fast path filtering for documents (DiffNote file paths) | -| dirty_sources | 3 | Queue for incremental 
document regeneration | -| embeddings | 3 | Vector embeddings (sqlite-vss, rowid=document_id) | -| embedding_metadata | 3 | Embedding provenance + error tracking | -| documents_fts | 4 | Full-text search index (fts5 with porter stemmer) | -| mr_files | 6 | MR file changes (deferred to File History feature) | +| documents | 3A | Unified searchable documents with truncation metadata | +| document_labels | 3A | Document-label junction for fast filtering | +| document_paths | 3A | Fast path filtering for documents (DiffNote file paths) | +| dirty_sources | 3A | Queue for incremental document regeneration | +| pending_discussion_fetches | 3A | Resumable queue for dependent discussion fetching | +| documents_fts | 3A | Full-text search index (fts5 with porter stemmer) | +| embeddings | 3B | Vector embeddings (sqlite-vss, rowid=document_id) | +| embedding_metadata | 3B | Embedding provenance + error tracking | +| mr_files | 6 | MR file changes (deferred to post-MVP) | --- @@ -1584,10 +1749,10 @@ Each checkpoint includes: | Labels uniqueness | **By (project_id, name)** | GitLab API returns labels as strings | | Sync method | **Polling only for MVP** | Webhooks add complexity; polling every 10 min is sufficient | | Sync safety | **DB lock + heartbeat + rolling backfill** | Prevents race conditions and missed updates | -| Discussions sync | **Dependent resource model** | Discussions API is per-parent; refetch all when parent updates | +| Discussions sync | **Resumable queue model** | Queue-based fetching allows one pathological thread to not block entire sync | | Hybrid ranking | **RRF over weighted sums** | Simpler, no score normalization needed | | Embedding rowid | **rowid = documents.id** | Eliminates fragile rowid mapping | -| Embedding truncation | **8000 tokens, truncate middle** | Preserve first/last notes for context | +| Embedding truncation | **Note-boundary aware middle drop** | Never cut mid-note; preserves semantic coherence | | Embedding batching | **32 
docs/batch, 4 concurrent workers** | Balance throughput, memory, and error isolation | | FTS5 tokenizer | **porter unicode61** | Stemming improves recall | | Ollama unavailable | **Graceful degradation to FTS5** | Search still works without semantic matching | @@ -1596,6 +1761,14 @@ Each checkpoint includes: | `gi init` validation | **Validate GitLab before writing config** | Fail fast, better UX | | Ctrl+C handling | **Graceful shutdown** | Finish page, commit cursor, exit cleanly | | Empty state UX | **Actionable messages** | Guide user to next step | +| raw_payloads.gitlab_id | **TEXT not INTEGER** | Discussion IDs are strings; numeric IDs stored as strings | +| GitLab list params | **Always scope=all&state=all** | Ensures all historical data including closed items | +| Pagination | **X-Next-Page headers with empty-page fallback** | Headers are more robust than empty-page detection | +| Integration tests | **Mocked by default, live tests optional** | Deterministic CI; live tests gated by GITLAB_LIVE_TESTS=1 | +| Search recall with filters | **Adaptive topK (50→200 when filtered)** | Prevents "no results" when relevant docs exist outside top-50 | +| RRF score normalization | **Per-query normalized 0-1** | score = rrfScore / max(rrfScore); raw score in explain | +| --path semantics | **Trailing / = prefix match** | `--path=src/auth/` does prefix; otherwise exact match | +| CP3 structure | **Split into 3A (FTS) and 3B (embeddings)** | Lexical search works before embedding infra risk | ---