From f4dba386c9aa6018fe197578e325633de137e8fa Mon Sep 17 00:00:00 2001 From: Taylor Eernisse Date: Thu, 29 Jan 2026 08:42:39 -0500 Subject: [PATCH] docs: Restructure checkpoint-3 PRD with gated milestones Reorganizes the Search & Sync MVP plan into three independently verifiable gates (A: Lexical MVP, B: Hybrid MVP, C: Sync MVP) to reduce integration risk. Each gate has explicit deliverables, acceptance criteria, and can ship on its own. Expands the specification with additional detail on document generation, search API surface, sync orchestration, and integrity repair paths. Removes the outdated rename note since the project is now fully migrated to gitlore/lore naming. Co-Authored-By: Claude Opus 4.5 --- docs/prd/checkpoint-3.md | 449 ++++++++++++++++++++++++++------------- 1 file changed, 307 insertions(+), 142 deletions(-) diff --git a/docs/prd/checkpoint-3.md b/docs/prd/checkpoint-3.md index 2457096..9a23033 100644 --- a/docs/prd/checkpoint-3.md +++ b/docs/prd/checkpoint-3.md @@ -1,7 +1,5 @@ # Checkpoint 3: Search & Sync MVP -> **Note:** The project was renamed from "gitlab-inbox" to "gitlore" and the CLI from "gi" to "lore". References to "gi" in this document should be read as "lore". - > **Status:** Planning > **Prerequisite:** Checkpoints 0, 1, 2 complete (issues, MRs, discussions ingested) > **Goal:** Deliver working semantic + lexical hybrid search with efficient incremental sync @@ -17,13 +15,27 @@ All code integrates with existing `gitlore` infrastructure: --- -## Executive Summary +## Executive Summary (Gated Milestones) + +This checkpoint ships in three gates to reduce integration risk. Each gate is independently verifiable and shippable: + +**Gate A (Lexical MVP):** documents + FTS + filters + `lore search --mode=lexical` + `lore stats` +**Gate B (Hybrid MVP):** embeddings + vector + RRF fusion + graceful degradation +**Gate C (Sync MVP):** `lore sync` orchestration + queues/backoff + integrity check/repair **Deliverables:** + +**Gate A** 1. 
Document generation from issues/MRs/discussions with FTS5 indexing -2. Ollama-powered embedding pipeline with sqlite-vec storage -3. Hybrid search (RRF-ranked vector + lexical) with rich filtering -4. Orchestrated `gi sync` command with incremental re-embedding +2. Lexical search + filters + snippets + `lore stats` + +**Gate B** +3. Ollama-powered embedding pipeline with sqlite-vec storage +4. Hybrid search (RRF-ranked vector + lexical) with rich filtering + graceful degradation + +**Gate C** +5. Orchestrated `lore sync` command with incremental doc regen + re-embedding +6. Integrity checks + repair paths for FTS/embeddings consistency **Key Design Decisions:** - Documents are the search unit (not raw entities) @@ -144,15 +156,19 @@ CREATE VIRTUAL TABLE documents_fts USING fts5( prefix='2 3 4' ); --- Keep FTS in sync via triggers +-- Keep FTS in sync via triggers. +-- IMPORTANT: COALESCE(title, '') ensures FTS5 external-content table never +-- receives NULL values, which can cause inconsistencies with delete operations. +-- FTS5 delete requires exact match of original values; NULL != NULL in SQL, +-- so a NULL title on insert would make the delete trigger fail silently. 
CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN INSERT INTO documents_fts(rowid, title, content_text) - VALUES (new.id, new.title, new.content_text); + VALUES (new.id, COALESCE(new.title, ''), new.content_text); END; CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN INSERT INTO documents_fts(documents_fts, rowid, title, content_text) - VALUES('delete', old.id, old.title, old.content_text); + VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text); END; -- Only rebuild FTS when searchable text actually changes (not metadata-only updates) @@ -160,9 +176,9 @@ CREATE TRIGGER documents_au AFTER UPDATE ON documents WHEN old.title IS NOT new.title OR old.content_text != new.content_text BEGIN INSERT INTO documents_fts(documents_fts, rowid, title, content_text) - VALUES('delete', old.id, old.title, old.content_text); + VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text); INSERT INTO documents_fts(rowid, title, content_text) - VALUES (new.id, new.title, new.content_text); + VALUES (new.id, COALESCE(new.title, ''), new.content_text); END; ``` @@ -490,7 +506,7 @@ pub struct NoteContent { --- -### 2.4 CLI: `gi generate-docs` (Incremental by Default) +### 2.4 CLI: `lore generate-docs` (Incremental by Default) **File:** `src/cli/commands/generate_docs.rs` @@ -518,84 +534,83 @@ pub struct GenerateDocsResult { pub skipped: usize, // Unchanged documents } -/// Chunk size for --full mode transactions. +/// Chunk size for --full mode dirty queue seeding. /// Balances throughput against WAL file growth and memory pressure. const FULL_MODE_CHUNK_SIZE: usize = 2000; /// Run document generation (incremental by default). /// +/// IMPORTANT: Both modes use the same regenerator codepath to avoid +/// logic divergence in label/path hashing, deletion semantics, and +/// write-optimization behavior. The only difference is how dirty_sources +/// gets populated. 
+/// /// Incremental mode (default): -/// - Processes only items in dirty_sources queue +/// - Processes only items already in dirty_sources queue /// - Fast for routine syncs /// /// Full mode (--full): -/// - Regenerates ALL documents from scratch -/// - Uses chunked transactions (2k docs/tx) to bound WAL growth +/// - Seeds dirty_sources with ALL source entities in chunks +/// - Drains through the same regenerator pipeline +/// - Uses keyset pagination (WHERE id > last_id) to avoid OFFSET degradation +/// - Final FTS optimize after all chunks complete /// - Use when schema changes or after migration pub fn run_generate_docs( config: &Config, full: bool, project_filter: Option<&str>, ) -> Result { + let conn = open_db(config)?; + if full { - // Full mode: regenerate everything using chunked transactions + // Full mode: seed dirty_sources with all source entities, then drain. + // Uses keyset pagination to avoid O(n²) OFFSET degradation on large tables. // - // Using chunked transactions instead of a single giant transaction: - // - Bounds WAL file growth (single 50k-doc tx could balloon WAL) - // - Reduces memory pressure from statement caches - // - Allows progress reporting between chunks - // - Crash partway through leaves partial but consistent state + // Seeding is chunked to bound WAL growth: + // 1. For each source type (issues, MRs, discussions): + // a. Query next chunk WHERE id > last_id ORDER BY id LIMIT chunk_size + // b. INSERT OR IGNORE each into dirty_sources + // c. Advance last_id = chunk.last().id + // d. Loop until chunk is empty + // 2. Drain dirty_sources through regenerator (same as incremental) + // 3. Final FTS optimize (not full rebuild — triggers handle consistency) // - // Steps per chunk: - // 1. BEGIN IMMEDIATE transaction - // 2. Query next batch of sources (issues/MRs/discussions) - // 3. For each: generate document, compute hash - // 4. Upsert into `documents` table (FTS triggers auto-fire) - // 5. 
Populate `document_labels` and `document_paths` - // 6. COMMIT - // 7. Report progress, loop to next chunk - // - // After all chunks: - // 8. Single final transaction for FTS rebuild: - // INSERT INTO documents_fts(documents_fts) VALUES('rebuild') - // - // Example implementation: - let conn = open_db(config)?; - let mut result = GenerateDocsResult::default(); - let mut offset = 0; + // Benefits of unified codepath: + // - No divergence in label/path hash behavior + // - No divergence in deletion semantics + // - No divergence in write-optimization logic (labels_hash, paths_hash) + // - FTS triggers fire identically in both modes + // Seed issues + let mut last_id: i64 = 0; loop { - // Process issues in chunks - let issues: Vec = query_issues(&conn, project_filter, FULL_MODE_CHUNK_SIZE, offset)?; - if issues.is_empty() { break; } - + let chunk = query_issue_ids_after(&conn, project_filter, FULL_MODE_CHUNK_SIZE, last_id)?; + if chunk.is_empty() { break; } let tx = conn.transaction()?; - for issue in &issues { - let doc = generate_issue_document(issue)?; - upsert_document(&tx, &doc)?; - result.issues += 1; + for id in &chunk { + mark_dirty(&tx, SourceType::Issue, *id)?; } tx.commit()?; - - offset += issues.len(); - // Report progress here if using indicatif + last_id = *chunk.last().unwrap(); } - // Similar chunked loops for MRs and discussions... + // Similar keyset-paginated seeding for MRs and discussions... 
- // Final FTS rebuild in its own transaction - let tx = conn.transaction()?; - tx.execute( - "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')", + // Report: seeding complete, now regenerating + } + + // Both modes: drain dirty_sources through the regenerator + let regen = regenerate_dirty_documents(&conn)?; + + if full { + // FTS optimize after bulk operations (compacts index segments) + conn.execute( + "INSERT INTO documents_fts(documents_fts) VALUES('optimize')", [], )?; - tx.commit()?; - } else { - // Incremental mode: process dirty_sources only - // 1. Query dirty_sources (bounded by LIMIT) - // 2. Regenerate only those documents - // 3. Clear from dirty_sources after processing } + + // Map regen -> GenerateDocsResult stats todo!() } @@ -849,7 +864,7 @@ pub fn search_fts( )?; let results = stmt - .query_map([&safe_query, &limit.to_string()], |row| { + .query_map(rusqlite::params![safe_query, limit as i64], |row| { Ok(FtsResult { document_id: row.get(0)?, rank: row.get(1)?, @@ -897,10 +912,11 @@ pub struct SearchFilters { pub source_type: Option, pub author: Option, pub project_id: Option, - pub after: Option, // ms epoch - pub labels: Vec, // AND logic + pub after: Option, // ms epoch (created_at >=) + pub updated_after: Option, // ms epoch (updated_at >=) + pub labels: Vec, // AND logic pub path: Option, - pub limit: usize, // Default 20, max 100 + pub limit: usize, // Default 20, max 100 } impl SearchFilters { @@ -910,6 +926,7 @@ impl SearchFilters { || self.author.is_some() || self.project_id.is_some() || self.after.is_some() + || self.updated_after.is_some() || !self.labels.is_empty() || self.path.is_some() } @@ -990,6 +1007,11 @@ pub fn apply_filters( params.push(Box::new(after)); } + if let Some(updated_after) = filters.updated_after { + conditions.push("d.updated_at >= ?".into()); + params.push(Box::new(updated_after)); + } + // Labels: AND logic - all labels must be present for label in &filters.labels { conditions.push( @@ -1064,6 +1086,7 
@@ pub fn apply_filters( | `--author` | `author_username` | Exact match | | `--project` | `project_id` | Resolve path to ID | | `--after` | `created_at` | `>= date` (ms epoch) | +| `--updated-after` | `updated_at` | `>= date` (ms epoch), common triage filter | | `--label` | `document_labels` | JOIN, multiple = AND | | `--path` | `document_paths` | JOIN, trailing `/` = prefix | | `--limit` | N/A | Default 20, max 100 | @@ -1072,6 +1095,7 @@ pub fn apply_filters( - [ ] Each filter correctly restricts results - [ ] Multiple `--label` flags use AND logic - [ ] Path prefix vs exact match works correctly +- [ ] `--updated-after` filters on updated_at (not created_at) - [ ] Filters compose (all applied together) - [ ] Ranking order preserved after filtering (ORDER BY position) - [ ] Limit clamped to valid range [1, 100] @@ -1080,7 +1104,7 @@ pub fn apply_filters( --- -### 3.4 CLI: `gi search --mode=lexical` +### 3.4 CLI: `lore search --mode=lexical` **File:** `src/cli/commands/search.rs` @@ -1141,9 +1165,49 @@ pub fn run_search( explain: bool, ) -> Result { // 1. Parse query and filters - // 2. Execute search based on mode - // 3. Apply post-retrieval filters - // 4. Format and return results + // 2. Execute search based on mode -> ranked doc_ids (+ explain ranks) + // 3. Apply post-retrieval filters preserving ranking order + // 4. HYDRATE in one DB round-trip (see hydration query below): + // - documents fields (title, url, created_at, updated_at, content_text) + // - project_path via JOIN projects + // - labels aggregated via json_group_array + // - paths aggregated via json_group_array (optional) + // 5. Attach snippet: + // - prefer FTS snippet when doc hit FTS + // - fallback: truncated content_text via generate_fallback_snippet() + // 6. For --mode=semantic with 0% embedding coverage: + // return early with actionable error message (distinct from "Ollama down") + todo!() +} + +/// Hydration query: fetch all display fields for ranked doc IDs in a single round-trip. 
+/// +/// Uses json_each(?) to preserve ranking order from the search pipeline. +/// Aggregates labels and paths inline to avoid N+1 queries. +/// +/// ```sql +/// SELECT d.id, d.source_type, d.title, d.url, d.author_username, +/// d.created_at, d.updated_at, d.content_text, +/// p.path AS project_path, +/// (SELECT json_group_array(dl.label_name) +/// FROM document_labels dl WHERE dl.document_id = d.id) AS labels, +/// (SELECT json_group_array(dp.path) +/// FROM document_paths dp WHERE dp.document_id = d.id) AS paths +/// FROM json_each(?) AS j +/// JOIN documents d ON d.id = j.value +/// JOIN projects p ON p.id = d.project_id +/// ORDER BY j.key +/// ``` +/// +/// This single query replaces what would otherwise be: +/// - 1 query per document for metadata +/// - 1 query per document for labels +/// - 1 query per document for paths +/// For 20 results, that's 60 queries reduced to 1. +fn hydrate_results( + conn: &Connection, + doc_ids: &[i64], +) -> Result> { todo!() } @@ -1240,6 +1304,10 @@ pub struct SearchArgs { #[arg(long)] after: Option, + /// Filter by updated date (recently active items) + #[arg(long)] + updated_after: Option, + /// Filter by label (can specify multiple) #[arg(long, action = clap::ArgAction::Append)] label: Vec, @@ -1266,12 +1334,15 @@ pub struct SearchArgs { **Acceptance Criteria:** - [ ] Works without Ollama running -- [ ] All filters functional +- [ ] All filters functional (including `--updated-after`) - [ ] Human-readable output with snippets - [ ] Semantic-only results get fallback snippets from content_text +- [ ] Results hydrated in single DB round-trip (no N+1 queries) - [ ] JSON output matches schema - [ ] Empty results show helpful message - [ ] "No data indexed" message if documents table empty +- [ ] `--mode=semantic` with 0% embedding coverage returns actionable error + (distinct from "Ollama unavailable" — tells user to run `lore embed` first) - [ ] `--fts-mode=safe` (default) preserves prefix `*` while escaping special chars 
- [ ] `--fts-mode=raw` passes FTS5 MATCH syntax through unchanged @@ -1535,7 +1606,7 @@ impl GiError { // ... existing mappings ... Self::OllamaUnavailable { .. } => Some("Start Ollama: ollama serve"), Self::OllamaModelNotFound { model } => Some("Pull the model: ollama pull nomic-embed-text"), - Self::EmbeddingFailed { .. } => Some("Check Ollama logs or retry with 'gi embed --retry-failed'"), + Self::EmbeddingFailed { .. } => Some("Check Ollama logs or retry with 'lore embed --retry-failed'"), } } } @@ -1558,6 +1629,7 @@ use crate::embedding::OllamaClient; const BATCH_SIZE: usize = 32; /// SQLite page size for paging through pending documents. +/// Uses keyset paging (id > last_id) to avoid rescanning previously-processed rows. const DB_PAGE_SIZE: usize = 500; /// Expected embedding dimensions for nomic-embed-text model. @@ -1584,11 +1656,16 @@ pub struct EmbedResult { /// Embed documents that need embedding. /// /// Process: -/// 1. Query dirty_sources ordered by queued_at -/// 2. For each: regenerate document, compute new hash -/// 3. ALWAYS upsert document (labels/paths may change even if content_hash unchanged) -/// 4. Track whether content_hash changed (for stats) -/// 5. Delete from dirty_sources (or record error on failure) +/// 1. Select documents needing embeddings: +/// - Pending: missing embedding_metadata row OR content_hash mismatch +/// - RetryFailed: embedding_metadata.last_error IS NOT NULL +/// 2. Page through candidates using keyset pagination (id > last_id) +/// to avoid rescanning already-processed rows +/// 3. Batch texts -> Ollama `/api/embed` with concurrent HTTP requests +/// 4. Write embeddings + embedding_metadata in per-batch transactions +/// 5. Failed batches record `last_error` in embedding_metadata +/// (excluded from Pending selection; retried via RetryFailed) +/// 6. 
Progress reported as (embedded + failed) vs total_pending pub async fn embed_documents( conn: &Connection, client: &OllamaClient, @@ -1605,9 +1682,11 @@ pub async fn embed_documents( return Ok(result); } - // Page through pending documents to avoid loading all into memory + // Page through pending documents using keyset pagination to avoid + // both memory pressure and OFFSET performance degradation. + let mut last_id: i64 = 0; loop { - let pending = find_pending_documents(conn, DB_PAGE_SIZE, selection)?; + let pending = find_pending_documents(conn, DB_PAGE_SIZE, last_id, selection)?; if pending.is_empty() { break; } @@ -1640,6 +1719,11 @@ pub async fn embed_documents( collect_writes(conn, &meta, res, &mut result)?; } + // Advance keyset cursor for next page + if let Some(last) = pending.last() { + last_id = last.id; + } + if let Some(ref cb) = progress_callback { cb(result.embedded + result.failed, total_pending); } @@ -1718,14 +1802,16 @@ fn count_pending_documents(conn: &Connection, selection: EmbedSelection) -> Resu Ok(count) } -/// Find pending documents for embedding. +/// Find pending documents for embedding using keyset pagination. /// -/// IMPORTANT: Uses deterministic ORDER BY d.id to ensure consistent -/// paging behavior. Without ordering, SQLite may return rows in -/// different orders across calls, causing missed or duplicate documents. +/// IMPORTANT: Uses keyset pagination (d.id > last_id) instead of OFFSET. +/// OFFSET degrades O(n²) on large result sets because SQLite must scan +/// and discard all rows before the offset. Keyset pagination is O(log n) per page +/// since the B-tree index seek goes directly to the starting row. 
fn find_pending_documents( conn: &Connection, limit: usize, + last_id: i64, selection: EmbedSelection, ) -> Result> { let sql = match selection { @@ -1733,8 +1819,9 @@ fn find_pending_documents( "SELECT d.id, d.content_text, d.content_hash FROM documents d LEFT JOIN embedding_metadata em ON d.id = em.document_id - WHERE em.document_id IS NULL - OR em.content_hash != d.content_hash + WHERE (em.document_id IS NULL + OR em.content_hash != d.content_hash) + AND d.id > ? ORDER BY d.id LIMIT ?", EmbedSelection::RetryFailed => @@ -1742,13 +1829,14 @@ fn find_pending_documents( FROM documents d JOIN embedding_metadata em ON d.id = em.document_id WHERE em.last_error IS NOT NULL + AND d.id > ? ORDER BY d.id LIMIT ?", }; let mut stmt = conn.prepare(sql)?; let docs = stmt - .query_map([limit], |row| { + .query_map(rusqlite::params![last_id, limit as i64], |row| { Ok(PendingDocument { id: row.get(0)?, content: row.get(1)?, @@ -1827,7 +1915,7 @@ fn record_embedding_error( --- -### 4.5 CLI: `gi embed` +### 4.5 CLI: `lore embed` **File:** `src/cli/commands/embed.rs` @@ -1928,7 +2016,7 @@ pub struct EmbedArgs { --- -### 4.6 CLI: `gi stats` +### 4.6 CLI: `lore stats` **File:** `src/cli/commands/stats.rs` @@ -2034,7 +2122,13 @@ pub struct RepairResult { /// Fixes: /// - Deletes orphaned embeddings (embedding_metadata rows with no matching document) /// - Clears stale embedding_metadata (hash mismatch) so they get re-embedded -/// - Repopulates FTS for documents missing from documents_fts +/// - Rebuilds FTS index from scratch (correct-by-construction) +/// +/// NOTE: FTS repair uses `rebuild` rather than partial row insertion. +/// With `content='documents'` (external-content FTS), partial repopulation +/// via INSERT of missing rows is fragile — if the external content table +/// and FTS content diverge in any way, partial fixes can leave the index +/// in an inconsistent state. A full rebuild is slower but guaranteed correct. 
pub fn run_repair(config: &Config) -> Result { let conn = open_db(config)?; @@ -2061,19 +2155,19 @@ pub fn run_repair(config: &Config) -> Result { [], )?; - // Repopulate FTS for missing documents - let fts_repopulated = conn.execute( - "INSERT INTO documents_fts(rowid, title, content_text) - SELECT id, COALESCE(title, ''), content_text - FROM documents - WHERE id NOT IN (SELECT rowid FROM documents_fts)", + // Rebuild FTS index from scratch — correct-by-construction. + // This re-reads all rows from the external content table (documents) + // and rebuilds the index. Slower than partial fix but guaranteed consistent. + conn.execute( + "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')", [], )?; + let fts_rebuilt = 1; // rebuild is all-or-nothing Ok(RepairResult { orphaned_embeddings_deleted: orphaned_deleted, stale_embeddings_cleared: stale_cleared, - missing_fts_repopulated: fts_repopulated, + missing_fts_repopulated: fts_rebuilt, }) } @@ -2772,7 +2866,7 @@ pub fn record_fetch_error( current_attempt: i64, ) -> Result<()> { let now = now_ms(); - let next_attempt = compute_next_attempt_at(now, current_attempt + 1); + let next_attempt = crate::core::backoff::compute_next_attempt_at(now, current_attempt + 1); conn.execute( "UPDATE pending_discussion_fetches @@ -2786,14 +2880,44 @@ pub fn record_fetch_error( Ok(()) } +// NOTE: Backoff computation uses the shared utility in `src/core/backoff.rs`. +// See Phase 6.X below for the shared implementation. 
+``` + +**Acceptance Criteria:** +- [ ] Updated entities queued for discussion fetch +- [ ] Success removes from queue +- [ ] Failure increments attempt_count and sets next_attempt_at +- [ ] Processing bounded per run (max 100) +- [ ] Exponential backoff uses `next_attempt_at` (index-friendly, no overflow) +- [ ] Backoff computed with jitter to prevent thundering herd + +--- + +### 6.X Shared Backoff Utility + +**File:** `src/core/backoff.rs` + +Single implementation of exponential backoff with jitter, used by both +`dirty_sources` and `pending_discussion_fetches` queues. Living in `src/core/` +because it's a cross-cutting concern used by multiple modules. + +```rust +use rand::Rng; + /// Compute next_attempt_at with exponential backoff and jitter. /// /// Formula: now + min(3600000, 1000 * 2^attempt_count) * (0.9 to 1.1) /// - Capped at 1 hour to prevent runaway delays /// - ±10% jitter prevents synchronized retries after outages +/// +/// Used by: +/// - `dirty_sources` retry scheduling (document regeneration failures) +/// - `pending_discussion_fetches` retry scheduling (API fetch failures) +/// +/// Having one implementation prevents subtle divergence between queues +/// (e.g., different caps or jitter ranges). 
pub fn compute_next_attempt_at(now: i64, attempt_count: i64) -> i64 { - use rand::Rng; - // Cap attempt_count to prevent overflow (2^30 > 1 hour anyway) let capped_attempts = attempt_count.min(30) as u32; let base_delay_ms = 1000_i64.saturating_mul(1 << capped_attempts); @@ -2807,13 +2931,16 @@ pub fn compute_next_attempt_at(now: i64, attempt_count: i64) -> i64 { } ``` +**Update `src/core/mod.rs`:** +```rust +pub mod backoff; // Add to existing modules +``` + **Acceptance Criteria:** -- [ ] Updated entities queued for discussion fetch -- [ ] Success removes from queue -- [ ] Failure increments attempt_count and sets next_attempt_at -- [ ] Processing bounded per run (max 100) -- [ ] Exponential backoff uses `next_attempt_at` (index-friendly, no overflow) -- [ ] Backoff computed with jitter to prevent thundering herd +- [ ] Single implementation shared by both queue retry paths +- [ ] Cap at 1 hour prevents runaway delays +- [ ] Jitter prevents thundering herd after outage recovery +- [ ] Unit tests verify backoff curve and cap behavior --- @@ -2917,19 +3044,36 @@ fn delete_document( } /// Record a regeneration error on a dirty source for retry. +/// +/// IMPORTANT: Sets `next_attempt_at` using the shared backoff utility. +/// Without this, failed items would retry every run (hot-loop), defeating +/// the backoff design documented in the schema. fn record_dirty_error( conn: &Connection, source_type: SourceType, source_id: i64, error: &str, ) -> Result<()> { + let now = now_ms(); + + // Read current attempt_count from DB to compute backoff + let attempt_count: i64 = conn.query_row( + "SELECT attempt_count FROM dirty_sources WHERE source_type = ? 
AND source_id = ?", + rusqlite::params![source_type.as_str(), source_id], + |row| row.get(0), + )?; + + // Use shared backoff utility (same as pending_discussion_fetches) + let next_attempt_at = crate::core::backoff::compute_next_attempt_at(now, attempt_count + 1); + conn.execute( "UPDATE dirty_sources SET attempt_count = attempt_count + 1, last_attempt_at = ?, - last_error = ? + last_error = ?, + next_attempt_at = ? WHERE source_type = ? AND source_id = ?", - rusqlite::params![now_ms(), error, source_type.as_str(), source_id], + rusqlite::params![now, error, next_attempt_at, source_type.as_str(), source_id], )?; Ok(()) } @@ -3080,7 +3224,7 @@ fn get_document_id( --- -### 6.4 CLI: `gi sync` +### 6.4 CLI: `lore sync` **File:** `src/cli/commands/sync.rs` @@ -3198,7 +3342,8 @@ pub struct SyncArgs { | FTS query sanitization | `src/search/fts.rs` (mod tests) | `to_fts_query()` edge cases: `-`, `"`, `:`, `*`, `C++` | | SourceType parsing | `src/documents/extractor.rs` (mod tests) | `parse()` accepts aliases: `mr`, `mrs`, `issue`, etc. | | SearchFilters | `src/search/filters.rs` (mod tests) | `has_any_filter()`, `clamp_limit()` | -| Backoff logic | `src/ingestion/dirty_tracker.rs` (mod tests) | Exponential backoff query timing | +| Backoff logic | `src/core/backoff.rs` (mod tests) | Shared exponential backoff curve, cap, jitter | +| Hydration | `src/cli/commands/search.rs` (mod tests) | Single round-trip, label/path aggregation | ### Integration Tests @@ -3232,25 +3377,29 @@ Each query must have at least one expected URL in top 10 results. 
| Command | Expected | Pass Criteria | |---------|----------|---------------| -| `gi generate-docs` | Progress, count | Completes, count > 0 | -| `gi generate-docs` (re-run) | 0 regenerated | Hash comparison works | -| `gi embed` | Progress, count | Completes, count matches docs | -| `gi embed` (re-run) | 0 embedded | Skips unchanged | -| `gi embed --retry-failed` | Processes failed | Only failed docs processed | -| `gi stats` | Coverage stats | Shows 100% after embed | -| `gi stats` | Queue depths | Shows dirty_sources and pending_discussion_fetches counts | -| `gi search "auth" --mode=lexical` | Results | Works without Ollama | -| `gi search "auth"` | Hybrid results | Vector + FTS combined | -| `gi search "auth"` (Ollama down) | FTS results + warning | Graceful degradation, warning in response | -| `gi search "auth" --explain` | Rank breakdown | Shows vector/FTS/RRF | -| `gi search "auth" --type=mr` | Filtered results | Only MRs | -| `gi search "auth" --type=mrs` | Filtered results | Alias works | -| `gi search "auth" --label=bug` | Filtered results | Only labeled docs | -| `gi search "-DWITH_SSL"` | Results | Leading dash doesn't cause FTS error | -| `gi search 'C++'` | Results | Special chars in query work | -| `gi search "nonexistent123"` | No results | Graceful empty state | -| `gi sync` | Full pipeline | All steps complete | -| `gi sync --no-embed` | Skip embedding | Docs generated, not embedded | +| `lore generate-docs` | Progress, count | Completes, count > 0 | +| `lore generate-docs` (re-run) | 0 regenerated | Hash comparison works | +| `lore embed` | Progress, count | Completes, count matches docs | +| `lore embed` (re-run) | 0 embedded | Skips unchanged | +| `lore embed --retry-failed` | Processes failed | Only failed docs processed | +| `lore stats` | Coverage stats | Shows 100% after embed | +| `lore stats` | Queue depths | Shows dirty_sources and pending_discussion_fetches counts | +| `lore search "auth" --mode=lexical` | Results | Works without 
Ollama | +| `lore search "auth"` | Hybrid results | Vector + FTS combined | +| `lore search "auth"` (Ollama down) | FTS results + warning | Graceful degradation, warning in response | +| `lore search "auth" --explain` | Rank breakdown | Shows vector/FTS/RRF | +| `lore search "auth" --type=mr` | Filtered results | Only MRs | +| `lore search "auth" --type=mrs` | Filtered results | Alias works | +| `lore search "auth" --label=bug` | Filtered results | Only labeled docs | +| `lore search "-DWITH_SSL"` | Results | Leading dash doesn't cause FTS error | +| `lore search 'C++'` | Results | Special chars in query work | +| `lore search "auth" --updated-after 2024-01-01` | Filtered results | Only recently updated docs | +| `lore search "nonexistent123"` | No results | Graceful empty state | +| `lore search "auth" --mode=semantic` (no embeddings) | Actionable error | Tells user to run `lore embed` first | +| `lore sync` | Full pipeline | All steps complete | +| `lore sync --no-embed` | Skip embedding | Docs generated, not embedded | +| `lore generate-docs --full` | Progress, count | Keyset pagination completes without OFFSET degradation | +| `lore stats --check --repair` | Repair results | FTS rebuilt, orphans cleaned | --- @@ -3274,43 +3423,59 @@ Each query must have at least one expected URL in top 10 results. ## Success Criteria -Checkpoint 3 is complete when: +Checkpoint 3 is complete when all three gates pass: + +### Gate A: Lexical MVP 1. **Lexical search works without Ollama** - - `gi search "query" --mode=lexical` returns relevant results - - All filters functional + - `lore search "query" --mode=lexical` returns relevant results + - All filters functional (including `--updated-after`) - FTS5 syntax errors prevented by query sanitization - Special characters in queries work correctly (`-DWITH_SSL`, `C++`) + - Search results hydrated in single DB round-trip (no N+1) -2. 
**Semantic search works with Ollama** - - `gi embed` completes successfully - - `gi search "query"` returns semantically relevant results +2. **Document generation is correct** + - Full and incremental modes use the same regenerator codepath + - `--full` uses keyset pagination (no OFFSET degradation) + - FTS triggers use COALESCE for NULL-safe operation + +### Gate B: Hybrid MVP + +3. **Semantic search works with Ollama** + - `lore embed` completes successfully + - `lore search "query"` returns semantically relevant results - `--explain` shows ranking breakdown + - `--mode=semantic` with 0% embedding coverage returns actionable error -3. **Hybrid search combines both** +4. **Hybrid search combines both** - Documents appearing in both retrievers rank higher - Graceful degradation when Ollama unavailable (falls back to FTS) - Transient embed failures don't fail the entire search - Warning message included in response on degradation + - Embedding pipeline uses keyset pagination for consistent paging -4. **Incremental sync is efficient** - - `gi sync` only processes changed entities +### Gate C: Sync MVP + +5. **Incremental sync is efficient** + - `lore sync` only processes changed entities - Re-embedding only happens for changed documents - Progress visible during long syncs - - Queue backoff prevents hot-loop retries on persistent failures + - Queue backoff actually prevents hot-loop retries (both queues set `next_attempt_at`) + - Shared backoff utility ensures consistent behavior across queues -5. **Data integrity maintained** +6. **Data integrity maintained** - All counts match between tables - No orphaned records - Hashes consistent - `get_existing_hash()` properly distinguishes "not found" from DB errors + - `--repair` uses FTS `rebuild` for correct-by-construction repair -6. **Observability** - - `gi stats` shows queue depths and failed item counts +7. 
**Observability** + - `lore stats` shows queue depths and failed item counts - Failed items visible for operator intervention - Deterministic ordering ensures consistent paging -7. **Tests pass** - - Unit tests for core algorithms (including FTS sanitization, backoff) +8. **Tests pass** + - Unit tests for core algorithms (including FTS sanitization, shared backoff, hydration) - Integration tests for pipelines - Golden queries return expected results