diff --git a/docs/prd/checkpoint-3.md b/docs/prd/checkpoint-3.md index 2457096..9a23033 100644 --- a/docs/prd/checkpoint-3.md +++ b/docs/prd/checkpoint-3.md @@ -1,7 +1,5 @@ # Checkpoint 3: Search & Sync MVP -> **Note:** The project was renamed from "gitlab-inbox" to "gitlore" and the CLI from "gi" to "lore". References to "gi" in this document should be read as "lore". - > **Status:** Planning > **Prerequisite:** Checkpoints 0, 1, 2 complete (issues, MRs, discussions ingested) > **Goal:** Deliver working semantic + lexical hybrid search with efficient incremental sync @@ -17,13 +15,27 @@ All code integrates with existing `gitlore` infrastructure: --- -## Executive Summary +## Executive Summary (Gated Milestones) + +This checkpoint ships in three gates to reduce integration risk. Each gate is independently verifiable and shippable: + +**Gate A (Lexical MVP):** documents + FTS + filters + `lore search --mode=lexical` + `lore stats` +**Gate B (Hybrid MVP):** embeddings + vector + RRF fusion + graceful degradation +**Gate C (Sync MVP):** `lore sync` orchestration + queues/backoff + integrity check/repair **Deliverables:** + +**Gate A** 1. Document generation from issues/MRs/discussions with FTS5 indexing -2. Ollama-powered embedding pipeline with sqlite-vec storage -3. Hybrid search (RRF-ranked vector + lexical) with rich filtering -4. Orchestrated `gi sync` command with incremental re-embedding +2. Lexical search + filters + snippets + `lore stats` + +**Gate B** +3. Ollama-powered embedding pipeline with sqlite-vec storage +4. Hybrid search (RRF-ranked vector + lexical) with rich filtering + graceful degradation + +**Gate C** +5. Orchestrated `lore sync` command with incremental doc regen + re-embedding +6. 
Integrity checks + repair paths for FTS/embeddings consistency **Key Design Decisions:** - Documents are the search unit (not raw entities) @@ -144,15 +156,19 @@ CREATE VIRTUAL TABLE documents_fts USING fts5( prefix='2 3 4' ); --- Keep FTS in sync via triggers +-- Keep FTS in sync via triggers. +-- IMPORTANT: COALESCE(title, '') ensures FTS5 external-content table never +-- receives NULL values, which can cause inconsistencies with delete operations. +-- FTS5 delete requires exact match of original values; NULL != NULL in SQL, +-- so a NULL title on insert would make the delete trigger fail silently. CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN INSERT INTO documents_fts(rowid, title, content_text) - VALUES (new.id, new.title, new.content_text); + VALUES (new.id, COALESCE(new.title, ''), new.content_text); END; CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN INSERT INTO documents_fts(documents_fts, rowid, title, content_text) - VALUES('delete', old.id, old.title, old.content_text); + VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text); END; -- Only rebuild FTS when searchable text actually changes (not metadata-only updates) @@ -160,9 +176,9 @@ CREATE TRIGGER documents_au AFTER UPDATE ON documents WHEN old.title IS NOT new.title OR old.content_text != new.content_text BEGIN INSERT INTO documents_fts(documents_fts, rowid, title, content_text) - VALUES('delete', old.id, old.title, old.content_text); + VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text); INSERT INTO documents_fts(rowid, title, content_text) - VALUES (new.id, new.title, new.content_text); + VALUES (new.id, COALESCE(new.title, ''), new.content_text); END; ``` @@ -490,7 +506,7 @@ pub struct NoteContent { --- -### 2.4 CLI: `gi generate-docs` (Incremental by Default) +### 2.4 CLI: `lore generate-docs` (Incremental by Default) **File:** `src/cli/commands/generate_docs.rs` @@ -518,84 +534,83 @@ pub struct GenerateDocsResult { pub skipped: usize, // 
Unchanged documents } -/// Chunk size for --full mode transactions. +/// Chunk size for --full mode dirty queue seeding. /// Balances throughput against WAL file growth and memory pressure. const FULL_MODE_CHUNK_SIZE: usize = 2000; /// Run document generation (incremental by default). /// +/// IMPORTANT: Both modes use the same regenerator codepath to avoid +/// logic divergence in label/path hashing, deletion semantics, and +/// write-optimization behavior. The only difference is how dirty_sources +/// gets populated. +/// /// Incremental mode (default): -/// - Processes only items in dirty_sources queue +/// - Processes only items already in dirty_sources queue /// - Fast for routine syncs /// /// Full mode (--full): -/// - Regenerates ALL documents from scratch -/// - Uses chunked transactions (2k docs/tx) to bound WAL growth +/// - Seeds dirty_sources with ALL source entities in chunks +/// - Drains through the same regenerator pipeline +/// - Uses keyset pagination (WHERE id > last_id) to avoid OFFSET degradation +/// - Final FTS optimize after all chunks complete /// - Use when schema changes or after migration pub fn run_generate_docs( config: &Config, full: bool, project_filter: Option<&str>, ) -> Result { + let conn = open_db(config)?; + if full { - // Full mode: regenerate everything using chunked transactions + // Full mode: seed dirty_sources with all source entities, then drain. + // Uses keyset pagination to avoid O(n²) OFFSET degradation on large tables. // - // Using chunked transactions instead of a single giant transaction: - // - Bounds WAL file growth (single 50k-doc tx could balloon WAL) - // - Reduces memory pressure from statement caches - // - Allows progress reporting between chunks - // - Crash partway through leaves partial but consistent state + // Seeding is chunked to bound WAL growth: + // 1. For each source type (issues, MRs, discussions): + // a. Query next chunk WHERE id > last_id ORDER BY id LIMIT chunk_size + // b. 
INSERT OR IGNORE each into dirty_sources + // c. Advance last_id = chunk.last().id + // d. Loop until chunk is empty + // 2. Drain dirty_sources through regenerator (same as incremental) + // 3. Final FTS optimize (not full rebuild — triggers handle consistency) // - // Steps per chunk: - // 1. BEGIN IMMEDIATE transaction - // 2. Query next batch of sources (issues/MRs/discussions) - // 3. For each: generate document, compute hash - // 4. Upsert into `documents` table (FTS triggers auto-fire) - // 5. Populate `document_labels` and `document_paths` - // 6. COMMIT - // 7. Report progress, loop to next chunk - // - // After all chunks: - // 8. Single final transaction for FTS rebuild: - // INSERT INTO documents_fts(documents_fts) VALUES('rebuild') - // - // Example implementation: - let conn = open_db(config)?; - let mut result = GenerateDocsResult::default(); - let mut offset = 0; + // Benefits of unified codepath: + // - No divergence in label/path hash behavior + // - No divergence in deletion semantics + // - No divergence in write-optimization logic (labels_hash, paths_hash) + // - FTS triggers fire identically in both modes + // Seed issues + let mut last_id: i64 = 0; loop { - // Process issues in chunks - let issues: Vec = query_issues(&conn, project_filter, FULL_MODE_CHUNK_SIZE, offset)?; - if issues.is_empty() { break; } - + let chunk = query_issue_ids_after(&conn, project_filter, FULL_MODE_CHUNK_SIZE, last_id)?; + if chunk.is_empty() { break; } let tx = conn.transaction()?; - for issue in &issues { - let doc = generate_issue_document(issue)?; - upsert_document(&tx, &doc)?; - result.issues += 1; + for id in &chunk { + mark_dirty(&tx, SourceType::Issue, *id)?; } tx.commit()?; - - offset += issues.len(); - // Report progress here if using indicatif + last_id = *chunk.last().unwrap(); } - // Similar chunked loops for MRs and discussions... + // Similar keyset-paginated seeding for MRs and discussions... 
- // Final FTS rebuild in its own transaction - let tx = conn.transaction()?; - tx.execute( - "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')", + // Report: seeding complete, now regenerating + } + + // Both modes: drain dirty_sources through the regenerator + let regen = regenerate_dirty_documents(&conn)?; + + if full { + // FTS optimize after bulk operations (compacts index segments) + conn.execute( + "INSERT INTO documents_fts(documents_fts) VALUES('optimize')", [], )?; - tx.commit()?; - } else { - // Incremental mode: process dirty_sources only - // 1. Query dirty_sources (bounded by LIMIT) - // 2. Regenerate only those documents - // 3. Clear from dirty_sources after processing } + + // Map regen -> GenerateDocsResult stats todo!() } @@ -849,7 +864,7 @@ pub fn search_fts( )?; let results = stmt - .query_map([&safe_query, &limit.to_string()], |row| { + .query_map(rusqlite::params![safe_query, limit as i64], |row| { Ok(FtsResult { document_id: row.get(0)?, rank: row.get(1)?, @@ -897,10 +912,11 @@ pub struct SearchFilters { pub source_type: Option, pub author: Option, pub project_id: Option, - pub after: Option, // ms epoch - pub labels: Vec, // AND logic + pub after: Option, // ms epoch (created_at >=) + pub updated_after: Option, // ms epoch (updated_at >=) + pub labels: Vec, // AND logic pub path: Option, - pub limit: usize, // Default 20, max 100 + pub limit: usize, // Default 20, max 100 } impl SearchFilters { @@ -910,6 +926,7 @@ impl SearchFilters { || self.author.is_some() || self.project_id.is_some() || self.after.is_some() + || self.updated_after.is_some() || !self.labels.is_empty() || self.path.is_some() } @@ -990,6 +1007,11 @@ pub fn apply_filters( params.push(Box::new(after)); } + if let Some(updated_after) = filters.updated_after { + conditions.push("d.updated_at >= ?".into()); + params.push(Box::new(updated_after)); + } + // Labels: AND logic - all labels must be present for label in &filters.labels { conditions.push( @@ -1064,6 +1086,7 
@@ pub fn apply_filters( | `--author` | `author_username` | Exact match | | `--project` | `project_id` | Resolve path to ID | | `--after` | `created_at` | `>= date` (ms epoch) | +| `--updated-after` | `updated_at` | `>= date` (ms epoch), common triage filter | | `--label` | `document_labels` | JOIN, multiple = AND | | `--path` | `document_paths` | JOIN, trailing `/` = prefix | | `--limit` | N/A | Default 20, max 100 | @@ -1072,6 +1095,7 @@ pub fn apply_filters( - [ ] Each filter correctly restricts results - [ ] Multiple `--label` flags use AND logic - [ ] Path prefix vs exact match works correctly +- [ ] `--updated-after` filters on updated_at (not created_at) - [ ] Filters compose (all applied together) - [ ] Ranking order preserved after filtering (ORDER BY position) - [ ] Limit clamped to valid range [1, 100] @@ -1080,7 +1104,7 @@ pub fn apply_filters( --- -### 3.4 CLI: `gi search --mode=lexical` +### 3.4 CLI: `lore search --mode=lexical` **File:** `src/cli/commands/search.rs` @@ -1141,9 +1165,49 @@ pub fn run_search( explain: bool, ) -> Result { // 1. Parse query and filters - // 2. Execute search based on mode - // 3. Apply post-retrieval filters - // 4. Format and return results + // 2. Execute search based on mode -> ranked doc_ids (+ explain ranks) + // 3. Apply post-retrieval filters preserving ranking order + // 4. HYDRATE in one DB round-trip (see hydration query below): + // - documents fields (title, url, created_at, updated_at, content_text) + // - project_path via JOIN projects + // - labels aggregated via json_group_array + // - paths aggregated via json_group_array (optional) + // 5. Attach snippet: + // - prefer FTS snippet when doc hit FTS + // - fallback: truncated content_text via generate_fallback_snippet() + // 6. For --mode=semantic with 0% embedding coverage: + // return early with actionable error message (distinct from "Ollama down") + todo!() +} + +/// Hydration query: fetch all display fields for ranked doc IDs in a single round-trip. 
+/// +/// Uses json_each(?) to preserve ranking order from the search pipeline. +/// Aggregates labels and paths inline to avoid N+1 queries. +/// +/// ```sql +/// SELECT d.id, d.source_type, d.title, d.url, d.author_username, +/// d.created_at, d.updated_at, d.content_text, +/// p.path AS project_path, +/// (SELECT json_group_array(dl.label_name) +/// FROM document_labels dl WHERE dl.document_id = d.id) AS labels, +/// (SELECT json_group_array(dp.path) +/// FROM document_paths dp WHERE dp.document_id = d.id) AS paths +/// FROM json_each(?) AS j +/// JOIN documents d ON d.id = j.value +/// JOIN projects p ON p.id = d.project_id +/// ORDER BY j.key +/// ``` +/// +/// This single query replaces what would otherwise be: +/// - 1 query per document for metadata +/// - 1 query per document for labels +/// - 1 query per document for paths +/// For 20 results, that's 60 queries reduced to 1. +fn hydrate_results( + conn: &Connection, + doc_ids: &[i64], +) -> Result> { todo!() } @@ -1240,6 +1304,10 @@ pub struct SearchArgs { #[arg(long)] after: Option, + /// Filter by updated date (recently active items) + #[arg(long)] + updated_after: Option, + /// Filter by label (can specify multiple) #[arg(long, action = clap::ArgAction::Append)] label: Vec, @@ -1266,12 +1334,15 @@ pub struct SearchArgs { **Acceptance Criteria:** - [ ] Works without Ollama running -- [ ] All filters functional +- [ ] All filters functional (including `--updated-after`) - [ ] Human-readable output with snippets - [ ] Semantic-only results get fallback snippets from content_text +- [ ] Results hydrated in single DB round-trip (no N+1 queries) - [ ] JSON output matches schema - [ ] Empty results show helpful message - [ ] "No data indexed" message if documents table empty +- [ ] `--mode=semantic` with 0% embedding coverage returns actionable error + (distinct from "Ollama unavailable" — tells user to run `lore embed` first) - [ ] `--fts-mode=safe` (default) preserves prefix `*` while escaping special chars 
- [ ] `--fts-mode=raw` passes FTS5 MATCH syntax through unchanged @@ -1535,7 +1606,7 @@ impl GiError { // ... existing mappings ... Self::OllamaUnavailable { .. } => Some("Start Ollama: ollama serve"), Self::OllamaModelNotFound { model } => Some("Pull the model: ollama pull nomic-embed-text"), - Self::EmbeddingFailed { .. } => Some("Check Ollama logs or retry with 'gi embed --retry-failed'"), + Self::EmbeddingFailed { .. } => Some("Check Ollama logs or retry with 'lore embed --retry-failed'"), } } } @@ -1558,6 +1629,7 @@ use crate::embedding::OllamaClient; const BATCH_SIZE: usize = 32; /// SQLite page size for paging through pending documents. +/// Uses keyset paging (id > last_id) to avoid rescanning previously-processed rows. const DB_PAGE_SIZE: usize = 500; /// Expected embedding dimensions for nomic-embed-text model. @@ -1584,11 +1656,16 @@ pub struct EmbedResult { /// Embed documents that need embedding. /// /// Process: -/// 1. Query dirty_sources ordered by queued_at -/// 2. For each: regenerate document, compute new hash -/// 3. ALWAYS upsert document (labels/paths may change even if content_hash unchanged) -/// 4. Track whether content_hash changed (for stats) -/// 5. Delete from dirty_sources (or record error on failure) +/// 1. Select documents needing embeddings: +/// - Pending: missing embedding_metadata row OR content_hash mismatch +/// - RetryFailed: embedding_metadata.last_error IS NOT NULL +/// 2. Page through candidates using keyset pagination (id > last_id) +/// to avoid rescanning already-processed rows +/// 3. Batch texts -> Ollama `/api/embed` with concurrent HTTP requests +/// 4. Write embeddings + embedding_metadata in per-batch transactions +/// 5. Failed batches record `last_error` in embedding_metadata +/// (excluded from Pending selection; retried via RetryFailed) +/// 6. 
Progress reported as (embedded + failed) vs total_pending pub async fn embed_documents( conn: &Connection, client: &OllamaClient, ) -> Result { @@ -1605,9 +1682,11 @@ pub async fn embed_documents( return Ok(result); } - // Page through pending documents to avoid loading all into memory + // Page through pending documents using keyset pagination to avoid + // both memory pressure and OFFSET performance degradation. + let mut last_id: i64 = 0; loop { - let pending = find_pending_documents(conn, DB_PAGE_SIZE, selection)?; + let pending = find_pending_documents(conn, DB_PAGE_SIZE, last_id, selection)?; if pending.is_empty() { break; } @@ -1640,6 +1719,11 @@ pub async fn embed_documents( collect_writes(conn, &meta, res, &mut result)?; } + // Advance keyset cursor for next page + if let Some(last) = pending.last() { + last_id = last.id; + } + if let Some(ref cb) = progress_callback { cb(result.embedded + result.failed, total_pending); } @@ -1718,14 +1802,16 @@ fn count_pending_documents(conn: &Connection, selection: EmbedSelection) -> Resu Ok(count) } -/// Find pending documents for embedding. +/// Find pending documents for embedding using keyset pagination. /// -/// IMPORTANT: Uses deterministic ORDER BY d.id to ensure consistent -/// paging behavior. Without ordering, SQLite may return rows in -/// different orders across calls, causing missed or duplicate documents. +/// IMPORTANT: Uses keyset pagination (d.id > last_id) instead of OFFSET. +/// OFFSET degrades to O(n²) total work on large result sets because SQLite must +/// scan and discard all rows before the offset on every page. Keyset pagination +/// is O(log n) per page since the index seek goes directly to the starting row. 
fn find_pending_documents( conn: &Connection, limit: usize, + last_id: i64, selection: EmbedSelection, ) -> Result> { let sql = match selection { @@ -1733,8 +1819,9 @@ fn find_pending_documents( "SELECT d.id, d.content_text, d.content_hash FROM documents d LEFT JOIN embedding_metadata em ON d.id = em.document_id - WHERE em.document_id IS NULL - OR em.content_hash != d.content_hash + WHERE (em.document_id IS NULL + OR em.content_hash != d.content_hash) + AND d.id > ? ORDER BY d.id LIMIT ?", EmbedSelection::RetryFailed => @@ -1742,13 +1829,14 @@ fn find_pending_documents( FROM documents d JOIN embedding_metadata em ON d.id = em.document_id WHERE em.last_error IS NOT NULL + AND d.id > ? ORDER BY d.id LIMIT ?", }; let mut stmt = conn.prepare(sql)?; let docs = stmt - .query_map([limit], |row| { + .query_map(rusqlite::params![last_id, limit as i64], |row| { Ok(PendingDocument { id: row.get(0)?, content: row.get(1)?, @@ -1827,7 +1915,7 @@ fn record_embedding_error( --- -### 4.5 CLI: `gi embed` +### 4.5 CLI: `lore embed` **File:** `src/cli/commands/embed.rs` @@ -1928,7 +2016,7 @@ pub struct EmbedArgs { --- -### 4.6 CLI: `gi stats` +### 4.6 CLI: `lore stats` **File:** `src/cli/commands/stats.rs` @@ -2034,7 +2122,13 @@ pub struct RepairResult { /// Fixes: /// - Deletes orphaned embeddings (embedding_metadata rows with no matching document) /// - Clears stale embedding_metadata (hash mismatch) so they get re-embedded -/// - Repopulates FTS for documents missing from documents_fts +/// - Rebuilds FTS index from scratch (correct-by-construction) +/// +/// NOTE: FTS repair uses `rebuild` rather than partial row insertion. +/// With `content='documents'` (external-content FTS), partial repopulation +/// via INSERT of missing rows is fragile — if the external content table +/// and FTS content diverge in any way, partial fixes can leave the index +/// in an inconsistent state. A full rebuild is slower but guaranteed correct. 
pub fn run_repair(config: &Config) -> Result { let conn = open_db(config)?; @@ -2061,19 +2155,19 @@ pub fn run_repair(config: &Config) -> Result { [], )?; - // Repopulate FTS for missing documents - let fts_repopulated = conn.execute( - "INSERT INTO documents_fts(rowid, title, content_text) - SELECT id, COALESCE(title, ''), content_text - FROM documents - WHERE id NOT IN (SELECT rowid FROM documents_fts)", + // Rebuild FTS index from scratch — correct-by-construction. + // This re-reads all rows from the external content table (documents) + // and rebuilds the index. Slower than partial fix but guaranteed consistent. + conn.execute( + "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')", [], )?; + let fts_rebuilt = 1; // rebuild is all-or-nothing Ok(RepairResult { orphaned_embeddings_deleted: orphaned_deleted, stale_embeddings_cleared: stale_cleared, - missing_fts_repopulated: fts_repopulated, + missing_fts_repopulated: fts_rebuilt, }) } @@ -2772,7 +2866,7 @@ pub fn record_fetch_error( current_attempt: i64, ) -> Result<()> { let now = now_ms(); - let next_attempt = compute_next_attempt_at(now, current_attempt + 1); + let next_attempt = crate::core::backoff::compute_next_attempt_at(now, current_attempt + 1); conn.execute( "UPDATE pending_discussion_fetches @@ -2786,14 +2880,44 @@ pub fn record_fetch_error( Ok(()) } +// NOTE: Backoff computation uses the shared utility in `src/core/backoff.rs`. +// See Phase 6.X below for the shared implementation. 
+``` + +**Acceptance Criteria:** +- [ ] Updated entities queued for discussion fetch +- [ ] Success removes from queue +- [ ] Failure increments attempt_count and sets next_attempt_at +- [ ] Processing bounded per run (max 100) +- [ ] Exponential backoff uses `next_attempt_at` (index-friendly, no overflow) +- [ ] Backoff computed with jitter to prevent thundering herd + +--- + +### 6.X Shared Backoff Utility + +**File:** `src/core/backoff.rs` + +Single implementation of exponential backoff with jitter, used by both +`dirty_sources` and `pending_discussion_fetches` queues. Living in `src/core/` +because it's a cross-cutting concern used by multiple modules. + +```rust +use rand::Rng; + /// Compute next_attempt_at with exponential backoff and jitter. /// /// Formula: now + min(3600000, 1000 * 2^attempt_count) * (0.9 to 1.1) /// - Capped at 1 hour to prevent runaway delays /// - ±10% jitter prevents synchronized retries after outages +/// +/// Used by: +/// - `dirty_sources` retry scheduling (document regeneration failures) +/// - `pending_discussion_fetches` retry scheduling (API fetch failures) +/// +/// Having one implementation prevents subtle divergence between queues +/// (e.g., different caps or jitter ranges). 
pub fn compute_next_attempt_at(now: i64, attempt_count: i64) -> i64 { - use rand::Rng; - // Cap attempt_count to prevent overflow (2^30 > 1 hour anyway) let capped_attempts = attempt_count.min(30) as u32; let base_delay_ms = 1000_i64.saturating_mul(1 << capped_attempts); @@ -2807,13 +2931,16 @@ pub fn compute_next_attempt_at(now: i64, attempt_count: i64) -> i64 { } ``` +**Update `src/core/mod.rs`:** +```rust +pub mod backoff; // Add to existing modules +``` + **Acceptance Criteria:** -- [ ] Updated entities queued for discussion fetch -- [ ] Success removes from queue -- [ ] Failure increments attempt_count and sets next_attempt_at -- [ ] Processing bounded per run (max 100) -- [ ] Exponential backoff uses `next_attempt_at` (index-friendly, no overflow) -- [ ] Backoff computed with jitter to prevent thundering herd +- [ ] Single implementation shared by both queue retry paths +- [ ] Cap at 1 hour prevents runaway delays +- [ ] Jitter prevents thundering herd after outage recovery +- [ ] Unit tests verify backoff curve and cap behavior --- @@ -2917,19 +3044,36 @@ fn delete_document( } /// Record a regeneration error on a dirty source for retry. +/// +/// IMPORTANT: Sets `next_attempt_at` using the shared backoff utility. +/// Without this, failed items would retry every run (hot-loop), defeating +/// the backoff design documented in the schema. fn record_dirty_error( conn: &Connection, source_type: SourceType, source_id: i64, error: &str, ) -> Result<()> { + let now = now_ms(); + + // Read current attempt_count from DB to compute backoff + let attempt_count: i64 = conn.query_row( + "SELECT attempt_count FROM dirty_sources WHERE source_type = ? 
AND source_id = ?", + rusqlite::params![source_type.as_str(), source_id], + |row| row.get(0), + )?; + + // Use shared backoff utility (same as pending_discussion_fetches) + let next_attempt_at = crate::core::backoff::compute_next_attempt_at(now, attempt_count + 1); + conn.execute( "UPDATE dirty_sources SET attempt_count = attempt_count + 1, last_attempt_at = ?, - last_error = ? + last_error = ?, + next_attempt_at = ? WHERE source_type = ? AND source_id = ?", - rusqlite::params![now_ms(), error, source_type.as_str(), source_id], + rusqlite::params![now, error, next_attempt_at, source_type.as_str(), source_id], )?; Ok(()) } @@ -3080,7 +3224,7 @@ fn get_document_id( --- -### 6.4 CLI: `gi sync` +### 6.4 CLI: `lore sync` **File:** `src/cli/commands/sync.rs` @@ -3198,7 +3342,8 @@ pub struct SyncArgs { | FTS query sanitization | `src/search/fts.rs` (mod tests) | `to_fts_query()` edge cases: `-`, `"`, `:`, `*`, `C++` | | SourceType parsing | `src/documents/extractor.rs` (mod tests) | `parse()` accepts aliases: `mr`, `mrs`, `issue`, etc. | | SearchFilters | `src/search/filters.rs` (mod tests) | `has_any_filter()`, `clamp_limit()` | -| Backoff logic | `src/ingestion/dirty_tracker.rs` (mod tests) | Exponential backoff query timing | +| Backoff logic | `src/core/backoff.rs` (mod tests) | Shared exponential backoff curve, cap, jitter | +| Hydration | `src/cli/commands/search.rs` (mod tests) | Single round-trip, label/path aggregation | ### Integration Tests @@ -3232,25 +3377,29 @@ Each query must have at least one expected URL in top 10 results. 
| Command | Expected | Pass Criteria | |---------|----------|---------------| -| `gi generate-docs` | Progress, count | Completes, count > 0 | -| `gi generate-docs` (re-run) | 0 regenerated | Hash comparison works | -| `gi embed` | Progress, count | Completes, count matches docs | -| `gi embed` (re-run) | 0 embedded | Skips unchanged | -| `gi embed --retry-failed` | Processes failed | Only failed docs processed | -| `gi stats` | Coverage stats | Shows 100% after embed | -| `gi stats` | Queue depths | Shows dirty_sources and pending_discussion_fetches counts | -| `gi search "auth" --mode=lexical` | Results | Works without Ollama | -| `gi search "auth"` | Hybrid results | Vector + FTS combined | -| `gi search "auth"` (Ollama down) | FTS results + warning | Graceful degradation, warning in response | -| `gi search "auth" --explain` | Rank breakdown | Shows vector/FTS/RRF | -| `gi search "auth" --type=mr` | Filtered results | Only MRs | -| `gi search "auth" --type=mrs` | Filtered results | Alias works | -| `gi search "auth" --label=bug` | Filtered results | Only labeled docs | -| `gi search "-DWITH_SSL"` | Results | Leading dash doesn't cause FTS error | -| `gi search 'C++'` | Results | Special chars in query work | -| `gi search "nonexistent123"` | No results | Graceful empty state | -| `gi sync` | Full pipeline | All steps complete | -| `gi sync --no-embed` | Skip embedding | Docs generated, not embedded | +| `lore generate-docs` | Progress, count | Completes, count > 0 | +| `lore generate-docs` (re-run) | 0 regenerated | Hash comparison works | +| `lore embed` | Progress, count | Completes, count matches docs | +| `lore embed` (re-run) | 0 embedded | Skips unchanged | +| `lore embed --retry-failed` | Processes failed | Only failed docs processed | +| `lore stats` | Coverage stats | Shows 100% after embed | +| `lore stats` | Queue depths | Shows dirty_sources and pending_discussion_fetches counts | +| `lore search "auth" --mode=lexical` | Results | Works without 
Ollama | +| `lore search "auth"` | Hybrid results | Vector + FTS combined | +| `lore search "auth"` (Ollama down) | FTS results + warning | Graceful degradation, warning in response | +| `lore search "auth" --explain` | Rank breakdown | Shows vector/FTS/RRF | +| `lore search "auth" --type=mr` | Filtered results | Only MRs | +| `lore search "auth" --type=mrs` | Filtered results | Alias works | +| `lore search "auth" --label=bug` | Filtered results | Only labeled docs | +| `lore search "-DWITH_SSL"` | Results | Leading dash doesn't cause FTS error | +| `lore search 'C++'` | Results | Special chars in query work | +| `lore search "auth" --updated-after 2024-01-01` | Filtered results | Only recently updated docs | +| `lore search "nonexistent123"` | No results | Graceful empty state | +| `lore search "auth" --mode=semantic` (no embeddings) | Actionable error | Tells user to run `lore embed` first | +| `lore sync` | Full pipeline | All steps complete | +| `lore sync --no-embed` | Skip embedding | Docs generated, not embedded | +| `lore generate-docs --full` | Progress, count | Keyset pagination completes without OFFSET degradation | +| `lore stats --check --repair` | Repair results | FTS rebuilt, orphans cleaned | --- @@ -3274,43 +3423,59 @@ Each query must have at least one expected URL in top 10 results. ## Success Criteria -Checkpoint 3 is complete when: +Checkpoint 3 is complete when all three gates pass: + +### Gate A: Lexical MVP 1. **Lexical search works without Ollama** - - `gi search "query" --mode=lexical` returns relevant results - - All filters functional + - `lore search "query" --mode=lexical` returns relevant results + - All filters functional (including `--updated-after`) - FTS5 syntax errors prevented by query sanitization - Special characters in queries work correctly (`-DWITH_SSL`, `C++`) + - Search results hydrated in single DB round-trip (no N+1) -2. 
**Semantic search works with Ollama** - - `gi embed` completes successfully - - `gi search "query"` returns semantically relevant results +2. **Document generation is correct** + - Full and incremental modes use the same regenerator codepath + - `--full` uses keyset pagination (no OFFSET degradation) + - FTS triggers use COALESCE for NULL-safe operation + +### Gate B: Hybrid MVP + +3. **Semantic search works with Ollama** + - `lore embed` completes successfully + - `lore search "query"` returns semantically relevant results - `--explain` shows ranking breakdown + - `--mode=semantic` with 0% embedding coverage returns actionable error -3. **Hybrid search combines both** +4. **Hybrid search combines both** - Documents appearing in both retrievers rank higher - Graceful degradation when Ollama unavailable (falls back to FTS) - Transient embed failures don't fail the entire search - Warning message included in response on degradation + - Embedding pipeline uses keyset pagination for consistent paging -4. **Incremental sync is efficient** - - `gi sync` only processes changed entities +### Gate C: Sync MVP + +5. **Incremental sync is efficient** + - `lore sync` only processes changed entities - Re-embedding only happens for changed documents - Progress visible during long syncs - - Queue backoff prevents hot-loop retries on persistent failures + - Queue backoff actually prevents hot-loop retries (both queues set `next_attempt_at`) + - Shared backoff utility ensures consistent behavior across queues -5. **Data integrity maintained** +6. **Data integrity maintained** - All counts match between tables - No orphaned records - Hashes consistent - `get_existing_hash()` properly distinguishes "not found" from DB errors + - `--repair` uses FTS `rebuild` for correct-by-construction repair -6. **Observability** - - `gi stats` shows queue depths and failed item counts +7. 
**Observability** + - `lore stats` shows queue depths and failed item counts - Failed items visible for operator intervention - Deterministic ordering ensures consistent paging -7. **Tests pass** - - Unit tests for core algorithms (including FTS sanitization, backoff) +8. **Tests pass** + - Unit tests for core algorithms (including FTS sanitization, shared backoff, hydration) - Integration tests for pipelines - Golden queries return expected results