diff --git a/migrations/007_documents.sql b/migrations/007_documents.sql
new file mode 100644
index 0000000..8c8e032
--- /dev/null
+++ b/migrations/007_documents.sql
@@ -0,0 +1,84 @@
+-- Migration 007: Documents, Document Labels, Document Paths, Dirty Sources, Pending Discussion Fetches
+-- Schema version: 7
+-- Adds CP3 document storage and queue tables for search pipeline
+
+-- Unified searchable documents (derived from issues/MRs/discussions)
+CREATE TABLE documents (
+    id INTEGER PRIMARY KEY,
+    source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
+    source_id INTEGER NOT NULL,  -- local DB id in the source table
+    project_id INTEGER NOT NULL REFERENCES projects(id),
+    author_username TEXT,  -- for discussions: first note author
+    label_names TEXT,  -- JSON array (display/debug only)
+    created_at INTEGER,  -- ms epoch UTC
+    updated_at INTEGER,  -- ms epoch UTC
+    url TEXT,
+    title TEXT,  -- null for discussions
+    content_text TEXT NOT NULL,  -- canonical text for embedding/search
+    content_hash TEXT NOT NULL,  -- SHA-256 for change detection
+    labels_hash TEXT NOT NULL DEFAULT '',  -- SHA-256 over sorted labels (write optimization)
+    paths_hash TEXT NOT NULL DEFAULT '',  -- SHA-256 over sorted paths (write optimization)
+    is_truncated INTEGER NOT NULL DEFAULT 0 CHECK (is_truncated IN (0, 1)),  -- 0/1 flag, enforced
+    truncated_reason TEXT CHECK (
+        truncated_reason IN (
+            'token_limit_middle_drop','single_note_oversized','first_last_oversized',
+            'hard_cap_oversized'
+        )
+        OR truncated_reason IS NULL
+    ),
+    UNIQUE(source_type, source_id)
+);
+
+CREATE INDEX idx_documents_project_updated ON documents(project_id, updated_at);
+CREATE INDEX idx_documents_author ON documents(author_username);
+CREATE INDEX idx_documents_source ON documents(source_type, source_id);  -- NOTE(review): same columns as the UNIQUE constraint above; confirm this extra index is needed
+CREATE INDEX idx_documents_hash ON documents(content_hash);
+
+-- Fast label filtering (indexed exact-match)
+CREATE TABLE document_labels (
+    document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    label_name TEXT NOT NULL,
+    PRIMARY KEY(document_id, label_name)
+) WITHOUT ROWID;
+CREATE INDEX idx_document_labels_label ON document_labels(label_name);
+
+-- Fast path filtering (DiffNote file paths)
+CREATE TABLE document_paths (
+    document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    path TEXT NOT NULL,
+    PRIMARY KEY(document_id, path)
+) WITHOUT ROWID;
+CREATE INDEX idx_document_paths_path ON document_paths(path);
+
+-- Queue for incremental document regeneration (with retry tracking)
+-- Uses next_attempt_at for index-friendly backoff queries
+CREATE TABLE dirty_sources (
+    source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
+    source_id INTEGER NOT NULL,
+    queued_at INTEGER NOT NULL,  -- ms epoch UTC
+    attempt_count INTEGER NOT NULL DEFAULT 0,
+    last_attempt_at INTEGER,
+    last_error TEXT,
+    next_attempt_at INTEGER,  -- ms epoch UTC; NULL means ready immediately
+    PRIMARY KEY(source_type, source_id)
+);
+CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
+
+-- Resumable queue for dependent discussion fetching
+-- Uses next_attempt_at for index-friendly backoff queries
+CREATE TABLE pending_discussion_fetches (
+    project_id INTEGER NOT NULL REFERENCES projects(id),
+    noteable_type TEXT NOT NULL CHECK (noteable_type IN ('Issue','MergeRequest')),  -- closed set, enforced like source_type
+    noteable_iid INTEGER NOT NULL,
+    queued_at INTEGER NOT NULL,  -- ms epoch UTC
+    attempt_count INTEGER NOT NULL DEFAULT 0,
+    last_attempt_at INTEGER,
+    last_error TEXT,
+    next_attempt_at INTEGER,  -- ms epoch UTC; NULL means ready immediately
+    PRIMARY KEY(project_id, noteable_type, noteable_iid)
+);
+CREATE INDEX idx_pending_discussions_next_attempt ON pending_discussion_fetches(next_attempt_at);
+
+-- Update schema version
+INSERT INTO schema_version (version, applied_at, description)
+VALUES (7, strftime('%s', 'now') * 1000, 'Documents, labels, paths, dirty sources, pending discussion fetches');
diff --git a/migrations/008_fts5.sql b/migrations/008_fts5.sql
new file mode
100644
index 0000000..1929c28
--- /dev/null
+++ b/migrations/008_fts5.sql
@@ -0,0 +1,42 @@
+-- Migration 008: FTS5 Full-Text Search Index
+-- Schema version: 8
+-- Adds full-text search on documents table with sync triggers
+
+-- Full-text search with porter stemmer and prefix indexes for type-ahead
+CREATE VIRTUAL TABLE documents_fts USING fts5(
+    title,
+    content_text,
+    content='documents',
+    content_rowid='id',
+    tokenize='porter unicode61',
+    prefix='2 3 4'
+);
+
+-- Keep FTS in sync via triggers.
+-- IMPORTANT: COALESCE(title, '') ensures FTS5 external-content table never
+-- receives NULL values, which can cause inconsistencies with delete operations.
+-- The FTS5 'delete' command must be given the same values that were indexed,
+-- so every trigger below normalizes a NULL title to '' in exactly the same way.
+CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN
+    INSERT INTO documents_fts(rowid, title, content_text)
+    VALUES (new.id, COALESCE(new.title, ''), new.content_text);
+END;
+
+CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
+    INSERT INTO documents_fts(documents_fts, rowid, title, content_text)
+    VALUES ('delete', old.id, COALESCE(old.title, ''), old.content_text);
+END;
+
+-- Re-index only when the rowid or the searchable text changes (not metadata-only updates); id is the FTS content_rowid
+CREATE TRIGGER documents_au AFTER UPDATE ON documents
+WHEN old.id IS NOT new.id OR old.title IS NOT new.title OR old.content_text IS NOT new.content_text
+BEGIN
+    INSERT INTO documents_fts(documents_fts, rowid, title, content_text)
+    VALUES ('delete', old.id, COALESCE(old.title, ''), old.content_text);
+    INSERT INTO documents_fts(rowid, title, content_text)
+    VALUES (new.id, COALESCE(new.title, ''), new.content_text);
+END;
+
+-- Update schema version
+INSERT INTO schema_version (version, applied_at, description)
+VALUES (8, strftime('%s', 'now') * 1000, 'FTS5 full-text search index with sync triggers');
diff --git a/migrations/009_embeddings.sql b/migrations/009_embeddings.sql
new file mode 100644
index
0000000..5374a72
--- /dev/null
+++ b/migrations/009_embeddings.sql
@@ -0,0 +1,54 @@
+-- Migration 009: Embeddings (Gate B)
+-- Schema version: 9
+-- Adds sqlite-vec vector storage and embedding metadata for semantic search
+-- Requires sqlite-vec extension to be loaded before applying
+
+-- NOTE: sqlite-vec vec0 virtual tables cannot participate in FK cascades.
+-- We must use an explicit trigger to delete orphan embeddings when documents
+-- are deleted. See documents_embeddings_ad trigger below.
+
+-- sqlite-vec virtual table for vector search
+-- Storage rule: embeddings.rowid = document_id * 1000 + chunk_index
+-- This encodes (document_id, chunk_index) into a single integer rowid.
+-- Supports up to 1000 chunks per document (32M chars at 32k/chunk).
+CREATE VIRTUAL TABLE embeddings USING vec0(
+    embedding float[768]
+);
+
+-- Embedding provenance + change detection (one row per chunk)
+-- NOTE: Two hash columns serve different purposes:
+-- document_hash: SHA-256 of full documents.content_text (staleness detection)
+-- chunk_hash: SHA-256 of this individual chunk's text (debug/provenance)
+-- Pending detection uses document_hash (not chunk_hash) because staleness is
+-- a document-level condition: if the document changed, ALL chunks need re-embedding.
+CREATE TABLE embedding_metadata (
+    document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    chunk_index INTEGER NOT NULL DEFAULT 0 CHECK (chunk_index BETWEEN 0 AND 999),  -- 0-indexed; bounded so document_id * 1000 + chunk_index stays inside this document's rowid range
+    model TEXT NOT NULL,  -- 'nomic-embed-text'
+    dims INTEGER NOT NULL,  -- 768
+    document_hash TEXT NOT NULL,  -- SHA-256 of full documents.content_text (staleness)
+    chunk_hash TEXT NOT NULL,  -- SHA-256 of this chunk's text (provenance)
+    created_at INTEGER NOT NULL,  -- ms epoch UTC
+    last_error TEXT,  -- error message from last failed attempt
+    attempt_count INTEGER NOT NULL DEFAULT 0,
+    last_attempt_at INTEGER,  -- ms epoch UTC
+    PRIMARY KEY(document_id, chunk_index)
+);
+
+CREATE INDEX idx_embedding_metadata_errors
+    ON embedding_metadata(last_error) WHERE last_error IS NOT NULL;
+CREATE INDEX idx_embedding_metadata_doc ON embedding_metadata(document_id);  -- NOTE(review): document_id is already the leading PRIMARY KEY column; confirm this index is needed
+
+-- CRITICAL: Delete ALL chunk embeddings when a document is deleted.
+-- vec0 virtual tables don't support FK ON DELETE CASCADE, so we need this trigger.
+-- embedding_metadata has ON DELETE CASCADE, so only vec0 needs explicit cleanup.
+-- Range: [document_id * 1000, document_id * 1000 + 999]
+CREATE TRIGGER documents_embeddings_ad AFTER DELETE ON documents BEGIN
+    DELETE FROM embeddings
+    WHERE rowid >= old.id * 1000
+      AND rowid < (old.id + 1) * 1000;
+END;
+
+-- Update schema version
+INSERT INTO schema_version (version, applied_at, description)
+VALUES (9, strftime('%s', 'now') * 1000, 'Embeddings vec0 table, metadata, orphan cleanup trigger');