feat(db): Add migrations for documents, FTS5, and embeddings

Three new migrations establish the search infrastructure: - 007_documents: Creates the `documents` table as the central search unit. Each document is a rendered text blob derived from an issue, MR, or discussion. Includes the `dirty_sources` table for tracking which entities need document regeneration after ingestion changes. - 008_fts5: Creates FTS5 virtual table `documents_fts` with content sync triggers. Uses the `porter unicode61` tokenizer with 2/3/4-character prefix indexes for stemmed, type-ahead search. Automatic insert/update/delete triggers keep the FTS index synchronized with the documents table. - 009_embeddings: Creates the `embeddings` vec0 table (plus `embedding_metadata`) for storing vector chunks produced by Ollama. Uses `doc_id * 1000 + chunk_index` rowid encoding to support multi-chunk documents while enabling efficient doc-level deduplication in vector search results. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 15:45:41 -05:00
parent aca4773327
commit 4270603da4
3 changed files with 180 additions and 0 deletions
--- a/migrations/007_documents.sql
+++ b/migrations/007_documents.sql
@@ -0,0 +1,84 @@
 -- Migration 007: Documents, Document Labels, Document Paths, Dirty Sources, Pending Discussion Fetches
 -- Schema version: 7
 -- Adds CP3 document storage and queue tables for search pipeline
 -- Unified searchable documents, each derived from one issue, merge request, or discussion.
 -- (source_type, source_id) points at the originating row and is unique per document.
 CREATE TABLE documents (
  id INTEGER PRIMARY KEY,
  source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
  source_id INTEGER NOT NULL,    -- local DB id in the source table
  project_id INTEGER NOT NULL REFERENCES projects(id),
  author_username TEXT,          -- for discussions: first note author
  label_names TEXT,              -- JSON array (display/debug only)
  created_at INTEGER,            -- ms epoch UTC
  updated_at INTEGER,            -- ms epoch UTC
  url TEXT,
  title TEXT,                    -- null for discussions
  content_text TEXT NOT NULL,    -- canonical text for embedding/search
  content_hash TEXT NOT NULL,    -- SHA-256 for change detection
  labels_hash TEXT NOT NULL DEFAULT '',  -- SHA-256 over sorted labels (write optimization)
  paths_hash TEXT NOT NULL DEFAULT '',   -- SHA-256 over sorted paths (write optimization)
  is_truncated INTEGER NOT NULL DEFAULT 0,
  -- NULL means "not truncated"; otherwise one of the known truncation strategies.
  truncated_reason TEXT CHECK (
    truncated_reason IN (
      'token_limit_middle_drop','single_note_oversized','first_last_oversized',
      'hard_cap_oversized'
    )
    OR truncated_reason IS NULL
  ),
  -- SQLite backs this constraint with an automatic unique index, which also
  -- serves lookups by (source_type, source_id).
  UNIQUE(source_type, source_id)
 );
 CREATE INDEX idx_documents_project_updated ON documents(project_id, updated_at);
 CREATE INDEX idx_documents_author ON documents(author_username);
 -- No explicit index on (source_type, source_id): the UNIQUE constraint above
 -- already provides one, so a second copy would only add write and storage cost.
 CREATE INDEX idx_documents_hash ON documents(content_hash);
 -- Label membership: one row per (document, label) pair, used for exact-match
 -- label filters. Rows reference documents(id) with ON DELETE CASCADE.
 CREATE TABLE document_labels (
     document_id INTEGER NOT NULL
         REFERENCES documents (id) ON DELETE CASCADE,
     label_name  TEXT    NOT NULL,
     PRIMARY KEY (document_id, label_name)
 ) WITHOUT ROWID;
 -- Reverse lookup: label -> documents carrying it.
 CREATE INDEX idx_document_labels_label
     ON document_labels (label_name);
 -- File paths attached to a document (DiffNote file paths), one row per
 -- (document, path) pair. Rows reference documents(id) with ON DELETE CASCADE.
 CREATE TABLE document_paths (
     document_id INTEGER NOT NULL
         REFERENCES documents (id) ON DELETE CASCADE,
     path        TEXT    NOT NULL,
     PRIMARY KEY (document_id, path)
 ) WITHOUT ROWID;
 -- Reverse lookup: path -> documents touching it.
 CREATE INDEX idx_document_paths_path
     ON document_paths (path);
 -- Work queue of source entities whose document must be regenerated.
 -- At most one row per (source_type, source_id). Retry bookkeeping lives in
 -- attempt_count / last_attempt_at / last_error, and next_attempt_at carries
 -- the backoff deadline so "what is due?" can be answered from an index.
 CREATE TABLE dirty_sources (
     source_type     TEXT    NOT NULL
         CHECK (source_type IN ('issue', 'merge_request', 'discussion')),
     source_id       INTEGER NOT NULL,
     queued_at       INTEGER NOT NULL,   -- ms epoch UTC
     attempt_count   INTEGER NOT NULL DEFAULT 0,
     last_attempt_at INTEGER,
     last_error      TEXT,
     next_attempt_at INTEGER,            -- ms epoch UTC; NULL means ready immediately
     PRIMARY KEY (source_type, source_id)
 );
 CREATE INDEX idx_dirty_sources_next_attempt
     ON dirty_sources (next_attempt_at);
 -- Resumable queue of discussion fetches still owed for a noteable.
 -- At most one row per (project_id, noteable_type, noteable_iid). Retry
 -- bookkeeping mirrors dirty_sources: next_attempt_at carries the backoff
 -- deadline so due work can be found through an index.
 CREATE TABLE pending_discussion_fetches (
     project_id      INTEGER NOT NULL
         REFERENCES projects (id),
     noteable_type   TEXT    NOT NULL,   -- 'Issue' | 'MergeRequest'
     noteable_iid    INTEGER NOT NULL,
     queued_at       INTEGER NOT NULL,   -- ms epoch UTC
     attempt_count   INTEGER NOT NULL DEFAULT 0,
     last_attempt_at INTEGER,
     last_error      TEXT,
     next_attempt_at INTEGER,            -- ms epoch UTC; NULL means ready immediately
     PRIMARY KEY (project_id, noteable_type, noteable_iid)
 );
 CREATE INDEX idx_pending_discussions_next_attempt
     ON pending_discussion_fetches (next_attempt_at);
 -- Record migration 7 as applied; applied_at is the current time in ms since epoch.
 INSERT INTO schema_version (version, applied_at, description)
 VALUES (
     7,
     CAST(strftime('%s', 'now') AS INTEGER) * 1000,
     'Documents, labels, paths, dirty sources, pending discussion fetches'
 );
--- a/migrations/008_fts5.sql
+++ b/migrations/008_fts5.sql
@@ -0,0 +1,42 @@
 -- Migration 008: FTS5 Full-Text Search Index
 -- Schema version: 8
 -- Adds full-text search on documents table with sync triggers
 -- FTS5 index over documents.title and documents.content_text.
 -- Declared as an external-content table: rows are keyed by documents.id and
 -- the text itself stays in documents. Tokens pass through unicode61 and then
 -- the porter stemmer; 2-, 3- and 4-character prefix indexes back type-ahead.
 CREATE VIRTUAL TABLE documents_fts USING fts5(
     title,
     content_text,
     tokenize='porter unicode61',
     prefix='2 3 4',
     content='documents',
     content_rowid='id'
 );
 -- FTS sync, insert side: index every new documents row under its id.
 -- A NULL title is indexed as '' so that the values later handed to the FTS5
 -- 'delete' command (which must mirror what was indexed) are never NULL.
 CREATE TRIGGER documents_ai
 AFTER INSERT ON documents
 BEGIN
     INSERT INTO documents_fts (rowid, title, content_text)
     VALUES (
         new.id,
         COALESCE(new.title, ''),
         new.content_text
     );
 END;
 -- FTS sync, delete side: issue the FTS5 'delete' command with the same
 -- values that were indexed for the row (NULL title normalized to '').
 CREATE TRIGGER documents_ad
 AFTER DELETE ON documents
 BEGIN
     INSERT INTO documents_fts (documents_fts, rowid, title, content_text)
     VALUES (
         'delete',
         old.id,
         COALESCE(old.title, ''),
         old.content_text
     );
 END;
 -- FTS sync, update side: re-index a row only when something the index
 -- depends on changed, so metadata-only updates skip the FTS write.
 -- The index depends on the searchable text (title, content_text) AND on the
 -- rowid (documents.id). An update that changes only id must still re-index,
 -- otherwise the FTS entry stays under the old rowid.
 -- title is nullable, hence IS NOT; id and content_text are never NULL.
 CREATE TRIGGER documents_au AFTER UPDATE ON documents
 WHEN old.id != new.id
   OR old.title IS NOT new.title
   OR old.content_text != new.content_text
 BEGIN
  -- Remove the old entry using exactly the values that were indexed.
  INSERT INTO documents_fts(documents_fts, rowid, title, content_text)
  VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text);
  -- Index the new values (NULL title normalized to '').
  INSERT INTO documents_fts(rowid, title, content_text)
  VALUES (new.id, COALESCE(new.title, ''), new.content_text);
 END;
 -- Record migration 8 as applied; applied_at is the current time in ms since epoch.
 INSERT INTO schema_version (version, applied_at, description)
 VALUES (
     8,
     CAST(strftime('%s', 'now') AS INTEGER) * 1000,
     'FTS5 full-text search index with sync triggers'
 );
--- a/migrations/009_embeddings.sql
+++ b/migrations/009_embeddings.sql
@@ -0,0 +1,54 @@
 -- Migration 009: Embeddings (Gate B)
 -- Schema version: 9
 -- Adds sqlite-vec vector storage and embedding metadata for semantic search
 -- Requires sqlite-vec extension to be loaded before applying
 -- NOTE: sqlite-vec vec0 virtual tables cannot participate in FK cascades.
 -- We must use an explicit trigger to delete orphan embeddings when documents
 -- are deleted. See documents_embeddings_ad trigger below.
 -- Vector store (sqlite-vec vec0 virtual table), one 768-float vector per chunk.
 -- Rowid convention: rowid = document_id * 1000 + chunk_index, i.e. the pair
 -- (document_id, chunk_index) packed into one integer. This leaves room for
 -- chunk_index 0..999, so up to 1000 chunks per document
 -- (32M chars at 32k/chunk).
 CREATE VIRTUAL TABLE embeddings USING vec0(embedding float[768]);
 -- Embedding provenance + change detection (one row per chunk)
 -- NOTE: Two hash columns serve different purposes:
 --   document_hash: SHA-256 of full documents.content_text (staleness detection)
 --   chunk_hash: SHA-256 of this individual chunk's text (debug/provenance)
 -- Pending detection uses document_hash (not chunk_hash) because staleness is
 -- a document-level condition: if the document changed, ALL chunks need re-embedding.
 CREATE TABLE embedding_metadata (
  document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
  -- 0-indexed position within document. Bounded to 0..999 because vector rowids
  -- are encoded as document_id * 1000 + chunk_index; a value outside that range
  -- would land in another document's rowid range.
  chunk_index INTEGER NOT NULL DEFAULT 0 CHECK (chunk_index >= 0 AND chunk_index < 1000),
  model TEXT NOT NULL,           -- 'nomic-embed-text'
  dims INTEGER NOT NULL,         -- 768
  document_hash TEXT NOT NULL,   -- SHA-256 of full documents.content_text (staleness)
  chunk_hash TEXT NOT NULL,      -- SHA-256 of this chunk's text (provenance)
  created_at INTEGER NOT NULL,   -- ms epoch UTC
  last_error TEXT,               -- error message from last failed attempt
  attempt_count INTEGER NOT NULL DEFAULT 0,
  last_attempt_at INTEGER,       -- ms epoch UTC
  PRIMARY KEY(document_id, chunk_index)
 );
 -- Partial index: only rows that currently carry an error are indexed.
 CREATE INDEX idx_embedding_metadata_errors
  ON embedding_metadata(last_error) WHERE last_error IS NOT NULL;
 -- No separate index on document_id: the PRIMARY KEY index starts with
 -- document_id and already serves per-document lookups and the FK cascade.
 -- Orphan cleanup for the vec0 table, which cannot take part in FK cascades.
 -- When a documents row is deleted, remove every vector whose rowid falls in
 -- that document's encoded range [id * 1000, id * 1000 + 999].
 -- embedding_metadata needs no trigger: it is declared with ON DELETE CASCADE.
 CREATE TRIGGER documents_embeddings_ad
 AFTER DELETE ON documents
 BEGIN
     DELETE FROM embeddings
     WHERE rowid >= old.id * 1000
       AND rowid < old.id * 1000 + 1000;
 END;
 -- Record migration 9 as applied; applied_at is the current time in ms since epoch.
 INSERT INTO schema_version (version, applied_at, description)
 VALUES (
     9,
     CAST(strftime('%s', 'now') AS INTEGER) * 1000,
     'Embeddings vec0 table, metadata, orphan cleanup trigger'
 );