-- gitlore/migrations/009_embeddings.sql

-- Migration 009: Embeddings (Gate B)
-- Schema version: 9
-- Adds sqlite-vec vector storage and embedding metadata for semantic search
-- Requires sqlite-vec extension to be loaded before applying

-- NOTE: sqlite-vec vec0 virtual tables cannot participate in FK cascades.
-- We must use an explicit trigger to delete orphan embeddings when documents
-- are deleted. See documents_embeddings_ad trigger below.

-- Vector store for semantic search, backed by the sqlite-vec vec0 module.
-- Each row holds one 768-dimension float embedding for a single document chunk.
--
-- Rowid encoding: embeddings.rowid = document_id * 1000 + chunk_index
--   * packs (document_id, chunk_index) into a single integer rowid
--   * chunk_index takes the values 0-999, so a document can have at most
--     1000 chunks (32M chars at 32k/chunk)
CREATE VIRTUAL TABLE embeddings USING vec0(embedding float[768]);

-- Per-chunk embedding bookkeeping: provenance, change detection and retry state.
-- One row per (document_id, chunk_index).
--
-- Two hash columns serve different purposes:
--   document_hash  SHA-256 of the full documents.content_text. Pending/staleness
--                  detection uses this one (not chunk_hash) because staleness is
--                  a document-level condition: if the document changed, ALL of
--                  its chunks need re-embedding.
--   chunk_hash     SHA-256 of this individual chunk's text (debug/provenance).
--
-- chunk_index is constrained to 0-999 because the embeddings vec0 table keys its
-- rows as document_id * 1000 + chunk_index. A value outside that range would
-- alias another document's rowid range and would be missed by the range-based
-- cleanup in the documents_embeddings_ad trigger.
CREATE TABLE embedding_metadata (
  document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
  chunk_index INTEGER NOT NULL DEFAULT 0    -- 0-indexed position within document
    CONSTRAINT embedding_metadata_chunk_index_check
      CHECK (chunk_index >= 0 AND chunk_index < 1000),
  model TEXT NOT NULL,           -- 'nomic-embed-text'
  dims INTEGER NOT NULL,         -- 768
  document_hash TEXT NOT NULL,   -- SHA-256 of full documents.content_text (staleness)
  chunk_hash TEXT NOT NULL,      -- SHA-256 of this chunk's text (provenance)
  created_at INTEGER NOT NULL,   -- ms epoch UTC
  last_error TEXT,               -- error message from last failed attempt; NULL if none
  attempt_count INTEGER NOT NULL DEFAULT 0,
  last_attempt_at INTEGER,       -- ms epoch UTC
  PRIMARY KEY(document_id, chunk_index)
);

-- Partial index: contains only rows whose last embedding attempt recorded an
-- error, so rows with last_error IS NULL take no space in it.
CREATE INDEX idx_embedding_metadata_errors
ON embedding_metadata (last_error)
WHERE last_error IS NOT NULL;

-- Lookup of all chunk rows belonging to one document.
-- NOTE(review): the PRIMARY KEY (document_id, chunk_index) already has
-- document_id as its leading column, so this index may be redundant; confirm
-- with EXPLAIN QUERY PLAN before removing it in a later migration.
CREATE INDEX idx_embedding_metadata_doc
ON embedding_metadata (document_id);

-- CRITICAL: purge every chunk embedding of a document once that document row
-- is deleted.
--   * vec0 virtual tables do not support FK ON DELETE CASCADE, hence this trigger.
--   * embedding_metadata is declared with ON DELETE CASCADE, so only the vec0
--     table needs explicit cleanup here.
-- Rowids removed: the half-open range [old.id * 1000, (old.id + 1) * 1000),
-- i.e. document_id * 1000 through document_id * 1000 + 999 inclusive.
CREATE TRIGGER documents_embeddings_ad
AFTER DELETE ON documents
FOR EACH ROW
BEGIN
  DELETE FROM embeddings
  WHERE rowid >= OLD.id * 1000
    AND rowid < (OLD.id + 1) * 1000;
END;

-- Record that migration 9 has been applied.
-- applied_at is the current Unix time from strftime('%s'), which has one-second
-- resolution, scaled to milliseconds.
INSERT INTO schema_version (version, applied_at, description)
VALUES (
  9,
  CAST(strftime('%s', 'now') AS INTEGER) * 1000,
  'Embeddings vec0 table, metadata, orphan cleanup trigger'
);