-- Migration 009: Embeddings (Gate B)
-- Schema version: 9
--
-- Introduces sqlite-vec vector storage plus per-chunk embedding metadata for
-- semantic search. The sqlite-vec extension has to be loaded before this
-- migration is applied.
--
-- NOTE: sqlite-vec vec0 virtual tables cannot take part in FK cascades, so
-- orphaned embeddings are removed by an explicit trigger when a document is
-- deleted (see the documents_embeddings_ad trigger later in this migration).

-- Vector store for semantic search (sqlite-vec virtual table).
-- Storage rule: embeddings.rowid = document_id * 1000 + chunk_index
-- i.e. (document_id, chunk_index) is packed into a single integer rowid,
-- which allows up to 1000 chunks per document (32M chars at 32k/chunk).
CREATE VIRTUAL TABLE embeddings USING vec0(
    embedding float[768]
);

-- Embedding provenance + change detection (one row per chunk).
-- Two hash columns exist, each for a different purpose:
--   document_hash: SHA-256 of the full documents.content_text (staleness detection)
--   chunk_hash:    SHA-256 of this individual chunk's text (debug/provenance)
-- Pending detection keys off document_hash rather than chunk_hash because
-- staleness is a document-level condition: once the document changes, ALL of
-- its chunks need re-embedding.
-- Per-chunk embedding bookkeeping: exactly one row per (document_id, chunk_index).
-- Rows disappear with their parent document through ON DELETE CASCADE
-- (SQLite only enforces foreign keys when PRAGMA foreign_keys = ON is set on
-- the connection).
CREATE TABLE embedding_metadata (
    document_id     INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    -- 0-indexed position within the document. Bounded to 0..999 because the
    -- vec0 rowid is document_id * 1000 + chunk_index: a larger (or negative)
    -- index would land inside a neighbouring document's rowid range and the
    -- cleanup trigger below would then delete the wrong vectors / leave orphans.
    chunk_index     INTEGER NOT NULL DEFAULT 0
                    CHECK (chunk_index >= 0 AND chunk_index < 1000),
    model           TEXT    NOT NULL,            -- 'nomic-embed-text'
    dims            INTEGER NOT NULL,            -- 768
    document_hash   TEXT    NOT NULL,            -- SHA-256 of full documents.content_text (staleness)
    chunk_hash      TEXT    NOT NULL,            -- SHA-256 of this chunk's text (provenance)
    created_at      INTEGER NOT NULL,            -- ms epoch UTC
    last_error      TEXT,                        -- error message from last failed attempt
    attempt_count   INTEGER NOT NULL DEFAULT 0,
    last_attempt_at INTEGER,                     -- ms epoch UTC
    PRIMARY KEY (document_id, chunk_index)
);

-- Partial index: only rows that currently carry an error are indexed.
CREATE INDEX idx_embedding_metadata_errors
    ON embedding_metadata(last_error)
    WHERE last_error IS NOT NULL;

-- Lookup of all chunks belonging to one document.
-- NOTE(review): likely redundant with the primary-key index, whose leading
-- column is already document_id; kept so anything that refers to this index
-- by name keeps working.
CREATE INDEX idx_embedding_metadata_doc
    ON embedding_metadata(document_id);

-- CRITICAL: Delete ALL chunk embeddings when a document is deleted.
-- vec0 virtual tables don't support FK ON DELETE CASCADE, so this trigger is
-- required. embedding_metadata has ON DELETE CASCADE, so only the vec0 table
-- needs explicit cleanup.
-- Rowid range removed: [old.id * 1000, old.id * 1000 + 999]
-- (written as a half-open interval ending just before the next document's range).
CREATE TRIGGER documents_embeddings_ad
AFTER DELETE ON documents
BEGIN
    DELETE FROM embeddings
    WHERE rowid >= old.id * 1000
      AND rowid <  (old.id + 1) * 1000;
END;

-- Record that schema version 9 has been applied.
-- applied_at is milliseconds since the Unix epoch (UTC); strftime('%s') yields
-- whole seconds as text, so it is cast to INTEGER before scaling to ms.
INSERT INTO schema_version (version, applied_at, description)
VALUES (
    9,
    CAST(strftime('%s', 'now') AS INTEGER) * 1000,
    'Embeddings vec0 table, metadata, orphan cleanup trigger'
);