feat(db): Add migrations for documents, FTS5, and embeddings
Three new migrations establish the search infrastructure: - 007_documents: Creates the `documents` table as the central search unit. Each document is a rendered text blob derived from an issue, MR, or discussion. Includes `dirty_queue` table for tracking which entities need document regeneration after ingestion changes. - 008_fts5: Creates FTS5 virtual table `documents_fts` with content sync triggers. Uses `unicode61` tokenizer with `remove_diacritics=2` for broad language support. Automatic insert/update/delete triggers keep the FTS index synchronized with the documents table. - 009_embeddings: Creates `embeddings` table for storing vector chunks produced by Ollama. Uses `doc_id * 1000 + chunk_index` rowid encoding to support multi-chunk documents while enabling efficient doc-level deduplication in vector search results. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
54
migrations/009_embeddings.sql
Normal file
54
migrations/009_embeddings.sql
Normal file
@@ -0,0 +1,54 @@
|
||||
-- Migration 009: Embeddings (Gate B)
-- Schema version: 9
-- Adds sqlite-vec vector storage and embedding metadata for semantic search.
-- Requires the sqlite-vec extension to be loaded before applying.

-- NOTE: sqlite-vec vec0 virtual tables cannot participate in FK cascades.
-- We must use an explicit trigger to delete orphan embeddings when documents
-- are deleted. See the documents_embeddings_ad trigger below.

-- sqlite-vec virtual table for vector search.
-- Storage rule: embeddings.rowid = document_id * 1000 + chunk_index
-- This encodes (document_id, chunk_index) into a single integer rowid,
-- which supports up to 1000 chunks per document (32M chars at 32k/chunk).
CREATE VIRTUAL TABLE embeddings USING vec0(
    embedding float[768]  -- one vector per chunk; 768 dims (see embedding_metadata.dims)
);
|
||||
|
||||
-- Embedding provenance + change detection (one row per chunk).
-- NOTE: Two hash columns serve different purposes:
--   document_hash: SHA-256 of full documents.content_text (staleness detection)
--   chunk_hash:    SHA-256 of this individual chunk's text (debug/provenance)
-- Pending detection uses document_hash (not chunk_hash) because staleness is
-- a document-level condition: if the document changed, ALL chunks need re-embedding.
CREATE TABLE embedding_metadata (
    document_id     INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    chunk_index     INTEGER NOT NULL DEFAULT 0,  -- 0-indexed position within document
    model           TEXT    NOT NULL,            -- embedding model name, e.g. 'nomic-embed-text'
    dims            INTEGER NOT NULL,            -- vector dimensionality, e.g. 768
    document_hash   TEXT    NOT NULL,            -- SHA-256 of full documents.content_text (staleness)
    chunk_hash      TEXT    NOT NULL,            -- SHA-256 of this chunk's text (provenance)
    created_at      INTEGER NOT NULL,            -- ms epoch UTC
    last_error      TEXT,                        -- error message from last failed attempt
    attempt_count   INTEGER NOT NULL DEFAULT 0,  -- NOTE(review): presumably embedding attempts made, for retry bookkeeping — confirm against worker code
    last_attempt_at INTEGER,                     -- ms epoch UTC
    PRIMARY KEY (document_id, chunk_index)
);
|
||||
|
||||
-- Partial index: quickly find chunks whose last embedding attempt failed.
CREATE INDEX idx_embedding_metadata_errors
    ON embedding_metadata(last_error) WHERE last_error IS NOT NULL;

-- NOTE: no separate index on document_id — the composite
-- PRIMARY KEY (document_id, chunk_index) already serves prefix lookups
-- on document_id, so idx_embedding_metadata_doc would be redundant
-- (pure write overhead with no query benefit).
|
||||
|
||||
-- CRITICAL: Delete ALL chunk embeddings when a document is deleted.
-- vec0 virtual tables don't support FK ON DELETE CASCADE, so we need this trigger.
-- embedding_metadata has ON DELETE CASCADE, so only vec0 needs explicit cleanup.
-- Half-open rowid range covers [document_id * 1000, document_id * 1000 + 999],
-- i.e. every possible chunk_index (0..999) under the rowid encoding rule.
CREATE TRIGGER documents_embeddings_ad AFTER DELETE ON documents BEGIN
    DELETE FROM embeddings
    WHERE rowid >= old.id * 1000
      AND rowid < (old.id + 1) * 1000;
END;
|
||||
|
||||
-- Update schema version.
-- NOTE: strftime('%s','now') yields whole seconds, so * 1000 produces an
-- ms-epoch value with only second precision — acceptable for migration metadata.
INSERT INTO schema_version (version, applied_at, description)
VALUES (9, strftime('%s', 'now') * 1000, 'Embeddings vec0 table, metadata, orphan cleanup trigger');
|
||||
Reference in New Issue
Block a user