Three new migrations establish the search infrastructure:

- 007_documents: Creates the `documents` table as the central search unit. Each document is a rendered text blob derived from an issue, MR, or discussion. Includes the `dirty_sources` queue table for tracking which entities need document regeneration after ingestion changes.
- 008_fts5: Creates the FTS5 virtual table `documents_fts` with content sync triggers. Uses the `unicode61` tokenizer with `remove_diacritics=2` for broad language support. Automatic insert/update/delete triggers keep the FTS index synchronized with the `documents` table.
- 009_embeddings: Creates the `embeddings` table for storing vector chunks produced by Ollama. Uses `doc_id * 1000 + chunk_index` rowid encoding to support multi-chunk documents while enabling efficient doc-level deduplication in vector search results.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
85 lines
3.7 KiB
SQL
85 lines
3.7 KiB
SQL
-- Migration 007: Documents, Document Labels, Document Paths, Dirty Sources, Pending Discussion Fetches
|
|
-- Schema version: 7
|
|
-- Adds CP3 document storage and queue tables for search pipeline
|
|
|
|
-- documents: unified searchable documents, each derived from an issue, a
-- merge request, or a discussion. The (source_type, source_id) pair is unique,
-- so a given source row maps to at most one document.
CREATE TABLE documents (
    id               INTEGER PRIMARY KEY,

    -- Where the document came from.
    source_type      TEXT    NOT NULL
                     CHECK (source_type IN ('issue', 'merge_request', 'discussion')),
    source_id        INTEGER NOT NULL,               -- local DB id in the source table
    project_id       INTEGER NOT NULL REFERENCES projects(id),

    -- Descriptive metadata (all nullable).
    author_username  TEXT,                           -- for discussions: first note author
    label_names      TEXT,                           -- JSON array (display/debug only)
    created_at       INTEGER,                        -- ms epoch UTC
    updated_at       INTEGER,                        -- ms epoch UTC
    url              TEXT,
    title            TEXT,                           -- null for discussions

    -- Rendered content plus hashes.
    content_text     TEXT    NOT NULL,               -- canonical text for embedding/search
    content_hash     TEXT    NOT NULL,               -- SHA-256 for change detection
    labels_hash      TEXT    NOT NULL DEFAULT '',    -- SHA-256 over sorted labels (write optimization)
    paths_hash       TEXT    NOT NULL DEFAULT '',    -- SHA-256 over sorted paths (write optimization)

    -- Truncation bookkeeping: a flag plus an optional reason drawn from a
    -- fixed set of values. NULL is explicitly allowed for the reason.
    is_truncated     INTEGER NOT NULL DEFAULT 0,
    truncated_reason TEXT
                     CHECK (
                         truncated_reason IS NULL
                         OR truncated_reason IN (
                             'token_limit_middle_drop',
                             'single_note_oversized',
                             'first_last_oversized',
                             'hard_cap_oversized'
                         )
                     ),

    UNIQUE (source_type, source_id)
);
|
|
|
|
-- Secondary indexes on documents.
--
-- No explicit index is created for (source_type, source_id): the
-- UNIQUE(source_type, source_id) constraint on documents is already backed by
-- an automatic unique index over those same columns in the same order, so a
-- separate idx_documents_source would duplicate it and only add write and
-- storage overhead.

-- Composite index: project_id first, then updated_at.
CREATE INDEX idx_documents_project_updated ON documents(project_id, updated_at);

-- Lookup by author_username.
CREATE INDEX idx_documents_author ON documents(author_username);

-- Lookup by content_hash (the SHA-256 used for change detection).
CREATE INDEX idx_documents_hash ON documents(content_hash);
|
|
|
|
-- document_labels: one row per (document, label) pair, giving label filters an
-- indexed exact match. The composite primary key doubles as the table's
-- storage order because the table is declared WITHOUT ROWID.
-- ON DELETE CASCADE ties each row's lifetime to its parent document
-- (SQLite only enforces this while foreign key enforcement is enabled).
CREATE TABLE document_labels (
    document_id INTEGER NOT NULL
                REFERENCES documents(id) ON DELETE CASCADE,
    label_name  TEXT    NOT NULL,

    PRIMARY KEY (document_id, label_name)
) WITHOUT ROWID;

-- Lookup in the other direction: by label_name.
CREATE INDEX idx_document_labels_label ON document_labels(label_name);
|
|
|
|
-- document_paths: one row per (document, file path) pair, used for fast path
-- filtering (paths come from DiffNote file paths). Declared WITHOUT ROWID, so
-- the composite primary key is the table's storage order.
-- ON DELETE CASCADE ties each row's lifetime to its parent document
-- (SQLite only enforces this while foreign key enforcement is enabled).
CREATE TABLE document_paths (
    document_id INTEGER NOT NULL
                REFERENCES documents(id) ON DELETE CASCADE,
    path        TEXT    NOT NULL,

    PRIMARY KEY (document_id, path)
) WITHOUT ROWID;

-- Lookup in the other direction: by path.
CREATE INDEX idx_document_paths_path ON document_paths(path);
|
|
|
|
-- dirty_sources: queue of source rows whose document needs regenerating, with
-- retry tracking. A source is queued at most once, since (source_type,
-- source_id) is the primary key.
-- Backoff is expressed through next_attempt_at so that "what is due?" can be
-- answered from an index.
CREATE TABLE dirty_sources (
    source_type     TEXT    NOT NULL
                    CHECK (source_type IN ('issue', 'merge_request', 'discussion')),
    source_id       INTEGER NOT NULL,

    queued_at       INTEGER NOT NULL,            -- ms epoch UTC

    -- Retry bookkeeping.
    attempt_count   INTEGER NOT NULL DEFAULT 0,
    last_attempt_at INTEGER,
    last_error      TEXT,
    next_attempt_at INTEGER,                     -- ms epoch UTC; NULL means ready immediately

    PRIMARY KEY (source_type, source_id)
);

CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
|
|
|
|
-- pending_discussion_fetches: resumable queue for dependent discussion
-- fetching. An entry is keyed by (project_id, noteable_type, noteable_iid), so
-- each noteable is queued at most once per project.
-- Backoff is expressed through next_attempt_at so that "what is due?" can be
-- answered from an index.
CREATE TABLE pending_discussion_fetches (
    project_id      INTEGER NOT NULL REFERENCES projects(id),

    -- 'Issue' | 'MergeRequest'
    -- NOTE(review): no CHECK enforces these two values; confirm that is intentional.
    noteable_type   TEXT    NOT NULL,
    noteable_iid    INTEGER NOT NULL,

    queued_at       INTEGER NOT NULL,            -- ms epoch UTC

    -- Retry bookkeeping.
    attempt_count   INTEGER NOT NULL DEFAULT 0,
    last_attempt_at INTEGER,
    last_error      TEXT,
    next_attempt_at INTEGER,                     -- ms epoch UTC; NULL means ready immediately

    PRIMARY KEY (project_id, noteable_type, noteable_iid)
);

CREATE INDEX idx_pending_discussions_next_attempt ON pending_discussion_fetches(next_attempt_at);
|
|
|
|
-- Record that schema version 7 has been applied.
-- applied_at is stored in milliseconds: strftime('%s', 'now') gives the
-- current Unix time in whole seconds as text, which is converted to an
-- integer and scaled by 1000 (so the value has second resolution).
INSERT INTO schema_version (version, applied_at, description)
VALUES (
    7,
    CAST(strftime('%s', 'now') AS INTEGER) * 1000,
    'Documents, labels, paths, dirty sources, pending discussion fetches'
);
|