Compare commits
12 Commits
f4dba386c9
...
41d20f1374
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
41d20f1374 | ||
|
|
9b63671df9 | ||
|
|
d235f2b4dd | ||
|
|
daf5a73019 | ||
|
|
559f0702ad | ||
|
|
d5bdb24b0f | ||
|
|
723703bed9 | ||
|
|
20edff4ab1 | ||
|
|
d31d5292f2 | ||
|
|
6e22f120d0 | ||
|
|
4270603da4 | ||
|
|
aca4773327 |
File diff suppressed because one or more lines are too long
@@ -1 +1 @@
|
||||
bd-lcb
|
||||
bd-35o
|
||||
|
||||
40
AGENTS.md
40
AGENTS.md
@@ -33,38 +33,50 @@ The `lore` CLI has a robot mode optimized for AI agent consumption with structur
|
||||
|
||||
```bash
|
||||
# Explicit flag
|
||||
lore --robot list issues
|
||||
lore --robot issues -n 10
|
||||
|
||||
# JSON shorthand (-J)
|
||||
lore -J issues -n 10
|
||||
|
||||
# Auto-detection (when stdout is not a TTY)
|
||||
lore list issues | jq .
|
||||
lore issues | jq .
|
||||
|
||||
# Environment variable
|
||||
LORE_ROBOT=true lore list issues
|
||||
LORE_ROBOT=1 lore issues
|
||||
```
|
||||
|
||||
### Robot Mode Commands
|
||||
|
||||
```bash
|
||||
# List issues/MRs with JSON output
|
||||
lore --robot list issues --limit=10
|
||||
lore --robot list mrs --state=opened
|
||||
lore --robot issues -n 10
|
||||
lore --robot mrs -s opened
|
||||
|
||||
# Show detailed entity info
|
||||
lore --robot issues 123
|
||||
lore --robot mrs 456 -p group/repo
|
||||
|
||||
# Count entities
|
||||
lore --robot count issues
|
||||
lore --robot count discussions --type=mr
|
||||
lore --robot count discussions --for mr
|
||||
|
||||
# Show detailed entity info
|
||||
lore --robot show issue 123
|
||||
lore --robot show mr 456 --project=group/repo
|
||||
# Search indexed documents
|
||||
lore --robot search "authentication bug"
|
||||
|
||||
# Check sync status
|
||||
lore --robot sync-status
|
||||
lore --robot status
|
||||
|
||||
# Run ingestion (quiet, JSON summary)
|
||||
lore --robot ingest --type=issues
|
||||
# Run full sync pipeline
|
||||
lore --robot sync
|
||||
|
||||
# Run ingestion only
|
||||
lore --robot ingest issues
|
||||
|
||||
# Check environment health
|
||||
lore --robot doctor
|
||||
|
||||
# Document and index statistics
|
||||
lore --robot stats
|
||||
```
|
||||
|
||||
### Response Format
|
||||
@@ -102,8 +114,8 @@ Errors return structured JSON to stderr:
|
||||
|
||||
### Best Practices
|
||||
|
||||
- Use `lore --robot` for all agent interactions
|
||||
- Use `lore --robot` or `lore -J` for all agent interactions
|
||||
- Check exit codes for error handling
|
||||
- Parse JSON errors from stderr
|
||||
- Use `--limit` to control response size
|
||||
- Use `-n` / `--limit` to control response size
|
||||
- TTY detection handles piped commands automatically
|
||||
|
||||
60
Cargo.lock
generated
60
Cargo.lock
generated
@@ -1093,6 +1093,7 @@ dependencies = [
|
||||
"futures",
|
||||
"indicatif",
|
||||
"open",
|
||||
"rand",
|
||||
"reqwest",
|
||||
"rusqlite",
|
||||
"serde",
|
||||
@@ -1339,6 +1340,15 @@ dependencies = [
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.106"
|
||||
@@ -1363,6 +1373,36 @@ version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.18"
|
||||
@@ -2553,6 +2593,26 @@ dependencies = [
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.8.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dafd85c832c1b68bbb4ec0c72c7f6f4fc5179627d2bc7c26b30e4c0cc11e76cc"
|
||||
dependencies = [
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.8.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cb7e4e8436d9db52fbd6625dbf2f45243ab84994a72882ec8227b99e72b439a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.6"
|
||||
|
||||
@@ -40,6 +40,7 @@ thiserror = "2"
|
||||
dirs = "6"
|
||||
url = "2"
|
||||
urlencoding = "2"
|
||||
rand = "0.8"
|
||||
sha2 = "0.10"
|
||||
flate2 = "1"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
|
||||
441
README.md
441
README.md
@@ -1,6 +1,6 @@
|
||||
# Gitlore
|
||||
|
||||
Local GitLab data management with semantic search. Syncs issues, MRs, discussions, and notes from GitLab to a local SQLite database for fast, offline-capable querying and filtering.
|
||||
Local GitLab data management with semantic search. Syncs issues, MRs, discussions, and notes from GitLab to a local SQLite database for fast, offline-capable querying, filtering, and hybrid search.
|
||||
|
||||
## Features
|
||||
|
||||
@@ -9,8 +9,10 @@ Local GitLab data management with semantic search. Syncs issues, MRs, discussion
|
||||
- **Full re-sync**: Reset cursors and fetch all data from scratch when needed
|
||||
- **Multi-project**: Track issues and MRs across multiple GitLab projects
|
||||
- **Rich filtering**: Filter by state, author, assignee, labels, milestone, due date, draft status, reviewer, branches
|
||||
- **Hybrid search**: Combines FTS5 lexical search with Ollama-powered vector embeddings via Reciprocal Rank Fusion
|
||||
- **Raw payload storage**: Preserves original GitLab API responses for debugging
|
||||
- **Discussion threading**: Full support for issue and MR discussions including inline code review comments
|
||||
- **Robot mode**: Machine-readable JSON output with structured errors and meaningful exit codes
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -32,25 +34,28 @@ cargo build --release
|
||||
lore init
|
||||
|
||||
# Verify authentication
|
||||
lore auth-test
|
||||
lore auth
|
||||
|
||||
# Sync issues from GitLab
|
||||
lore ingest --type issues
|
||||
|
||||
# Sync merge requests from GitLab
|
||||
lore ingest --type mrs
|
||||
# Sync everything from GitLab (issues + MRs + docs + embeddings)
|
||||
lore sync
|
||||
|
||||
# List recent issues
|
||||
lore list issues --limit 10
|
||||
lore issues -n 10
|
||||
|
||||
# List open merge requests
|
||||
lore list mrs --state opened
|
||||
lore mrs -s opened
|
||||
|
||||
# Show issue details
|
||||
lore show issue 123 --project group/repo
|
||||
lore issues 123
|
||||
|
||||
# Show MR details with discussions
|
||||
lore show mr 456 --project group/repo
|
||||
lore mrs 456
|
||||
|
||||
# Search across all indexed data
|
||||
lore search "authentication bug"
|
||||
|
||||
# Robot mode (machine-readable JSON)
|
||||
lore -J issues -n 5 | jq .
|
||||
```
|
||||
|
||||
## Configuration
|
||||
@@ -79,6 +84,12 @@ Configuration is stored in `~/.config/lore/config.json` (or `$XDG_CONFIG_HOME/lo
|
||||
},
|
||||
"storage": {
|
||||
"compressRawPayloads": true
|
||||
},
|
||||
"embedding": {
|
||||
"provider": "ollama",
|
||||
"model": "nomic-embed-text",
|
||||
"baseUrl": "http://localhost:11434",
|
||||
"concurrency": 4
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -87,9 +98,9 @@ Configuration is stored in `~/.config/lore/config.json` (or `$XDG_CONFIG_HOME/lo
|
||||
|
||||
| Section | Field | Default | Description |
|
||||
|---------|-------|---------|-------------|
|
||||
| `gitlab` | `baseUrl` | — | GitLab instance URL (required) |
|
||||
| `gitlab` | `baseUrl` | -- | GitLab instance URL (required) |
|
||||
| `gitlab` | `tokenEnvVar` | `GITLAB_TOKEN` | Environment variable containing API token |
|
||||
| `projects` | `path` | — | Project path (e.g., `group/project`) |
|
||||
| `projects` | `path` | -- | Project path (e.g., `group/project`) |
|
||||
| `sync` | `backfillDays` | `14` | Days to backfill on initial sync |
|
||||
| `sync` | `staleLockMinutes` | `10` | Minutes before sync lock considered stale |
|
||||
| `sync` | `heartbeatIntervalSeconds` | `30` | Frequency of lock heartbeat updates |
|
||||
@@ -107,7 +118,7 @@ Configuration is stored in `~/.config/lore/config.json` (or `$XDG_CONFIG_HOME/lo
|
||||
### Config File Resolution
|
||||
|
||||
The config file is resolved in this order:
|
||||
1. `--config` CLI flag
|
||||
1. `--config` / `-c` CLI flag
|
||||
2. `LORE_CONFIG_PATH` environment variable
|
||||
3. `~/.config/lore/config.json` (XDG default)
|
||||
4. `./lore.config.json` (local fallback for development)
|
||||
@@ -116,7 +127,7 @@ The config file is resolved in this order:
|
||||
|
||||
Create a personal access token with `read_api` scope:
|
||||
|
||||
1. Go to GitLab → Settings → Access Tokens
|
||||
1. Go to GitLab > Settings > Access Tokens
|
||||
2. Create token with `read_api` scope
|
||||
3. Export it: `export GITLAB_TOKEN=glpat-xxxxxxxxxxxx`
|
||||
|
||||
@@ -126,12 +137,185 @@ Create a personal access token with `read_api` scope:
|
||||
|----------|---------|----------|
|
||||
| `GITLAB_TOKEN` | GitLab API authentication token (name configurable via `gitlab.tokenEnvVar`) | Yes |
|
||||
| `LORE_CONFIG_PATH` | Override config file location | No |
|
||||
| `LORE_ROBOT` | Enable robot mode globally (set to `true` or `1`) | No |
|
||||
| `XDG_CONFIG_HOME` | XDG Base Directory for config (fallback: `~/.config`) | No |
|
||||
| `XDG_DATA_HOME` | XDG Base Directory for data (fallback: `~/.local/share`) | No |
|
||||
| `RUST_LOG` | Logging level filter (e.g., `lore=debug`) | No |
|
||||
|
||||
## Commands
|
||||
|
||||
### `lore issues`
|
||||
|
||||
Query issues from local database, or show a specific issue.
|
||||
|
||||
```bash
|
||||
lore issues # Recent issues (default 50)
|
||||
lore issues 123 # Show issue #123 with discussions
|
||||
lore issues 123 -p group/repo # Disambiguate by project
|
||||
lore issues -n 100 # More results
|
||||
lore issues -s opened # Only open issues
|
||||
lore issues -s closed # Only closed issues
|
||||
lore issues -a username # By author (@ prefix optional)
|
||||
lore issues -A username # By assignee (@ prefix optional)
|
||||
lore issues -l bug # By label (AND logic)
|
||||
lore issues -l bug -l urgent # Multiple labels
|
||||
lore issues -m "v1.0" # By milestone title
|
||||
lore issues --since 7d # Updated in last 7 days
|
||||
lore issues --since 2w # Updated in last 2 weeks
|
||||
lore issues --since 2024-01-01 # Updated since date
|
||||
lore issues --due-before 2024-12-31 # Due before date
|
||||
lore issues --has-due # Only issues with due dates
|
||||
lore issues -p group/repo # Filter by project
|
||||
lore issues --sort created --asc # Sort by created date, ascending
|
||||
lore issues -o # Open first result in browser
|
||||
```
|
||||
|
||||
When listing, output includes: IID, title, state, author, assignee, labels, and update time.
|
||||
|
||||
When showing a single issue (e.g., `lore issues 123`), output includes: title, description, state, author, assignees, labels, milestone, due date, web URL, and threaded discussions.
|
||||
|
||||
### `lore mrs`
|
||||
|
||||
Query merge requests from local database, or show a specific MR.
|
||||
|
||||
```bash
|
||||
lore mrs # Recent MRs (default 50)
|
||||
lore mrs 456 # Show MR !456 with discussions
|
||||
lore mrs 456 -p group/repo # Disambiguate by project
|
||||
lore mrs -n 100 # More results
|
||||
lore mrs -s opened # Only open MRs
|
||||
lore mrs -s merged # Only merged MRs
|
||||
lore mrs -s closed # Only closed MRs
|
||||
lore mrs -s locked # Only locked MRs
|
||||
lore mrs -s all # All states
|
||||
lore mrs -a username # By author (@ prefix optional)
|
||||
lore mrs -A username # By assignee (@ prefix optional)
|
||||
lore mrs -r username # By reviewer (@ prefix optional)
|
||||
lore mrs -d # Only draft/WIP MRs
|
||||
lore mrs -D # Exclude draft MRs
|
||||
lore mrs --target main # By target branch
|
||||
lore mrs --source feature/foo # By source branch
|
||||
lore mrs -l needs-review # By label (AND logic)
|
||||
lore mrs --since 7d # Updated in last 7 days
|
||||
lore mrs -p group/repo # Filter by project
|
||||
lore mrs --sort created --asc # Sort by created date, ascending
|
||||
lore mrs -o # Open first result in browser
|
||||
```
|
||||
|
||||
When listing, output includes: IID, title (with [DRAFT] prefix if applicable), state, author, assignee, labels, and update time.
|
||||
|
||||
When showing a single MR (e.g., `lore mrs 456`), output includes: title, description, state, draft status, author, assignees, reviewers, labels, source/target branches, merge status, web URL, and threaded discussions. Inline code review comments (DiffNotes) display file context in the format `[src/file.ts:45]`.
|
||||
|
||||
### `lore search`
|
||||
|
||||
Search across indexed documents using hybrid (lexical + semantic), lexical-only, or semantic-only modes.
|
||||
|
||||
```bash
|
||||
lore search "authentication bug" # Hybrid search (default)
|
||||
lore search "login flow" --mode lexical # FTS5 lexical only
|
||||
lore search "login flow" --mode semantic # Vector similarity only
|
||||
lore search "auth" --type issue # Filter by source type
|
||||
lore search "auth" --type mr # MR documents only
|
||||
lore search "auth" --type discussion # Discussion documents only
|
||||
lore search "deploy" --author username # Filter by author
|
||||
lore search "deploy" -p group/repo # Filter by project
|
||||
lore search "deploy" --label backend # Filter by label (AND logic)
|
||||
lore search "deploy" --path src/ # Filter by file path (trailing / for prefix)
|
||||
lore search "deploy" --after 7d # Created after (7d, 2w, or YYYY-MM-DD)
|
||||
lore search "deploy" --updated-after 2w # Updated after
|
||||
lore search "deploy" -n 50 # Limit results (default 20, max 100)
|
||||
lore search "deploy" --explain # Show ranking explanation per result
|
||||
lore search "deploy" --fts-mode raw # Raw FTS5 query syntax (advanced)
|
||||
```
|
||||
|
||||
Requires `lore generate-docs` (or `lore sync`) to have been run at least once. Semantic mode requires Ollama with the configured embedding model.
|
||||
|
||||
### `lore sync`
|
||||
|
||||
Run the full sync pipeline: ingest from GitLab, generate searchable documents, and compute embeddings.
|
||||
|
||||
```bash
|
||||
lore sync # Full pipeline
|
||||
lore sync --full # Reset cursors, fetch everything
|
||||
lore sync --force # Override stale lock
|
||||
lore sync --no-embed # Skip embedding step
|
||||
lore sync --no-docs # Skip document regeneration
|
||||
```
|
||||
|
||||
### `lore ingest`
|
||||
|
||||
Sync data from GitLab to local database. Runs only the ingestion step (no doc generation or embeddings).
|
||||
|
||||
```bash
|
||||
lore ingest # Ingest everything (issues + MRs)
|
||||
lore ingest issues # Issues only
|
||||
lore ingest mrs # MRs only
|
||||
lore ingest issues -p group/repo # Single project
|
||||
lore ingest --force # Override stale lock
|
||||
lore ingest --full # Full re-sync (reset cursors)
|
||||
```
|
||||
|
||||
The `--full` flag resets sync cursors and discussion watermarks, then fetches all data from scratch. Useful when:
|
||||
- Assignee data or other fields were missing from earlier syncs
|
||||
- You want to ensure complete data after schema changes
|
||||
- Troubleshooting sync issues
|
||||
|
||||
### `lore generate-docs`
|
||||
|
||||
Extract searchable documents from ingested issues, MRs, and discussions for the FTS5 index.
|
||||
|
||||
```bash
|
||||
lore generate-docs # Incremental (dirty items only)
|
||||
lore generate-docs --full # Full rebuild
|
||||
lore generate-docs -p group/repo # Single project
|
||||
```
|
||||
|
||||
### `lore embed`
|
||||
|
||||
Generate vector embeddings for documents via Ollama. Requires Ollama running with the configured embedding model.
|
||||
|
||||
```bash
|
||||
lore embed # Embed new/changed documents
|
||||
lore embed --retry-failed # Retry previously failed embeddings
|
||||
```
|
||||
|
||||
### `lore count`
|
||||
|
||||
Count entities in local database.
|
||||
|
||||
```bash
|
||||
lore count issues # Total issues
|
||||
lore count mrs # Total MRs (with state breakdown)
|
||||
lore count discussions # Total discussions
|
||||
lore count discussions --for issue # Issue discussions only
|
||||
lore count discussions --for mr # MR discussions only
|
||||
lore count notes # Total notes (system vs user breakdown)
|
||||
lore count notes --for issue # Issue notes only
|
||||
```
|
||||
|
||||
### `lore stats`
|
||||
|
||||
Show document and index statistics, with optional integrity checks.
|
||||
|
||||
```bash
|
||||
lore stats # Document and index statistics
|
||||
lore stats --check # Run integrity checks
|
||||
lore stats --check --repair # Repair integrity issues
|
||||
```
|
||||
|
||||
### `lore status`
|
||||
|
||||
Show current sync state and watermarks.
|
||||
|
||||
```bash
|
||||
lore status
|
||||
```
|
||||
|
||||
Displays:
|
||||
- Last sync run details (status, timing)
|
||||
- Cursor positions per project and resource type (issues and MRs)
|
||||
- Data summary counts
|
||||
|
||||
### `lore init`
|
||||
|
||||
Initialize configuration and database interactively.
|
||||
@@ -142,12 +326,12 @@ lore init --force # Overwrite existing config
|
||||
lore init --non-interactive # Fail if prompts needed
|
||||
```
|
||||
|
||||
### `lore auth-test`
|
||||
### `lore auth`
|
||||
|
||||
Verify GitLab authentication is working.
|
||||
|
||||
```bash
|
||||
lore auth-test
|
||||
lore auth
|
||||
# Authenticated as @username (Full Name)
|
||||
# GitLab: https://gitlab.com
|
||||
```
|
||||
@@ -157,8 +341,7 @@ lore auth-test
|
||||
Check environment health and configuration.
|
||||
|
||||
```bash
|
||||
lore doctor # Human-readable output
|
||||
lore doctor --json # JSON output for scripting
|
||||
lore doctor
|
||||
```
|
||||
|
||||
Checks performed:
|
||||
@@ -168,132 +351,6 @@ Checks performed:
|
||||
- Project accessibility
|
||||
- Ollama connectivity (optional)
|
||||
|
||||
### `lore ingest`
|
||||
|
||||
Sync data from GitLab to local database.
|
||||
|
||||
```bash
|
||||
# Issues
|
||||
lore ingest --type issues # Sync all projects
|
||||
lore ingest --type issues --project group/repo # Single project
|
||||
lore ingest --type issues --force # Override stale lock
|
||||
lore ingest --type issues --full # Full re-sync (reset cursors)
|
||||
|
||||
# Merge Requests
|
||||
lore ingest --type mrs # Sync all projects
|
||||
lore ingest --type mrs --project group/repo # Single project
|
||||
lore ingest --type mrs --full # Full re-sync (reset cursors)
|
||||
```
|
||||
|
||||
The `--full` flag resets sync cursors and discussion watermarks, then fetches all data from scratch. Useful when:
|
||||
- Assignee data or other fields were missing from earlier syncs
|
||||
- You want to ensure complete data after schema changes
|
||||
- Troubleshooting sync issues
|
||||
|
||||
### `lore list issues`
|
||||
|
||||
Query issues from local database.
|
||||
|
||||
```bash
|
||||
lore list issues # Recent issues (default 50)
|
||||
lore list issues --limit 100 # More results
|
||||
lore list issues --state opened # Only open issues
|
||||
lore list issues --state closed # Only closed issues
|
||||
lore list issues --author username # By author (@ prefix optional)
|
||||
lore list issues --assignee username # By assignee (@ prefix optional)
|
||||
lore list issues --label bug # By label (AND logic)
|
||||
lore list issues --label bug --label urgent # Multiple labels
|
||||
lore list issues --milestone "v1.0" # By milestone title
|
||||
lore list issues --since 7d # Updated in last 7 days
|
||||
lore list issues --since 2w # Updated in last 2 weeks
|
||||
lore list issues --since 2024-01-01 # Updated since date
|
||||
lore list issues --due-before 2024-12-31 # Due before date
|
||||
lore list issues --has-due-date # Only issues with due dates
|
||||
lore list issues --project group/repo # Filter by project
|
||||
lore list issues --sort created --order asc # Sort options
|
||||
lore list issues --open # Open first result in browser
|
||||
lore list issues --json # JSON output
|
||||
```
|
||||
|
||||
Output includes: IID, title, state, author, assignee, labels, and update time.
|
||||
|
||||
### `lore list mrs`
|
||||
|
||||
Query merge requests from local database.
|
||||
|
||||
```bash
|
||||
lore list mrs # Recent MRs (default 50)
|
||||
lore list mrs --limit 100 # More results
|
||||
lore list mrs --state opened # Only open MRs
|
||||
lore list mrs --state merged # Only merged MRs
|
||||
lore list mrs --state closed # Only closed MRs
|
||||
lore list mrs --state locked # Only locked MRs
|
||||
lore list mrs --state all # All states
|
||||
lore list mrs --author username # By author (@ prefix optional)
|
||||
lore list mrs --assignee username # By assignee (@ prefix optional)
|
||||
lore list mrs --reviewer username # By reviewer (@ prefix optional)
|
||||
lore list mrs --draft # Only draft/WIP MRs
|
||||
lore list mrs --no-draft # Exclude draft MRs
|
||||
lore list mrs --target-branch main # By target branch
|
||||
lore list mrs --source-branch feature/foo # By source branch
|
||||
lore list mrs --label needs-review # By label (AND logic)
|
||||
lore list mrs --since 7d # Updated in last 7 days
|
||||
lore list mrs --project group/repo # Filter by project
|
||||
lore list mrs --sort created --order asc # Sort options
|
||||
lore list mrs --open # Open first result in browser
|
||||
lore list mrs --json # JSON output
|
||||
```
|
||||
|
||||
Output includes: IID, title (with [DRAFT] prefix if applicable), state, author, assignee, labels, and update time.
|
||||
|
||||
### `lore show issue`
|
||||
|
||||
Display detailed issue information.
|
||||
|
||||
```bash
|
||||
lore show issue 123 # Show issue #123
|
||||
lore show issue 123 --project group/repo # Disambiguate if needed
|
||||
```
|
||||
|
||||
Shows: title, description, state, author, assignees, labels, milestone, due date, web URL, and threaded discussions.
|
||||
|
||||
### `lore show mr`
|
||||
|
||||
Display detailed merge request information.
|
||||
|
||||
```bash
|
||||
lore show mr 456 # Show MR !456
|
||||
lore show mr 456 --project group/repo # Disambiguate if needed
|
||||
```
|
||||
|
||||
Shows: title, description, state, draft status, author, assignees, reviewers, labels, source/target branches, merge status, web URL, and threaded discussions. Inline code review comments (DiffNotes) display file context in the format `[src/file.ts:45]`.
|
||||
|
||||
### `lore count`
|
||||
|
||||
Count entities in local database.
|
||||
|
||||
```bash
|
||||
lore count issues # Total issues
|
||||
lore count mrs # Total MRs (with state breakdown)
|
||||
lore count discussions # Total discussions
|
||||
lore count discussions --type issue # Issue discussions only
|
||||
lore count discussions --type mr # MR discussions only
|
||||
lore count notes # Total notes (shows system vs user breakdown)
|
||||
```
|
||||
|
||||
### `lore sync-status`
|
||||
|
||||
Show current sync state and watermarks.
|
||||
|
||||
```bash
|
||||
lore sync-status
|
||||
```
|
||||
|
||||
Displays:
|
||||
- Last sync run details (status, timing)
|
||||
- Cursor positions per project and resource type (issues and MRs)
|
||||
- Data summary counts
|
||||
|
||||
### `lore migrate`
|
||||
|
||||
Run pending database migrations.
|
||||
@@ -302,8 +359,6 @@ Run pending database migrations.
|
||||
lore migrate
|
||||
```
|
||||
|
||||
Shows current schema version and applies any pending migrations.
|
||||
|
||||
### `lore version`
|
||||
|
||||
Show version information.
|
||||
@@ -312,26 +367,67 @@ Show version information.
|
||||
lore version
|
||||
```
|
||||
|
||||
### `lore backup`
|
||||
## Robot Mode
|
||||
|
||||
Create timestamped database backup.
|
||||
Machine-readable JSON output for scripting and AI agent consumption.
|
||||
|
||||
### Activation
|
||||
|
||||
```bash
|
||||
lore backup
|
||||
# Global flag
|
||||
lore --robot issues -n 5
|
||||
|
||||
# JSON shorthand (-J)
|
||||
lore -J issues -n 5
|
||||
|
||||
# Environment variable
|
||||
LORE_ROBOT=1 lore issues -n 5
|
||||
|
||||
# Auto-detection (when stdout is not a TTY)
|
||||
lore issues -n 5 | jq .
|
||||
```
|
||||
|
||||
*Note: Not yet implemented.*
|
||||
### Response Format
|
||||
|
||||
### `lore reset`
|
||||
All commands return consistent JSON:
|
||||
|
||||
Delete database and reset all state.
|
||||
```json
|
||||
{"ok": true, "data": {...}, "meta": {...}}
|
||||
```
|
||||
|
||||
Errors return structured JSON to stderr:
|
||||
|
||||
```json
|
||||
{"error": {"code": "CONFIG_NOT_FOUND", "message": "...", "suggestion": "Run 'lore init'"}}
|
||||
```
|
||||
|
||||
### Exit Codes
|
||||
|
||||
| Code | Meaning |
|
||||
|------|---------|
|
||||
| 0 | Success |
|
||||
| 1 | Internal error |
|
||||
| 2 | Config not found |
|
||||
| 3 | Config invalid |
|
||||
| 4 | Token not set |
|
||||
| 5 | GitLab auth failed |
|
||||
| 6 | Resource not found |
|
||||
| 7 | Rate limited |
|
||||
| 8 | Network error |
|
||||
| 9 | Database locked |
|
||||
| 10 | Database error |
|
||||
| 11 | Migration failed |
|
||||
| 12 | I/O error |
|
||||
| 13 | Transform error |
|
||||
|
||||
## Global Options
|
||||
|
||||
```bash
|
||||
lore reset --confirm
|
||||
lore -c /path/to/config.json <command> # Use alternate config
|
||||
lore --robot <command> # Machine-readable JSON
|
||||
lore -J <command> # JSON shorthand
|
||||
```
|
||||
|
||||
*Note: Not yet implemented.*
|
||||
|
||||
## Database Schema
|
||||
|
||||
Data is stored in SQLite with WAL mode and foreign keys enabled. Main tables:
|
||||
@@ -350,6 +446,9 @@ Data is stored in SQLite with WAL mode and foreign keys enabled. Main tables:
|
||||
| `mr_reviewers` | Many-to-many MR-reviewer relationships |
|
||||
| `discussions` | Issue/MR discussion threads |
|
||||
| `notes` | Individual notes within discussions (with system note flag and DiffNote position data) |
|
||||
| `documents` | Extracted searchable text for FTS and embedding |
|
||||
| `documents_fts` | FTS5 full-text search index |
|
||||
| `embeddings` | Vector embeddings for semantic search |
|
||||
| `sync_runs` | Audit trail of sync operations |
|
||||
| `sync_cursors` | Cursor positions for incremental sync |
|
||||
| `app_locks` | Crash-safe single-flight lock |
|
||||
@@ -358,12 +457,6 @@ Data is stored in SQLite with WAL mode and foreign keys enabled. Main tables:
|
||||
|
||||
The database is stored at `~/.local/share/lore/lore.db` by default (XDG compliant).
|
||||
|
||||
## Global Options
|
||||
|
||||
```bash
|
||||
lore --config /path/to/config.json <command> # Use alternate config
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
@@ -371,10 +464,10 @@ lore --config /path/to/config.json <command> # Use alternate config
|
||||
cargo test
|
||||
|
||||
# Run with debug logging
|
||||
RUST_LOG=lore=debug lore list issues
|
||||
RUST_LOG=lore=debug lore issues
|
||||
|
||||
# Run with trace logging
|
||||
RUST_LOG=lore=trace lore ingest --type issues
|
||||
RUST_LOG=lore=trace lore ingest issues
|
||||
|
||||
# Check formatting
|
||||
cargo fmt --check
|
||||
@@ -386,7 +479,8 @@ cargo clippy
|
||||
## Tech Stack
|
||||
|
||||
- **Rust** (2024 edition)
|
||||
- **SQLite** via rusqlite (bundled)
|
||||
- **SQLite** via rusqlite (bundled) with FTS5 and sqlite-vec
|
||||
- **Ollama** for vector embeddings (nomic-embed-text)
|
||||
- **clap** for CLI parsing
|
||||
- **reqwest** for HTTP
|
||||
- **tokio** for async runtime
|
||||
@@ -394,23 +488,6 @@ cargo clippy
|
||||
- **tracing** for logging
|
||||
- **indicatif** for progress bars
|
||||
|
||||
## Current Status
|
||||
|
||||
This is Checkpoint 2 (CP2) of the Gitlore project. Currently implemented:
|
||||
|
||||
- Issue ingestion with cursor-based incremental sync
|
||||
- Merge request ingestion with cursor-based incremental sync
|
||||
- Discussion and note syncing for issues and MRs
|
||||
- DiffNote support for inline code review comments
|
||||
- Rich filtering and querying for both issues and MRs
|
||||
- Full re-sync capability with watermark reset
|
||||
|
||||
Not yet implemented:
|
||||
- Semantic search with embeddings (CP3+)
|
||||
- Backup and reset commands
|
||||
|
||||
See [SPEC.md](SPEC.md) for the full project roadmap and architecture.
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
||||
354
docs/api-efficiency-findings.md
Normal file
354
docs/api-efficiency-findings.md
Normal file
@@ -0,0 +1,354 @@
|
||||
# API Efficiency & Observability Findings
|
||||
|
||||
> **Status:** Draft - working through items
|
||||
> **Context:** Audit of gitlore's GitLab API usage, data processing, and observability gaps
|
||||
> **Interactive reference:** `api-review.html` (root of repo, open in browser)
|
||||
|
||||
---
|
||||
|
||||
## Checkpoint 3 Alignment
|
||||
|
||||
Checkpoint 3 (`docs/prd/checkpoint-3.md`) introduces `lore sync` orchestration, document generation, and search. Several findings here overlap with that work. This section maps the relationship so effort isn't duplicated and so CP3 implementation can absorb the right instrumentation as it's built.
|
||||
|
||||
### Direct overlaps (CP3 partially addresses)
|
||||
|
||||
| Finding | CP3 coverage | Remaining gap |
|
||||
|---------|-------------|---------------|
|
||||
| **P0-1** sync_runs never written | `lore sync` step 7 says "record sync_run". `SyncResult` struct defined with counts. | Only covers the new `lore sync` command. Existing `lore ingest` still won't write sync_runs. Either instrument `lore ingest` separately or have `lore sync` subsume it entirely. |
|
||||
| **P0-2** No timing | `print_sync` captures wall-clock `elapsed_secs` / `elapsed_ms` in robot mode JSON `meta` envelope. | Wall-clock only. No per-phase, per-API-call, or per-DB-write breakdown. The `SyncResult` struct has counts but no duration fields. |
|
||||
| **P2-1** Discussion full-refresh | CP3 introduces `pending_discussion_fetches` queue with exponential backoff and bounded processing per sync. Structures the work better. | Same full-refresh strategy per entity. The queue adds retry resilience but doesn't reduce the number of API calls for unchanged discussions. |
|
||||
|
||||
### Different scope (complementary, no overlap)
|
||||
|
||||
| Finding | Why no overlap |
|
||||
|---------|---------------|
|
||||
| **P0-3** metrics_json schema | CP3 doesn't reference the `metrics_json` column. `SyncResult` is printed/returned but not persisted there. |
|
||||
| **P0-4** Discussion sync telemetry columns | CP3's queue system (`pending_discussion_fetches`) is a replacement architecture. The existing per-MR telemetry columns (`discussions_sync_attempts`, `_last_error`) aren't referenced in CP3. Decide: use CP3's queue table or wire up the existing columns? |
|
||||
| **P0-5** Progress events lack timing | CP3 lists "Progress visible during long syncs" as acceptance criteria but doesn't spec timing in events. |
|
||||
| **P1-\*** Free data capture | CP3 doesn't touch GitLab API response field coverage at all. These are independent. |
|
||||
| **P2-2** Keyset pagination (GitLab API) | CP3 uses keyset pagination for local SQLite queries (document seeding, embedding pipelines). Completely different from using GitLab API keyset pagination. |
|
||||
| **P2-3** ETags | Not mentioned in CP3. |
|
||||
| **P2-4** Labels enrichment | Not mentioned in CP3. |
|
||||
| **P3-\*** Structural improvements | Not in CP3 scope. |
|
||||
|
||||
### Recommendation
|
||||
|
||||
CP3's `lore sync` orchestrator is the natural integration point for P0 instrumentation. Rather than retrofitting `lore ingest` separately, the most efficient path is:
|
||||
|
||||
1. Build P0 timing instrumentation as a reusable layer (e.g., a `SyncMetrics` struct that accumulates phase timings)
|
||||
2. Wire it into the CP3 `run_sync` implementation as it's built
|
||||
3. Have `run_sync` persist the full metrics (counts + timing) to `sync_runs.metrics_json`
|
||||
4. Decide whether `lore ingest` becomes a thin wrapper around `lore sync --no-docs --no-embed` or stays separate with its own sync_runs recording
|
||||
|
||||
This avoids building instrumentation twice and ensures the new sync pipeline is observable from day one.
|
||||
|
||||
### Decision: `lore ingest` goes away
|
||||
|
||||
`lore sync` becomes the single command for all data fetching. First run does a full fetch (equivalent to today's `lore ingest`), subsequent runs are incremental via cursors. `lore ingest` becomes a hidden deprecated alias.
|
||||
|
||||
Implications:
|
||||
- P0 instrumentation only needs to be built in one place (`run_sync`)
|
||||
- CP3 Gate C owns the sync_runs lifecycle end-to-end
|
||||
- The existing `lore ingest issues` / `lore ingest mrs` code becomes internal functions called by `run_sync`, not standalone CLI commands
|
||||
- `lore sync` always syncs everything: issues, MRs, discussions, documents, embeddings (with `--no-embed` / `--no-docs` to opt out of later stages)
|
||||
|
||||
---
|
||||
|
||||
## Implementation Sequence
|
||||
|
||||
### Phase A: Before CP3 (independent, enriches data model)
|
||||
|
||||
**Do first.** Migration + struct changes only. No architectural dependency. Gets richer source data into the DB before CP3's document generation pipeline locks in its schema.
|
||||
|
||||
1. **P1 batch: free data capture** - All ~11 fields in a single migration. `user_notes_count`, `upvotes`, `downvotes`, `confidential`, `has_conflicts`, `blocking_discussions_resolved`, `merge_commit_sha`, `discussion_locked`, `task_completion_status`, `issue_type`, and issue `references`.
|
||||
2. **P1-10: MR milestones** - Reuse existing issue milestone transformer. Slightly more work, same migration.
|
||||
|
||||
### Phase B: During CP3 Gate C (`lore sync`)
|
||||
|
||||
**Build instrumentation into the sync orchestrator as it's constructed.** Not a separate effort.
|
||||
|
||||
3. **P0-1 + P0-2 + P0-3** - `SyncMetrics` struct accumulating phase timings. `run_sync` writes to `sync_runs` with full `metrics_json` on completion.
|
||||
4. **P0-4** - Decide: use CP3's `pending_discussion_fetches` queue or existing per-MR telemetry columns. Wire up the winner.
|
||||
5. **P0-5** - Add `elapsed_ms` to `*Complete` progress event variants.
|
||||
6. **Deprecate `lore ingest`** - Hidden alias pointing to `lore sync`. Remove from help output.
|
||||
|
||||
### Phase C: After CP3 ships, informed by real metrics
|
||||
|
||||
**Only pursue items that P0 data proves matter.**
|
||||
|
||||
7. **P2-1: Discussion optimization** - Check metrics_json from real runs. If discussion phase is <10% of wall-clock, skip.
|
||||
8. **P2-2: Keyset pagination** - Check primary fetch timing on largest project. If fast, skip.
|
||||
9. **P2-4: Labels enrichment** - If label colors are needed for any UI surface.
|
||||
|
||||
### Phase D: Future (needs a forcing function)
|
||||
|
||||
10. **P3-1: Users table** - When a UI needs display names / avatars.
|
||||
11. **P2-3: ETags** - Only if P2-1 doesn't sufficiently reduce discussion overhead.
|
||||
12. **P3-2/3/4: GraphQL, Events API, Webhooks** - Architectural shifts. Only if pull-based sync hits a scaling wall.
|
||||
|
||||
---
|
||||
|
||||
## Priority 0: Observability (prerequisite for everything else)
|
||||
|
||||
We can't evaluate any efficiency question without measurement. Gitlore has no runtime performance instrumentation. The infrastructure for it was scaffolded (sync_runs table, metrics_json column, discussion sync telemetry columns) but never wired up.
|
||||
|
||||
### P0-1: sync_runs table is never written to
|
||||
|
||||
**Location:** Schema in `migrations/001_initial.sql:25-34`, read in `src/cli/commands/sync_status.rs:69-72`
|
||||
|
||||
The table exists and `lore status` reads from it, but no code ever INSERTs or UPDATEs rows. The entire audit trail is empty.
|
||||
|
||||
```sql
|
||||
-- Exists in schema, never populated
|
||||
CREATE TABLE sync_runs (
|
||||
id INTEGER PRIMARY KEY,
|
||||
started_at INTEGER NOT NULL,
|
||||
heartbeat_at INTEGER NOT NULL,
|
||||
finished_at INTEGER,
|
||||
status TEXT NOT NULL, -- 'running' | 'succeeded' | 'failed'
|
||||
command TEXT NOT NULL,
|
||||
error TEXT,
|
||||
metrics_json TEXT -- never written
|
||||
);
|
||||
```
|
||||
|
||||
**What to do:** Instrument the sync orchestrator to record sync runs. (Per the "Decision: `lore ingest` goes away" section above, this instrumentation lands in `run_sync` rather than the standalone ingest commands.) Each sync invocation should:
|
||||
- INSERT a row with status='running' at start
|
||||
- UPDATE with status='succeeded'/'failed' + finished_at on completion
|
||||
- Populate metrics_json with the IngestProjectResult / IngestMrProjectResult counters
|
||||
|
||||
### P0-2: No operation timing anywhere
|
||||
|
||||
**Location:** Rate limiter in `src/gitlab/client.rs:20-65`, orchestrator in `src/ingestion/orchestrator.rs`
|
||||
|
||||
`Instant::now()` is used only for rate limiter enforcement. No operation durations are measured or logged. We don't know:
|
||||
|
||||
- How long a full issue ingest takes
|
||||
- How long discussion sync takes per entity
|
||||
- How long individual API requests take (network latency)
|
||||
- How long database writes take per batch
|
||||
- How long rate limiter sleeps accumulate to
|
||||
- How long pagination takes across pages
|
||||
|
||||
**What to do:** Add timing instrumentation at these levels:
|
||||
|
||||
| Level | What to time | Where |
|
||||
|-------|-------------|-------|
|
||||
| **Run** | Total ingest wall-clock time | orchestrator entry/exit |
|
||||
| **Phase** | Primary fetch vs discussion sync | orchestrator phase boundaries |
|
||||
| **API call** | Individual HTTP request round-trip | client.rs request method |
|
||||
| **DB write** | Transaction duration per batch | ingestion store functions |
|
||||
| **Rate limiter** | Cumulative sleep time per run | client.rs acquire() |
|
||||
|
||||
Store phase-level and run-level timing in `metrics_json`. Log API-call-level timing at debug level.
|
||||
|
||||
### P0-3: metrics_json has no defined schema
|
||||
|
||||
**What to do:** Define what goes in there. Strawman based on existing IngestProjectResult fields plus timing:
|
||||
|
||||
```json
|
||||
{
|
||||
"wall_clock_ms": 14200,
|
||||
"phases": {
|
||||
"primary_fetch": {
|
||||
"duration_ms": 8400,
|
||||
"api_calls": 12,
|
||||
"items_fetched": 1143,
|
||||
"items_upserted": 87,
|
||||
"pages": 12,
|
||||
"rate_limit_sleep_ms": 1200
|
||||
},
|
||||
"discussion_sync": {
|
||||
"duration_ms": 5800,
|
||||
"entities_checked": 87,
|
||||
"entities_synced": 14,
|
||||
"entities_skipped": 73,
|
||||
"api_calls": 22,
|
||||
"discussions_fetched": 156,
|
||||
"notes_upserted": 412,
|
||||
"rate_limit_sleep_ms": 2200
|
||||
}
|
||||
},
|
||||
"db": {
|
||||
"labels_created": 3,
|
||||
"raw_payloads_stored": 87,
|
||||
"raw_payloads_deduped": 42
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### P0-4: Discussion sync telemetry columns are dead code
|
||||
|
||||
**Location:** `merge_requests` table columns: `discussions_sync_last_attempt_at`, `discussions_sync_attempts`, `discussions_sync_last_error`
|
||||
|
||||
These exist in the schema but are never read or written. They were designed for tracking retry behavior on failed discussion syncs.
|
||||
|
||||
**What to do:** Wire these up during discussion sync. On attempt: set last_attempt_at and increment attempts. On failure: set last_error. On success: reset attempts to 0. This provides per-entity visibility into discussion sync health.
|
||||
|
||||
### P0-5: Progress events carry no timing
|
||||
|
||||
**Location:** `src/ingestion/orchestrator.rs:28-53`
|
||||
|
||||
ProgressEvent variants (`IssueFetched`, `DiscussionSynced`, etc.) carry only counts. Adding elapsed_ms to at least `*Complete` variants would give callers (CLI progress bars, robot mode output) real throughput numbers.
|
||||
|
||||
---
|
||||
|
||||
## Priority 1: Free data capture (zero API cost)
|
||||
|
||||
These fields are already in the API responses gitlore receives. Storing them requires only Rust struct additions and DB column migrations. No additional API calls.
|
||||
|
||||
### P1-1: user_notes_count (Issues + MRs)
|
||||
|
||||
**API field:** `user_notes_count` (integer)
|
||||
**Value:** Could short-circuit discussion re-sync. If count hasn't changed, discussions probably haven't changed either. Also useful for "most discussed" queries.
|
||||
**Effort:** Add field to serde struct, add DB column, store during transform.
|
||||
|
||||
### P1-2: upvotes / downvotes (Issues + MRs)
|
||||
|
||||
**API field:** `upvotes`, `downvotes` (integers)
|
||||
**Value:** Engagement metrics for triage. "Most upvoted open issues" is a common query.
|
||||
**Effort:** Same pattern as above.
|
||||
|
||||
### P1-3: confidential (Issues)
|
||||
|
||||
**API field:** `confidential` (boolean)
|
||||
**Value:** Security-sensitive filtering. Important to know when exposing issue data.
|
||||
**Effort:** Low.
|
||||
|
||||
### P1-4: has_conflicts (MRs)
|
||||
|
||||
**API field:** `has_conflicts` (boolean)
|
||||
**Value:** Identify MRs needing rebase. Useful for "stale MR" detection.
|
||||
**Effort:** Low.
|
||||
|
||||
### P1-5: blocking_discussions_resolved (MRs)
|
||||
|
||||
**API field:** `blocking_discussions_resolved` (boolean)
|
||||
**Value:** MR readiness indicator without joining the discussions table.
|
||||
**Effort:** Low.
|
||||
|
||||
### P1-6: merge_commit_sha (MRs)
|
||||
|
||||
**API field:** `merge_commit_sha` (string, nullable)
|
||||
**Value:** Trace merged MRs to specific commits in git history.
|
||||
**Effort:** Low.
|
||||
|
||||
### P1-7: discussion_locked (Issues + MRs)
|
||||
|
||||
**API field:** `discussion_locked` (boolean)
|
||||
**Value:** Know if new comments can be added. Useful for robot mode consumers.
|
||||
**Effort:** Low.
|
||||
|
||||
### P1-8: task_completion_status (Issues + MRs)
|
||||
|
||||
**API field:** `task_completion_status` (object: `{count, completed_count}`)
|
||||
**Value:** Track task-list checkbox progress without parsing markdown.
|
||||
**Effort:** Low. Store as two integer columns or a small JSON blob.
|
||||
|
||||
### P1-9: issue_type (Issues)
|
||||
|
||||
**API field:** `issue_type` (string: "issue" | "incident" | "test_case")
|
||||
**Value:** Distinguish issues vs incidents vs test cases for filtering.
|
||||
**Effort:** Low.
|
||||
|
||||
### P1-10: MR milestone (MRs)
|
||||
|
||||
**API field:** `milestone` (object, same structure as on issues)
|
||||
**Current state:** Milestones are fully stored for issues but completely ignored for MRs.
|
||||
**Value:** "Which MRs are in milestone X?" Currently impossible to query locally.
|
||||
**Effort:** Medium - reuse existing milestone transformer from issue pipeline.
|
||||
|
||||
### P1-11: Issue references (Issues)
|
||||
|
||||
**API field:** `references` (object: `{short, relative, full}`)
|
||||
**Current state:** Stored for MRs (`references_short`, `references_full`), dropped for issues.
|
||||
**Value:** Cross-project issue references (e.g., `group/project#42`).
|
||||
**Effort:** Low.
|
||||
|
||||
---
|
||||
|
||||
## Priority 2: Efficiency improvements (requires measurement from P0 first)
|
||||
|
||||
These are potential optimizations. **Do not implement until P0 instrumentation proves they matter.**
|
||||
|
||||
### P2-1: Discussion full-refresh strategy
|
||||
|
||||
**Current behavior:** When an issue/MR's `updated_at` advances, ALL its discussions are deleted and re-fetched from scratch.
|
||||
|
||||
**Potential optimization:** Use `user_notes_count` (P1-1) to detect whether discussions actually changed. Skip re-sync if count is unchanged.
|
||||
|
||||
**Why we need P0 first:** The full-refresh may be fast enough. Since we already fetch the data from GitLab, the DELETE+INSERT is just local SQLite I/O. If discussion sync for a typical entity takes <100ms locally, this isn't worth optimizing. We need the per-entity timing from P0-2 to know.
|
||||
|
||||
**Trade-offs to consider:**
|
||||
- Full-refresh catches edited and deleted notes. Incremental would miss those.
|
||||
- `user_notes_count` doesn't change when notes are edited, only when added/removed.
|
||||
- Full-refresh is simpler to reason about for consistency.
|
||||
|
||||
### P2-2: Keyset pagination
|
||||
|
||||
**Current behavior:** Offset-based (`page=N&per_page=100`).
|
||||
**Alternative:** Keyset pagination (`pagination=keyset`): each page costs O(1) server-side, whereas offset-based pagination degrades linearly as the offset grows.
|
||||
|
||||
**Why we need P0 first:** Only matters for large projects (>10K issues). Most projects will never hit enough pages for this to be measurable. P0 timing of pagination will show if this is a bottleneck.
|
||||
|
||||
**Note:** Gitlore already parses `Link` headers for next-page detection, which is the client-side mechanism keyset pagination uses. So partial support exists.
|
||||
|
||||
### P2-3: ETag / conditional requests
|
||||
|
||||
**Current behavior:** All requests are unconditional.
|
||||
**Alternative:** Cache ETags, send `If-None-Match`, get 304s back.
|
||||
|
||||
**Why we need P0 first:** The cursor-based sync already avoids re-fetching unchanged data for primary resources. ETags would mainly help with discussion re-fetches where nothing changed. If P2-1 (user_notes_count skip) is implemented, ETags become less valuable.
|
||||
|
||||
### P2-4: Labels API enrichment
|
||||
|
||||
**Current behavior:** Labels extracted from the `labels[]` string array in issue/MR responses. The `labels` table has `color` and `description` columns that may not be populated.
|
||||
**Alternative:** Single call to `GET /projects/:id/labels` per project per sync to populate label metadata.
|
||||
**Cost:** 1 API call per project per sync run.
|
||||
**Value:** Label colors for UI rendering, descriptions for tooltips.
|
||||
|
||||
---
|
||||
|
||||
## Priority 3: Structural improvements (future consideration)
|
||||
|
||||
### P3-1: Users table
|
||||
|
||||
**Current state:** Only `username` stored. Author `name`, `avatar_url`, `web_url`, `state` are in every API response but discarded.
|
||||
**Proposal:** Create a `users` table, upsert on every encounter. Zero API cost.
|
||||
**Value:** Richer user display, detect blocked/deactivated users.
|
||||
|
||||
### P3-2: GraphQL API for field-precise fetching
|
||||
|
||||
**Current state:** REST API returns ~40-50 fields per entity. Gitlore uses ~15-23.
|
||||
**Alternative:** GraphQL API allows requesting exactly the fields needed.
|
||||
**Trade-offs:** Different pagination model, potentially less stable API, more complex client code. The bandwidth savings are real but likely minor compared to discussion re-fetch overhead.
|
||||
|
||||
### P3-3: Events API for lightweight change detection
|
||||
|
||||
**Endpoint:** `GET /projects/:id/events`
|
||||
**Value:** Lightweight "has anything changed?" check before running full issue/MR sync. Could replace or supplement the cursor-based approach for very active projects.
|
||||
|
||||
### P3-4: Webhook-based push sync
|
||||
|
||||
**Endpoint:** `POST /projects/:id/hooks` (setup), then receive pushes.
|
||||
**Value:** Near-real-time sync without polling cost. Eliminates all rate-limit concerns.
|
||||
**Barrier:** Requires a listener endpoint, which changes the architecture from pull-only CLI to something with a daemon/server component.
|
||||
|
||||
---
|
||||
|
||||
## Working notes
|
||||
|
||||
_Space for recording decisions as we work through items._
|
||||
|
||||
### Decisions made
|
||||
|
||||
| Item | Decision | Rationale |
|
||||
|------|----------|-----------|
|
||||
| `lore ingest` | Remove. `lore sync` is the single entry point. | No reason to separate initial load from incremental updates. First run = full fetch, subsequent = cursor-based delta. |
|
||||
| CP3 alignment | Build P0 instrumentation into CP3 Gate C, not separately. | Avoids building in two places. `lore sync` owns the full lifecycle. |
|
||||
| P2 timing | Defer all efficiency optimizations until P0 metrics from real runs are available. | Can't evaluate trade-offs without measurement. |
|
||||
|
||||
### Open questions
|
||||
|
||||
- What's the typical project size (issue/MR count) for gitlore users? This determines whether keyset pagination (P2-2) matters.
|
||||
- Is there a plan for a web UI or TUI? That would increase the value of P3-1 (users table) and P2-4 (label colors).
|
||||
456
docs/phase-a-spec.md
Normal file
456
docs/phase-a-spec.md
Normal file
@@ -0,0 +1,456 @@
|
||||
# Phase A: Complete API Field Capture
|
||||
|
||||
> **Status:** Draft
|
||||
> **Guiding principle:** Mirror everything GitLab gives us.
|
||||
> - **Lossless mirror:** the raw API JSON stored behind `raw_payload_id`. This is the true complete representation of every API response.
|
||||
> - **Relational projection:** a stable, query-optimized subset of fields we commit to keeping current on every re-sync.
|
||||
> This preserves maximum context for processing and analysis while avoiding unbounded schema growth.
|
||||
> **Migration:** 007_complete_field_capture.sql
|
||||
> **Prerequisite:** None (independent of CP3)
|
||||
|
||||
---
|
||||
|
||||
## Scope
|
||||
|
||||
One migration. Three categories of work:
|
||||
|
||||
1. **New columns** on `issues` and `merge_requests` for fields currently dropped by serde or dropped during transform
|
||||
2. **New serde fields** on `GitLabIssue` and `GitLabMergeRequest` to deserialize currently-silently-dropped JSON fields
|
||||
3. **Transformer + insert updates** to pass the new fields through to the DB
|
||||
|
||||
No new tables. No new API calls. No new endpoints. All data comes from responses we already receive.
|
||||
|
||||
---
|
||||
|
||||
## Issues: Field Gap Inventory
|
||||
|
||||
### Currently stored
|
||||
id, iid, project_id, title, description, state, author_username, created_at, updated_at, web_url, due_date, milestone_id, milestone_title, raw_payload_id, last_seen_at, discussions_synced_for_updated_at, labels (junction), assignees (junction)
|
||||
|
||||
### Currently deserialized but dropped during transform
|
||||
| API Field | Status | Action |
|
||||
|-----------|--------|--------|
|
||||
| `closed_at` | Deserialized in serde struct, but no DB column exists and transformer never populates it | Add column in migration 007, wire up in IssueRow + transform + INSERT |
|
||||
| `author.id` | Deserialized | Store as `author_id` column |
|
||||
| `author.name` | Deserialized | Store as `author_name` column |
|
||||
|
||||
### Currently silently dropped by serde (not in GitLabIssue struct)
|
||||
| API Field | Type | DB Column | Notes |
|
||||
|-----------|------|-----------|-------|
|
||||
| `issue_type` | Option\<String\> | `issue_type` | Canonical field (lowercase, e.g. "issue"); preferred for DB storage |
|
||||
| `upvotes` | i64 | `upvotes` | |
|
||||
| `downvotes` | i64 | `downvotes` | |
|
||||
| `user_notes_count` | i64 | `user_notes_count` | Useful for discussion sync optimization |
|
||||
| `merge_requests_count` | i64 | `merge_requests_count` | Count of linked MRs |
|
||||
| `confidential` | bool | `confidential` | 0/1 |
|
||||
| `discussion_locked` | bool | `discussion_locked` | 0/1 |
|
||||
| `weight` | Option\<i64\> | `weight` | Premium/Ultimate, null on Free |
|
||||
| `time_stats.time_estimate` | i64 | `time_estimate` | Seconds |
|
||||
| `time_stats.total_time_spent` | i64 | `time_spent` | Seconds |
|
||||
| `time_stats.human_time_estimate` | Option\<String\> | `human_time_estimate` | e.g. "3h 30m" |
|
||||
| `time_stats.human_total_time_spent` | Option\<String\> | `human_time_spent` | e.g. "1h 15m" |
|
||||
| `task_completion_status.count` | i64 | `task_count` | Checkbox total |
|
||||
| `task_completion_status.completed_count` | i64 | `task_completed_count` | Checkboxes checked |
|
||||
| `has_tasks` | bool | `has_tasks` | 0/1 |
|
||||
| `severity` | Option\<String\> | `severity` | Incident severity |
|
||||
| `closed_by` | Option\<object\> | `closed_by_username` | Who closed it (username only, consistent with author pattern) |
|
||||
| `imported` | bool | `imported` | 0/1 |
|
||||
| `imported_from` | Option\<String\> | `imported_from` | Import source |
|
||||
| `moved_to_id` | Option\<i64\> | `moved_to_id` | Target issue if moved |
|
||||
| `references.short` | String | `references_short` | e.g. "#42" |
|
||||
| `references.relative` | String | `references_relative` | e.g. "#42" or "group/proj#42" |
|
||||
| `references.full` | String | `references_full` | e.g. "group/project#42" |
|
||||
| `health_status` | Option\<String\> | `health_status` | Ultimate only |
|
||||
| `type` | Option\<String\> | (transform-only) | Uppercase category (e.g. "ISSUE"); fallback for `issue_type` — lowercased before storage. Not stored as separate column; raw JSON remains lossless. |
|
||||
| `epic.id` | Option\<i64\> | `epic_id` | Premium/Ultimate, null on Free |
|
||||
| `epic.iid` | Option\<i64\> | `epic_iid` | |
|
||||
| `epic.title` | Option\<String\> | `epic_title` | |
|
||||
| `epic.url` | Option\<String\> | `epic_url` | |
|
||||
| `epic.group_id` | Option\<i64\> | `epic_group_id` | |
|
||||
| `iteration.id` | Option\<i64\> | `iteration_id` | Premium/Ultimate, null on Free |
|
||||
| `iteration.iid` | Option\<i64\> | `iteration_iid` | |
|
||||
| `iteration.title` | Option\<String\> | `iteration_title` | |
|
||||
| `iteration.state` | Option\<i64\> | `iteration_state` | Enum: 1=upcoming, 2=current, 3=closed |
|
||||
| `iteration.start_date` | Option\<String\> | `iteration_start_date` | ISO date |
|
||||
| `iteration.due_date` | Option\<String\> | `iteration_due_date` | ISO date |
|
||||
|
||||
---
|
||||
|
||||
## Merge Requests: Field Gap Inventory
|
||||
|
||||
### Currently stored
|
||||
id, iid, project_id, title, description, state, draft, author_username, source_branch, target_branch, head_sha, references_short, references_full, detailed_merge_status, merge_user_username, created_at, updated_at, merged_at, closed_at, last_seen_at, web_url, raw_payload_id, discussions_synced_for_updated_at, discussions_sync_last_attempt_at, discussions_sync_attempts, discussions_sync_last_error, labels (junction), assignees (junction), reviewers (junction)
|
||||
|
||||
### Currently deserialized but dropped during transform
|
||||
| API Field | Status | Action |
|
||||
|-----------|--------|--------|
|
||||
| `author.id` | Deserialized | Store as `author_id` column |
|
||||
| `author.name` | Deserialized | Store as `author_name` column |
|
||||
| `work_in_progress` | Used transiently for `draft` fallback | Already handled, no change needed |
|
||||
| `merge_status` (legacy) | Used transiently for `detailed_merge_status` fallback | Already handled, no change needed |
|
||||
| `merged_by` | Used transiently for `merge_user` fallback | Already handled, no change needed |
|
||||
|
||||
### Currently silently dropped by serde (not in GitLabMergeRequest struct)
|
||||
| API Field | Type | DB Column | Notes |
|
||||
|-----------|------|-----------|-------|
|
||||
| `upvotes` | i64 | `upvotes` | |
|
||||
| `downvotes` | i64 | `downvotes` | |
|
||||
| `user_notes_count` | i64 | `user_notes_count` | |
|
||||
| `source_project_id` | i64 | `source_project_id` | Fork source |
|
||||
| `target_project_id` | i64 | `target_project_id` | Fork target |
|
||||
| `milestone` | Option\<object\> | `milestone_id`, `milestone_title` | Reuse issue milestone pattern |
|
||||
| `merge_when_pipeline_succeeds` | bool | `merge_when_pipeline_succeeds` | 0/1, auto-merge flag |
|
||||
| `merge_commit_sha` | Option\<String\> | `merge_commit_sha` | Commit ref after merge |
|
||||
| `squash_commit_sha` | Option\<String\> | `squash_commit_sha` | Commit ref after squash |
|
||||
| `discussion_locked` | bool | `discussion_locked` | 0/1 |
|
||||
| `should_remove_source_branch` | Option\<bool\> | `should_remove_source_branch` | 0/1 |
|
||||
| `force_remove_source_branch` | Option\<bool\> | `force_remove_source_branch` | 0/1 |
|
||||
| `squash` | bool | `squash` | 0/1 |
|
||||
| `squash_on_merge` | bool | `squash_on_merge` | 0/1 |
|
||||
| `has_conflicts` | bool | `has_conflicts` | 0/1 |
|
||||
| `blocking_discussions_resolved` | bool | `blocking_discussions_resolved` | 0/1 |
|
||||
| `time_stats.time_estimate` | i64 | `time_estimate` | Seconds |
|
||||
| `time_stats.total_time_spent` | i64 | `time_spent` | Seconds |
|
||||
| `time_stats.human_time_estimate` | Option\<String\> | `human_time_estimate` | |
|
||||
| `time_stats.human_total_time_spent` | Option\<String\> | `human_time_spent` | |
|
||||
| `task_completion_status.count` | i64 | `task_count` | |
|
||||
| `task_completion_status.completed_count` | i64 | `task_completed_count` | |
|
||||
| `closed_by` | Option\<object\> | `closed_by_username` | |
|
||||
| `prepared_at` | Option\<String\> | `prepared_at` | ISO datetime in API; store as ms epoch via `iso_to_ms()`, nullable |
|
||||
| `merge_after` | Option\<String\> | `merge_after` | ISO datetime in API; store as ms epoch via `iso_to_ms()`, nullable (scheduled merge) |
|
||||
| `imported` | bool | `imported` | 0/1 |
|
||||
| `imported_from` | Option\<String\> | `imported_from` | |
|
||||
| `approvals_before_merge` | Option\<i64\> | `approvals_before_merge` | Deprecated, scheduled for removal in GitLab API v5; store best-effort, keep nullable |
|
||||
| `references.relative` | String | `references_relative` | Currently only short + full stored |
|
||||
| `confidential` | bool | `confidential` | 0/1 (MRs can be confidential too) |
|
||||
| `iteration.id` | Option\<i64\> | `iteration_id` | Premium/Ultimate, null on Free |
|
||||
| `iteration.iid` | Option\<i64\> | `iteration_iid` | |
|
||||
| `iteration.title` | Option\<String\> | `iteration_title` | |
|
||||
| `iteration.state` | Option\<i64\> | `iteration_state` | |
|
||||
| `iteration.start_date` | Option\<String\> | `iteration_start_date` | ISO date |
|
||||
| `iteration.due_date` | Option\<String\> | `iteration_due_date` | ISO date |
|
||||
|
||||
---
|
||||
|
||||
## Migration 007: complete_field_capture.sql
|
||||
|
||||
```sql
|
||||
-- Migration 007: Capture all remaining GitLab API response fields.
|
||||
-- Principle: mirror everything GitLab returns. No field left behind.
|
||||
|
||||
-- ============================================================
|
||||
-- ISSUES: new columns
|
||||
-- ============================================================
|
||||
|
||||
-- Fields currently deserialized but not stored
|
||||
ALTER TABLE issues ADD COLUMN closed_at INTEGER; -- ms epoch, deserialized but never stored until now
|
||||
ALTER TABLE issues ADD COLUMN author_id INTEGER; -- GitLab user ID
|
||||
ALTER TABLE issues ADD COLUMN author_name TEXT; -- Display name
|
||||
|
||||
-- Issue metadata
|
||||
ALTER TABLE issues ADD COLUMN issue_type TEXT; -- 'issue' | 'incident' | 'test_case'
|
||||
ALTER TABLE issues ADD COLUMN confidential INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN discussion_locked INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Engagement
|
||||
ALTER TABLE issues ADD COLUMN upvotes INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN downvotes INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN user_notes_count INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN merge_requests_count INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Time tracking
|
||||
ALTER TABLE issues ADD COLUMN time_estimate INTEGER NOT NULL DEFAULT 0; -- seconds
|
||||
ALTER TABLE issues ADD COLUMN time_spent INTEGER NOT NULL DEFAULT 0; -- seconds
|
||||
ALTER TABLE issues ADD COLUMN human_time_estimate TEXT;
|
||||
ALTER TABLE issues ADD COLUMN human_time_spent TEXT;
|
||||
|
||||
-- Task lists
|
||||
ALTER TABLE issues ADD COLUMN task_count INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN task_completed_count INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN has_tasks INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- References (MRs already have short + full)
|
||||
ALTER TABLE issues ADD COLUMN references_short TEXT; -- e.g. "#42"
|
||||
ALTER TABLE issues ADD COLUMN references_relative TEXT; -- context-dependent
|
||||
ALTER TABLE issues ADD COLUMN references_full TEXT; -- e.g. "group/project#42"
|
||||
|
||||
-- Close/move tracking
|
||||
ALTER TABLE issues ADD COLUMN closed_by_username TEXT;
|
||||
|
||||
-- Premium/Ultimate fields (nullable, null on Free tier)
|
||||
ALTER TABLE issues ADD COLUMN weight INTEGER;
|
||||
ALTER TABLE issues ADD COLUMN severity TEXT;
|
||||
ALTER TABLE issues ADD COLUMN health_status TEXT;
|
||||
|
||||
-- Import tracking
|
||||
ALTER TABLE issues ADD COLUMN imported INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE issues ADD COLUMN imported_from TEXT;
|
||||
ALTER TABLE issues ADD COLUMN moved_to_id INTEGER;
|
||||
|
||||
-- Epic (Premium/Ultimate, null on Free)
|
||||
ALTER TABLE issues ADD COLUMN epic_id INTEGER;
|
||||
ALTER TABLE issues ADD COLUMN epic_iid INTEGER;
|
||||
ALTER TABLE issues ADD COLUMN epic_title TEXT;
|
||||
ALTER TABLE issues ADD COLUMN epic_url TEXT;
|
||||
ALTER TABLE issues ADD COLUMN epic_group_id INTEGER;
|
||||
|
||||
-- Iteration (Premium/Ultimate, null on Free)
|
||||
ALTER TABLE issues ADD COLUMN iteration_id INTEGER;
|
||||
ALTER TABLE issues ADD COLUMN iteration_iid INTEGER;
|
||||
ALTER TABLE issues ADD COLUMN iteration_title TEXT;
|
||||
ALTER TABLE issues ADD COLUMN iteration_state INTEGER;
|
||||
ALTER TABLE issues ADD COLUMN iteration_start_date TEXT;
|
||||
ALTER TABLE issues ADD COLUMN iteration_due_date TEXT;
|
||||
|
||||
-- ============================================================
|
||||
-- MERGE REQUESTS: new columns
|
||||
-- ============================================================
|
||||
|
||||
-- Author enrichment
|
||||
ALTER TABLE merge_requests ADD COLUMN author_id INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN author_name TEXT;
|
||||
|
||||
-- Engagement
|
||||
ALTER TABLE merge_requests ADD COLUMN upvotes INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN downvotes INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN user_notes_count INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Fork tracking
|
||||
ALTER TABLE merge_requests ADD COLUMN source_project_id INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN target_project_id INTEGER;
|
||||
|
||||
-- Milestone (parity with issues)
|
||||
ALTER TABLE merge_requests ADD COLUMN milestone_id INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN milestone_title TEXT;
|
||||
|
||||
-- Merge behavior
|
||||
ALTER TABLE merge_requests ADD COLUMN merge_when_pipeline_succeeds INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN merge_commit_sha TEXT;
|
||||
ALTER TABLE merge_requests ADD COLUMN squash_commit_sha TEXT;
|
||||
ALTER TABLE merge_requests ADD COLUMN squash INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN squash_on_merge INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Merge readiness
|
||||
ALTER TABLE merge_requests ADD COLUMN has_conflicts INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN blocking_discussions_resolved INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Branch cleanup
|
||||
ALTER TABLE merge_requests ADD COLUMN should_remove_source_branch INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN force_remove_source_branch INTEGER;
|
||||
|
||||
-- Discussion lock
|
||||
ALTER TABLE merge_requests ADD COLUMN discussion_locked INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Time tracking
|
||||
ALTER TABLE merge_requests ADD COLUMN time_estimate INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN time_spent INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN human_time_estimate TEXT;
|
||||
ALTER TABLE merge_requests ADD COLUMN human_time_spent TEXT;
|
||||
|
||||
-- Task lists
|
||||
ALTER TABLE merge_requests ADD COLUMN task_count INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN task_completed_count INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Close tracking
|
||||
ALTER TABLE merge_requests ADD COLUMN closed_by_username TEXT;
|
||||
|
||||
-- Scheduling (API returns ISO datetimes; we store ms epoch for consistency)
|
||||
ALTER TABLE merge_requests ADD COLUMN prepared_at INTEGER; -- ms epoch after iso_to_ms()
|
||||
ALTER TABLE merge_requests ADD COLUMN merge_after INTEGER; -- ms epoch after iso_to_ms()
|
||||
|
||||
-- References (add relative, short + full already exist)
|
||||
ALTER TABLE merge_requests ADD COLUMN references_relative TEXT;
|
||||
|
||||
-- Import tracking
|
||||
ALTER TABLE merge_requests ADD COLUMN imported INTEGER NOT NULL DEFAULT 0;
|
||||
ALTER TABLE merge_requests ADD COLUMN imported_from TEXT;
|
||||
|
||||
-- Premium/Ultimate
|
||||
ALTER TABLE merge_requests ADD COLUMN approvals_before_merge INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN confidential INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Iteration (Premium/Ultimate, null on Free)
|
||||
ALTER TABLE merge_requests ADD COLUMN iteration_id INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN iteration_iid INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN iteration_title TEXT;
|
||||
ALTER TABLE merge_requests ADD COLUMN iteration_state INTEGER;
|
||||
ALTER TABLE merge_requests ADD COLUMN iteration_start_date TEXT;
|
||||
ALTER TABLE merge_requests ADD COLUMN iteration_due_date TEXT;
|
||||
|
||||
-- Record migration version
-- NOTE(review): this branch also adds migrations/007_documents.sql, which
-- likewise records schema version 7 — renumber this planned migration (file
-- name and the version value below) before merge to avoid a collision.
INSERT INTO schema_version (version, applied_at, description)
VALUES (7, strftime('%s', 'now') * 1000, 'Complete API field capture for issues and merge requests');
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Serde Struct Changes
|
||||
|
||||
### Existing type changes
|
||||
|
||||
```
|
||||
GitLabReferences // Add: relative: Option<String> (with #[serde(default)])
|
||||
// Existing fields short + full remain unchanged
|
||||
GitLabIssue // Add #[derive(Default)] for test ergonomics
|
||||
GitLabMergeRequest // Add #[derive(Default)] for test ergonomics
|
||||
```
|
||||
|
||||
### New helper types needed
|
||||
|
||||
```
|
||||
GitLabTimeStats { time_estimate, total_time_spent, human_time_estimate, human_total_time_spent }
|
||||
GitLabTaskCompletionStatus { count, completed_count }
|
||||
GitLabClosedBy (reuse GitLabAuthor shape: id, username, name)
|
||||
GitLabEpic { id, iid, title, url, group_id }
|
||||
GitLabIteration { id, iid, title, state, start_date, due_date }
|
||||
```
|
||||
|
||||
### GitLabIssue: add fields
|
||||
|
||||
```
|
||||
type: Option<String> // #[serde(rename = "type")] -- fallback-only (uppercase category); "type" is reserved in Rust
|
||||
upvotes: i64 // #[serde(default)]
|
||||
downvotes: i64 // #[serde(default)]
|
||||
user_notes_count: i64 // #[serde(default)]
|
||||
merge_requests_count: i64 // #[serde(default)]
|
||||
confidential: bool // #[serde(default)]
|
||||
discussion_locked: bool // #[serde(default)]
|
||||
weight: Option<i64>
|
||||
time_stats: Option<GitLabTimeStats>
|
||||
task_completion_status: Option<GitLabTaskCompletionStatus>
|
||||
has_tasks: bool // #[serde(default)]
|
||||
references: Option<GitLabReferences>
|
||||
closed_by: Option<GitLabAuthor>
|
||||
severity: Option<String>
|
||||
health_status: Option<String>
|
||||
imported: bool // #[serde(default)]
|
||||
imported_from: Option<String>
|
||||
moved_to_id: Option<i64>
|
||||
issue_type: Option<String> // canonical field (lowercase); preferred for DB storage over `type`
|
||||
epic: Option<GitLabEpic>
|
||||
iteration: Option<GitLabIteration>
|
||||
```
|
||||
|
||||
### GitLabMergeRequest: add fields
|
||||
|
||||
```
|
||||
upvotes: i64 // #[serde(default)]
|
||||
downvotes: i64 // #[serde(default)]
|
||||
user_notes_count: i64 // #[serde(default)]
|
||||
source_project_id: Option<i64>
|
||||
target_project_id: Option<i64>
|
||||
milestone: Option<GitLabMilestone> // reuse existing type
|
||||
merge_when_pipeline_succeeds: bool // #[serde(default)]
|
||||
merge_commit_sha: Option<String>
|
||||
squash_commit_sha: Option<String>
|
||||
squash: bool // #[serde(default)]
|
||||
squash_on_merge: bool // #[serde(default)]
|
||||
has_conflicts: bool // #[serde(default)]
|
||||
blocking_discussions_resolved: bool // #[serde(default)]
|
||||
should_remove_source_branch: Option<bool>
|
||||
force_remove_source_branch: Option<bool>
|
||||
discussion_locked: bool // #[serde(default)]
|
||||
time_stats: Option<GitLabTimeStats>
|
||||
task_completion_status: Option<GitLabTaskCompletionStatus>
|
||||
closed_by: Option<GitLabAuthor>
|
||||
prepared_at: Option<String>
|
||||
merge_after: Option<String>
|
||||
imported: bool // #[serde(default)]
|
||||
imported_from: Option<String>
|
||||
approvals_before_merge: Option<i64>
|
||||
confidential: bool // #[serde(default)]
|
||||
iteration: Option<GitLabIteration>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Transformer Changes
|
||||
|
||||
### IssueRow: add fields
|
||||
|
||||
All new fields map 1:1 from the serde struct except:
|
||||
- `closed_at` -> `iso_to_ms()` conversion (already in serde struct, just not passed through)
|
||||
- `time_stats` -> flatten to 4 individual fields
|
||||
- `task_completion_status` -> flatten to 2 individual fields
|
||||
- `references` -> flatten to 3 individual fields
|
||||
- `closed_by` -> extract `username` only (consistent with author pattern)
|
||||
- `author` -> additionally extract `id` and `name` (currently only `username`)
|
||||
- `issue_type` -> store as-is (canonical, lowercase); fallback to lowercased `type` field if `issue_type` absent
|
||||
- `epic` -> flatten to 5 individual fields (id, iid, title, url, group_id)
|
||||
- `iteration` -> flatten to 6 individual fields (id, iid, title, state, start_date, due_date)
|
||||
|
||||
### NormalizedMergeRequest: add fields
|
||||
|
||||
Same patterns as issues, plus:
|
||||
- `milestone` -> reuse `upsert_milestone_tx` from issue pipeline, add `milestone_id` + `milestone_title`
|
||||
- `prepared_at`, `merge_after` -> `iso_to_ms()` conversion (API provides ISO datetimes)
|
||||
- `source_project_id`, `target_project_id` -> direct pass-through
|
||||
- `iteration` -> flatten to 6 individual fields (same as issues)
|
||||
|
||||
### Insert statement changes
|
||||
|
||||
Both `process_issue_in_transaction` and `process_mr_in_transaction` need their INSERT and ON CONFLICT DO UPDATE statements extended with all new columns. The ON CONFLICT clause should update all new fields on re-sync.
|
||||
|
||||
**Implementation note (reliability):** Define a single authoritative list of persisted columns per entity and generate/compose both SQL fragments from it:
|
||||
- INSERT column list + VALUES placeholders
|
||||
- ON CONFLICT DO UPDATE assignments
|
||||
|
||||
This prevents drift where a new field is added to one clause but not the other -- the most likely bug class with 40+ new columns.
|
||||
|
||||
---
|
||||
|
||||
## Prerequisite refactors (prep commits before main Phase A work)
|
||||
|
||||
### 1. Align issue transformer on `core::time`
|
||||
|
||||
The issue transformer (`transformers/issue.rs`) has a local `parse_timestamp()` that duplicates `iso_to_ms_strict()` from `core::time`. The MR transformer already uses the shared module. Before adding Phase A's optional timestamp fields (especially `closed_at` as `Option<String>`), migrate the issue transformer to use `iso_to_ms_strict()` and `iso_to_ms_opt_strict()` from `core::time`. This avoids duplicating the `opt` variant locally and establishes one timestamp parsing path across the codebase.
|
||||
|
||||
**Changes:** Replace `parse_timestamp()` calls with `iso_to_ms_strict()`, adapt or remove `TransformError::TimestampParse` (MR transformer uses `String` errors; align on that or on a shared error type).
|
||||
|
||||
### 2. Extract shared ingestion helpers
|
||||
|
||||
`upsert_milestone_tx` (in `ingestion/issues.rs`) and `upsert_label_tx` (duplicated in both `ingestion/issues.rs` and `ingestion/merge_requests.rs`) should be moved to a shared module (e.g., `src/ingestion/shared.rs`). MR ingestion needs `upsert_milestone_tx` for Phase A milestone support, and the label helper is already copy-pasted between files.
|
||||
|
||||
**Changes:** Create `src/ingestion/shared.rs`, move `upsert_milestone_tx`, `upsert_label_tx`, and `MilestoneRow` there. Update imports in both issue and MR ingestion modules.
|
||||
|
||||
---
|
||||
|
||||
## Files touched
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `migrations/007_complete_field_capture.sql` | New file |
|
||||
| `src/gitlab/types.rs` | Add `#[derive(Default)]` to `GitLabIssue` and `GitLabMergeRequest`; add `relative: Option<String>` to `GitLabReferences`; add fields to both structs; add `GitLabTimeStats`, `GitLabTaskCompletionStatus`, `GitLabEpic`, `GitLabIteration` |
|
||||
| `src/gitlab/transformers/issue.rs` | Remove local `parse_timestamp()`, switch to `core::time`; extend IssueRow, IssueWithMetadata, transform_issue() |
|
||||
| `src/gitlab/transformers/merge_request.rs` | Extend NormalizedMergeRequest, MergeRequestWithMetadata, transform_merge_request(); extract `references_relative` |
|
||||
| `src/ingestion/shared.rs` | New file: shared `upsert_milestone_tx`, `upsert_label_tx`, `MilestoneRow` |
|
||||
| `src/ingestion/issues.rs` | Extend INSERT/UPSERT SQL; import from shared module |
|
||||
| `src/ingestion/merge_requests.rs` | Extend INSERT/UPSERT SQL; import from shared module; add milestone upsert |
|
||||
| `src/core/db.rs` | Register migration 007 in `MIGRATIONS` array |
|
||||
|
||||
---
|
||||
|
||||
## What this does NOT include
|
||||
|
||||
- No new API endpoints called
|
||||
- No new tables (except reusing existing `milestones` for MRs)
|
||||
- No CLI changes (new fields are stored but not yet surfaced in `lore issues` / `lore mrs` output)
|
||||
- No changes to discussion/note ingestion (Phase A is issues + MRs only)
|
||||
- No observability instrumentation (that's Phase B)
|
||||
|
||||
---
|
||||
|
||||
## Rollout / Backfill Note
|
||||
|
||||
After applying Migration 007 and shipping transformer + UPSERT updates, **existing rows will not have the new columns populated** until issues/MRs are reprocessed. Plan on a **one-time full re-sync** (`lore ingest --type issues --full` and `lore ingest --type mrs --full`) to backfill the new fields. Until then, queries on new columns will return NULL/default values for previously-synced entities.
|
||||
|
||||
---
|
||||
|
||||
## Resolved decisions
|
||||
|
||||
| Field | Decision | Rationale |
|
||||
|-------|----------|-----------|
|
||||
| `subscribed` | **Excluded** | User-relative field (reflects token holder's subscription state, not an entity property). Changes meaning if the token is rotated to a different user. Not entity data. |
|
||||
| `_links` | **Excluded** | HATEOAS API navigation metadata, not entity data. Every URL is deterministically constructable from `project_id` + `iid` + GitLab base URL. Note: `closed_as_duplicate_of` inside `_links` contains a real entity reference -- extracting that is deferred to a future phase. |
|
||||
| `epic` / `iteration` | **Flatten to columns** | Same denormalization pattern as milestones. Epic gets 5 columns (`epic_id`, `epic_iid`, `epic_title`, `epic_url`, `epic_group_id`). Iteration gets 6 columns (`iteration_id`, `iteration_iid`, `iteration_title`, `iteration_state`, `iteration_start_date`, `iteration_due_date`). Both nullable (null on Free tier). |
|
||||
| `approvals_before_merge` | **Store best-effort** | Deprecated and scheduled for removal in GitLab API v5. Keep as `Option<i64>` / nullable column. Never depend on it for correctness -- it may disappear in a future GitLab release. |
|
||||
File diff suppressed because it is too large
Load Diff
84
migrations/007_documents.sql
Normal file
84
migrations/007_documents.sql
Normal file
@@ -0,0 +1,84 @@
|
||||
-- Migration 007: Documents, Document Labels, Document Paths, Dirty Sources, Pending Discussion Fetches
-- Schema version: 7
-- Adds CP3 document storage and queue tables for search pipeline

-- Unified searchable documents (derived from issues/MRs/discussions).
-- One row per source entity; content_text is the canonical text that the
-- FTS index and the embedding pipeline consume.
CREATE TABLE documents (
    id INTEGER PRIMARY KEY,
    source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
    source_id INTEGER NOT NULL,           -- local DB id in the source table
    project_id INTEGER NOT NULL REFERENCES projects(id),
    author_username TEXT,                 -- for discussions: first note author
    label_names TEXT,                     -- JSON array (display/debug only)
    created_at INTEGER,                   -- ms epoch UTC
    updated_at INTEGER,                   -- ms epoch UTC
    url TEXT,
    title TEXT,                           -- null for discussions
    content_text TEXT NOT NULL,           -- canonical text for embedding/search
    content_hash TEXT NOT NULL,           -- SHA-256 for change detection
    labels_hash TEXT NOT NULL DEFAULT '', -- SHA-256 over sorted labels (write optimization)
    paths_hash TEXT NOT NULL DEFAULT '',  -- SHA-256 over sorted paths (write optimization)
    is_truncated INTEGER NOT NULL DEFAULT 0,
    -- Why the content was truncated; NULL means not truncated.
    truncated_reason TEXT CHECK (
        truncated_reason IN (
            'token_limit_middle_drop','single_note_oversized','first_last_oversized',
            'hard_cap_oversized'
        )
        OR truncated_reason IS NULL
    ),
    UNIQUE(source_type, source_id)
);

CREATE INDEX idx_documents_project_updated ON documents(project_id, updated_at);
CREATE INDEX idx_documents_author ON documents(author_username);
-- NOTE(review): likely redundant — the UNIQUE(source_type, source_id)
-- constraint above already creates an implicit index with this exact column
-- order. Confirm and consider dropping.
CREATE INDEX idx_documents_source ON documents(source_type, source_id);
CREATE INDEX idx_documents_hash ON documents(content_hash);

-- Fast label filtering (indexed exact-match)
CREATE TABLE document_labels (
    document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    label_name TEXT NOT NULL,
    PRIMARY KEY(document_id, label_name)
) WITHOUT ROWID;
CREATE INDEX idx_document_labels_label ON document_labels(label_name);

-- Fast path filtering (DiffNote file paths)
CREATE TABLE document_paths (
    document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    path TEXT NOT NULL,
    PRIMARY KEY(document_id, path)
) WITHOUT ROWID;
CREATE INDEX idx_document_paths_path ON document_paths(path);

-- Queue for incremental document regeneration (with retry tracking)
-- Uses next_attempt_at for index-friendly backoff queries
CREATE TABLE dirty_sources (
    source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
    source_id INTEGER NOT NULL,
    queued_at INTEGER NOT NULL,           -- ms epoch UTC
    attempt_count INTEGER NOT NULL DEFAULT 0,
    last_attempt_at INTEGER,
    last_error TEXT,
    next_attempt_at INTEGER,              -- ms epoch UTC; NULL means ready immediately
    PRIMARY KEY(source_type, source_id)
);
CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);

-- Resumable queue for dependent discussion fetching
-- Uses next_attempt_at for index-friendly backoff queries
CREATE TABLE pending_discussion_fetches (
    project_id INTEGER NOT NULL REFERENCES projects(id),
    noteable_type TEXT NOT NULL,          -- 'Issue' | 'MergeRequest'
    noteable_iid INTEGER NOT NULL,
    queued_at INTEGER NOT NULL,           -- ms epoch UTC
    attempt_count INTEGER NOT NULL DEFAULT 0,
    last_attempt_at INTEGER,
    last_error TEXT,
    next_attempt_at INTEGER,              -- ms epoch UTC; NULL means ready immediately
    PRIMARY KEY(project_id, noteable_type, noteable_iid)
);
CREATE INDEX idx_pending_discussions_next_attempt ON pending_discussion_fetches(next_attempt_at);

-- Update schema version
INSERT INTO schema_version (version, applied_at, description)
VALUES (7, strftime('%s', 'now') * 1000, 'Documents, labels, paths, dirty sources, pending discussion fetches');
|
||||
42
migrations/008_fts5.sql
Normal file
42
migrations/008_fts5.sql
Normal file
@@ -0,0 +1,42 @@
|
||||
-- Migration 008: FTS5 Full-Text Search Index
-- Schema version: 8
-- Adds full-text search on documents table with sync triggers

-- Full-text search with porter stemmer and prefix indexes for type-ahead.
-- External-content table: the row text lives in `documents`; the FTS table
-- stores only the inverted index, so the triggers below must keep it in sync.
CREATE VIRTUAL TABLE documents_fts USING fts5(
    title,
    content_text,
    content='documents',
    content_rowid='id',
    tokenize='porter unicode61',
    prefix='2 3 4'
);

-- Keep FTS in sync via triggers.
-- IMPORTANT: COALESCE(title, '') ensures FTS5 external-content table never
-- receives NULL values, which can cause inconsistencies with delete operations.
-- FTS5 delete requires exact match of original values; NULL != NULL in SQL,
-- so a NULL title on insert would make the delete trigger fail silently.
CREATE TRIGGER documents_ai AFTER INSERT ON documents BEGIN
    INSERT INTO documents_fts(rowid, title, content_text)
    VALUES (new.id, COALESCE(new.title, ''), new.content_text);
END;

-- The ('delete', ...) insert form is FTS5's external-content delete command;
-- it must pass the same values the insert trigger supplied (hence COALESCE).
CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
    INSERT INTO documents_fts(documents_fts, rowid, title, content_text)
    VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text);
END;

-- Only rebuild FTS when searchable text actually changes (not metadata-only updates).
-- `IS NOT` handles NULL titles correctly; content_text is NOT NULL, so != is safe there.
CREATE TRIGGER documents_au AFTER UPDATE ON documents
WHEN old.title IS NOT new.title OR old.content_text != new.content_text
BEGIN
    INSERT INTO documents_fts(documents_fts, rowid, title, content_text)
    VALUES('delete', old.id, COALESCE(old.title, ''), old.content_text);
    INSERT INTO documents_fts(rowid, title, content_text)
    VALUES (new.id, COALESCE(new.title, ''), new.content_text);
END;

-- Update schema version
INSERT INTO schema_version (version, applied_at, description)
VALUES (8, strftime('%s', 'now') * 1000, 'FTS5 full-text search index with sync triggers');
|
||||
54
migrations/009_embeddings.sql
Normal file
54
migrations/009_embeddings.sql
Normal file
@@ -0,0 +1,54 @@
|
||||
-- Migration 009: Embeddings (Gate B)
-- Schema version: 9
-- Adds sqlite-vec vector storage and embedding metadata for semantic search
-- Requires sqlite-vec extension to be loaded before applying

-- NOTE: sqlite-vec vec0 virtual tables cannot participate in FK cascades.
-- We must use an explicit trigger to delete orphan embeddings when documents
-- are deleted. See documents_embeddings_ad trigger below.

-- sqlite-vec virtual table for vector search
-- Storage rule: embeddings.rowid = document_id * 1000 + chunk_index
-- This encodes (document_id, chunk_index) into a single integer rowid.
-- Supports up to 1000 chunks per document (32M chars at 32k/chunk).
CREATE VIRTUAL TABLE embeddings USING vec0(
    embedding float[768]
);

-- Embedding provenance + change detection (one row per chunk)
-- NOTE: Two hash columns serve different purposes:
--   document_hash: SHA-256 of full documents.content_text (staleness detection)
--   chunk_hash: SHA-256 of this individual chunk's text (debug/provenance)
-- Pending detection uses document_hash (not chunk_hash) because staleness is
-- a document-level condition: if the document changed, ALL chunks need re-embedding.
CREATE TABLE embedding_metadata (
    document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL DEFAULT 0, -- 0-indexed position within document
    model TEXT NOT NULL,                    -- 'nomic-embed-text'
    dims INTEGER NOT NULL,                  -- 768
    document_hash TEXT NOT NULL,            -- SHA-256 of full documents.content_text (staleness)
    chunk_hash TEXT NOT NULL,               -- SHA-256 of this chunk's text (provenance)
    created_at INTEGER NOT NULL,            -- ms epoch UTC
    last_error TEXT,                        -- error message from last failed attempt
    attempt_count INTEGER NOT NULL DEFAULT 0,
    last_attempt_at INTEGER,                -- ms epoch UTC
    PRIMARY KEY(document_id, chunk_index)
);

-- Partial index: only rows with a recorded error (cheap failed-row scans).
CREATE INDEX idx_embedding_metadata_errors
    ON embedding_metadata(last_error) WHERE last_error IS NOT NULL;
-- NOTE(review): likely redundant — the composite PRIMARY KEY(document_id,
-- chunk_index) already serves document_id-prefix lookups. Confirm and drop.
CREATE INDEX idx_embedding_metadata_doc ON embedding_metadata(document_id);

-- CRITICAL: Delete ALL chunk embeddings when a document is deleted.
-- vec0 virtual tables don't support FK ON DELETE CASCADE, so we need this trigger.
-- embedding_metadata has ON DELETE CASCADE, so only vec0 needs explicit cleanup.
-- Range: [document_id * 1000, document_id * 1000 + 999]
CREATE TRIGGER documents_embeddings_ad AFTER DELETE ON documents BEGIN
    DELETE FROM embeddings
    WHERE rowid >= old.id * 1000
      AND rowid < (old.id + 1) * 1000;
END;

-- Update schema version
INSERT INTO schema_version (version, applied_at, description)
VALUES (9, strftime('%s', 'now') * 1000, 'Embeddings vec0 table, metadata, orphan cleanup trigger');
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Auth test command - verify GitLab authentication.
|
||||
|
||||
use crate::core::config::Config;
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::gitlab::GitLabClient;
|
||||
|
||||
/// Result of successful auth test.
|
||||
@@ -19,12 +19,12 @@ pub async fn run_auth_test(config_path: Option<&str>) -> Result<AuthTestResult>
|
||||
// 2. Get token from environment
|
||||
let token = std::env::var(&config.gitlab.token_env_var)
|
||||
.map(|t| t.trim().to_string())
|
||||
.map_err(|_| GiError::TokenNotSet {
|
||||
.map_err(|_| LoreError::TokenNotSet {
|
||||
env_var: config.gitlab.token_env_var.clone(),
|
||||
})?;
|
||||
|
||||
if token.is_empty() {
|
||||
return Err(GiError::TokenNotSet {
|
||||
return Err(LoreError::TokenNotSet {
|
||||
env_var: config.gitlab.token_env_var.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use serde::Serialize;
|
||||
|
||||
use crate::core::config::Config;
|
||||
use crate::core::db::{create_connection, get_schema_version, verify_pragmas};
|
||||
use crate::core::error::GiError;
|
||||
use crate::core::error::LoreError;
|
||||
use crate::core::paths::{get_config_path, get_db_path};
|
||||
use crate::gitlab::GitLabClient;
|
||||
|
||||
@@ -137,7 +137,7 @@ fn check_config(config_path: &str) -> (ConfigCheck, Option<Config>) {
|
||||
},
|
||||
Some(config),
|
||||
),
|
||||
Err(GiError::ConfigNotFound { path }) => (
|
||||
Err(LoreError::ConfigNotFound { path }) => (
|
||||
ConfigCheck {
|
||||
result: CheckResult {
|
||||
status: CheckStatus::Error,
|
||||
@@ -264,7 +264,7 @@ async fn check_gitlab(config: Option<&Config>) -> GitLabCheck {
|
||||
url: Some(config.gitlab.base_url.clone()),
|
||||
username: Some(user.username),
|
||||
},
|
||||
Err(GiError::GitLabAuthFailed) => GitLabCheck {
|
||||
Err(LoreError::GitLabAuthFailed) => GitLabCheck {
|
||||
result: CheckResult {
|
||||
status: CheckStatus::Error,
|
||||
message: Some("Authentication failed. Check your token.".to_string()),
|
||||
|
||||
88
src/cli/commands/embed.rs
Normal file
88
src/cli/commands/embed.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
//! Embed command: generate vector embeddings for documents via Ollama.
|
||||
|
||||
use console::style;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::embedding::ollama::{OllamaClient, OllamaConfig};
|
||||
use crate::embedding::pipeline::embed_documents;
|
||||
use crate::Config;
|
||||
|
||||
/// Result of the embed command.
///
/// Counts are copied verbatim from the embedding pipeline's result
/// (presumably one unit per document — confirm in `embedding::pipeline`).
#[derive(Debug, Default, Serialize)]
pub struct EmbedCommandResult {
    // Successfully embedded this run.
    pub embedded: usize,
    // Failed attempts (recorded in embedding_metadata.last_error by the pipeline).
    pub failed: usize,
    // Skipped items (e.g. already up to date) — see pipeline for exact criteria.
    pub skipped: usize,
}
|
||||
|
||||
/// Run the embed command.
///
/// Opens the local database, verifies the configured Ollama endpoint is
/// reachable (failing fast if it is down or the model is missing), optionally
/// resets previously failed embedding attempts, then runs the embedding
/// pipeline over pending documents.
///
/// # Arguments
/// * `config` - Loaded user configuration (DB path + embedding settings).
/// * `retry_failed` - When true, clears `last_error` / `attempt_count` on
///   failed rows in `embedding_metadata` so the pipeline retries them.
///
/// # Errors
/// Returns an error if the DB cannot be opened, the Ollama health check
/// fails, the retry-reset UPDATE fails, or the pipeline itself errors.
pub async fn run_embed(
    config: &Config,
    retry_failed: bool,
) -> Result<EmbedCommandResult> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

    // Build Ollama config from user settings; remaining fields keep defaults.
    let ollama_config = OllamaConfig {
        base_url: config.embedding.base_url.clone(),
        model: config.embedding.model.clone(),
        ..OllamaConfig::default()
    };
    let client = OllamaClient::new(ollama_config);

    // Health check — fail fast if Ollama is down or model missing
    client.health_check().await?;

    // If retry_failed, clear errors so they become pending again
    if retry_failed {
        conn.execute(
            "UPDATE embedding_metadata SET last_error = NULL, attempt_count = 0
             WHERE last_error IS NOT NULL",
            [],
        )?;
    }

    let model_name = &config.embedding.model;
    // NOTE(review): the final `None` argument appears to be an optional limit
    // (embed everything pending) — confirm against embed_documents' signature.
    let result = embed_documents(&conn, &client, model_name, None).await?;

    Ok(EmbedCommandResult {
        embedded: result.embedded,
        failed: result.failed,
        skipped: result.skipped,
    })
}
|
||||
|
||||
/// Print human-readable output.
|
||||
pub fn print_embed(result: &EmbedCommandResult) {
|
||||
println!(
|
||||
"{} Embedding complete",
|
||||
style("done").green().bold(),
|
||||
);
|
||||
println!(" Embedded: {}", result.embedded);
|
||||
if result.failed > 0 {
|
||||
println!(" Failed: {}", style(result.failed).red());
|
||||
}
|
||||
if result.skipped > 0 {
|
||||
println!(" Skipped: {}", result.skipped);
|
||||
}
|
||||
}
|
||||
|
||||
/// JSON output.
///
/// Robot-mode envelope serialized as `{"ok": ..., "data": {...}}`.
/// Borrows the result to avoid cloning it just for serialization.
#[derive(Serialize)]
struct EmbedJsonOutput<'a> {
    // Set to true by print_embed_json, the only constructor in view.
    ok: bool,
    data: &'a EmbedCommandResult,
}
|
||||
|
||||
/// Print JSON robot-mode output.
|
||||
pub fn print_embed_json(result: &EmbedCommandResult) {
|
||||
let output = EmbedJsonOutput {
|
||||
ok: true,
|
||||
data: result,
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
205
src/cli/commands/generate_docs.rs
Normal file
205
src/cli/commands/generate_docs.rs
Normal file
@@ -0,0 +1,205 @@
|
||||
//! Generate searchable documents from ingested GitLab data.
|
||||
|
||||
use console::style;
|
||||
use rusqlite::Connection;
|
||||
use serde::Serialize;
|
||||
use tracing::info;
|
||||
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::documents::{regenerate_dirty_documents, SourceType};
|
||||
use crate::Config;
|
||||
|
||||
// Batch size for seeding dirty_sources in full mode (keyset pagination chunk).
const FULL_MODE_CHUNK_SIZE: i64 = 2000;

/// Result of a generate-docs run.
#[derive(Debug, Default)]
pub struct GenerateDocsResult {
    // Documents rebuilt by regenerate_dirty_documents.
    pub regenerated: usize,
    // Dirty entries whose document content was unchanged.
    pub unchanged: usize,
    // Entries that errored during regeneration.
    pub errored: usize,
    // Rows queued into dirty_sources (non-zero only in full mode).
    pub seeded: usize,
    // Whether this run seeded ALL entities before draining the queue.
    pub full_mode: bool,
}
|
||||
|
||||
/// Run the generate-docs pipeline.
|
||||
///
|
||||
/// Default mode: process only existing dirty_sources entries.
|
||||
/// Full mode: seed dirty_sources with ALL entities, then drain.
|
||||
pub fn run_generate_docs(
|
||||
config: &Config,
|
||||
full: bool,
|
||||
project_filter: Option<&str>,
|
||||
) -> Result<GenerateDocsResult> {
|
||||
let db_path = get_db_path(config.storage.db_path.as_deref());
|
||||
let conn = create_connection(&db_path)?;
|
||||
let mut result = GenerateDocsResult {
|
||||
full_mode: full,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
if full {
|
||||
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
|
||||
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
|
||||
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
|
||||
}
|
||||
|
||||
let regen = regenerate_dirty_documents(&conn)?;
|
||||
result.regenerated = regen.regenerated;
|
||||
result.unchanged = regen.unchanged;
|
||||
result.errored = regen.errored;
|
||||
|
||||
if full {
|
||||
// Optimize FTS index after bulk rebuild
|
||||
let _ = conn.execute(
|
||||
"INSERT INTO documents_fts(documents_fts) VALUES('optimize')",
|
||||
[],
|
||||
);
|
||||
info!("FTS index optimized after full rebuild");
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Seed dirty_sources with all entities of the given type using keyset pagination.
|
||||
fn seed_dirty(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
project_filter: Option<&str>,
|
||||
) -> Result<usize> {
|
||||
let table = match source_type {
|
||||
SourceType::Issue => "issues",
|
||||
SourceType::MergeRequest => "merge_requests",
|
||||
SourceType::Discussion => "discussions",
|
||||
};
|
||||
let type_str = source_type.as_str();
|
||||
let now = chrono::Utc::now().timestamp_millis();
|
||||
|
||||
let mut total_seeded: usize = 0;
|
||||
let mut last_id: i64 = 0;
|
||||
|
||||
loop {
|
||||
let inserted = if let Some(project) = project_filter {
|
||||
// Resolve project to ID for filtering
|
||||
let project_id: Option<i64> = conn
|
||||
.query_row(
|
||||
"SELECT id FROM projects WHERE path_with_namespace = ?1 COLLATE NOCASE",
|
||||
[project],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.ok();
|
||||
|
||||
let Some(pid) = project_id else {
|
||||
break;
|
||||
};
|
||||
|
||||
conn.execute(
|
||||
&format!(
|
||||
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
|
||||
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
|
||||
FROM {table} WHERE id > ?3 AND project_id = ?4 ORDER BY id LIMIT ?5
|
||||
ON CONFLICT(source_type, source_id) DO NOTHING"
|
||||
),
|
||||
rusqlite::params![type_str, now, last_id, pid, FULL_MODE_CHUNK_SIZE],
|
||||
)?
|
||||
} else {
|
||||
conn.execute(
|
||||
&format!(
|
||||
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
|
||||
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
|
||||
FROM {table} WHERE id > ?3 ORDER BY id LIMIT ?4
|
||||
ON CONFLICT(source_type, source_id) DO NOTHING"
|
||||
),
|
||||
rusqlite::params![type_str, now, last_id, FULL_MODE_CHUNK_SIZE],
|
||||
)?
|
||||
};
|
||||
|
||||
if inserted == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
// Advance keyset cursor to the max id within the chunk window
|
||||
let max_id: i64 = conn.query_row(
|
||||
&format!(
|
||||
"SELECT MAX(id) FROM (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)",
|
||||
table = table
|
||||
),
|
||||
rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
total_seeded += inserted;
|
||||
last_id = max_id;
|
||||
}
|
||||
|
||||
info!(
|
||||
source_type = type_str,
|
||||
seeded = total_seeded,
|
||||
"Seeded dirty_sources"
|
||||
);
|
||||
|
||||
Ok(total_seeded)
|
||||
}
|
||||
|
||||
/// Print human-readable output.
|
||||
pub fn print_generate_docs(result: &GenerateDocsResult) {
|
||||
let mode = if result.full_mode { "full" } else { "incremental" };
|
||||
println!(
|
||||
"{} Document generation complete ({})",
|
||||
style("done").green().bold(),
|
||||
mode
|
||||
);
|
||||
|
||||
if result.full_mode {
|
||||
println!(" Seeded: {}", result.seeded);
|
||||
}
|
||||
println!(" Regenerated: {}", result.regenerated);
|
||||
println!(" Unchanged: {}", result.unchanged);
|
||||
if result.errored > 0 {
|
||||
println!(
|
||||
" Errored: {}",
|
||||
style(result.errored).red()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// JSON output structures.
#[derive(Serialize)]
struct GenerateDocsJsonOutput {
    // Set to true by print_generate_docs_json (errors surface before printing).
    ok: bool,
    data: GenerateDocsJsonData,
}

#[derive(Serialize)]
struct GenerateDocsJsonData {
    // "full" or "incremental" — mirrors GenerateDocsResult::full_mode.
    mode: String,
    // Seed count; serialized only in full mode, omitted otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    seeded: Option<usize>,
    regenerated: usize,
    unchanged: usize,
    errored: usize,
}
|
||||
|
||||
/// Print JSON robot-mode output.
|
||||
pub fn print_generate_docs_json(result: &GenerateDocsResult) {
|
||||
let output = GenerateDocsJsonOutput {
|
||||
ok: true,
|
||||
data: GenerateDocsJsonData {
|
||||
mode: if result.full_mode {
|
||||
"full".to_string()
|
||||
} else {
|
||||
"incremental".to_string()
|
||||
},
|
||||
seeded: if result.full_mode {
|
||||
Some(result.seeded)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
regenerated: result.regenerated,
|
||||
unchanged: result.unchanged,
|
||||
errored: result.errored,
|
||||
},
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
@@ -7,7 +7,7 @@ use serde::Serialize;
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::lock::{AppLock, LockOptions};
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::gitlab::GitLabClient;
|
||||
@@ -51,7 +51,7 @@ pub async fn run_ingest(
|
||||
) -> Result<IngestResult> {
|
||||
// Validate resource type early
|
||||
if resource_type != "issues" && resource_type != "mrs" {
|
||||
return Err(GiError::Other(format!(
|
||||
return Err(LoreError::Other(format!(
|
||||
"Invalid resource type '{}'. Valid types: issues, mrs",
|
||||
resource_type
|
||||
)));
|
||||
@@ -74,7 +74,7 @@ pub async fn run_ingest(
|
||||
lock.acquire(force)?;
|
||||
|
||||
// Get token from environment
|
||||
let token = std::env::var(&config.gitlab.token_env_var).map_err(|_| GiError::TokenNotSet {
|
||||
let token = std::env::var(&config.gitlab.token_env_var).map_err(|_| LoreError::TokenNotSet {
|
||||
env_var: config.gitlab.token_env_var.clone(),
|
||||
})?;
|
||||
|
||||
@@ -119,12 +119,12 @@ pub async fn run_ingest(
|
||||
|
||||
if projects.is_empty() {
|
||||
if let Some(filter) = project_filter {
|
||||
return Err(GiError::Other(format!(
|
||||
return Err(LoreError::Other(format!(
|
||||
"Project '{}' not found in configuration",
|
||||
filter
|
||||
)));
|
||||
}
|
||||
return Err(GiError::Other(
|
||||
return Err(LoreError::Other(
|
||||
"No projects configured. Run 'lore init' first.".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::fs;
|
||||
|
||||
use crate::core::config::{MinimalConfig, MinimalGitLabConfig, ProjectConfig};
|
||||
use crate::core::db::{create_connection, run_migrations};
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::paths::{get_config_path, get_data_dir};
|
||||
use crate::gitlab::{GitLabClient, GitLabProject};
|
||||
|
||||
@@ -45,32 +45,30 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result<InitRe
|
||||
let config_path = get_config_path(options.config_path.as_deref());
|
||||
let data_dir = get_data_dir();
|
||||
|
||||
// 1. Check if config exists
|
||||
if config_path.exists() {
|
||||
// 1. Check if config exists (force takes precedence over non_interactive)
|
||||
if config_path.exists() && !options.force {
|
||||
if options.non_interactive {
|
||||
return Err(GiError::Other(format!(
|
||||
"Config file exists at {}. Cannot proceed in non-interactive mode.",
|
||||
return Err(LoreError::Other(format!(
|
||||
"Config file exists at {}. Use --force to overwrite.",
|
||||
config_path.display()
|
||||
)));
|
||||
}
|
||||
|
||||
if !options.force {
|
||||
return Err(GiError::Other(
|
||||
"User cancelled config overwrite.".to_string(),
|
||||
));
|
||||
}
|
||||
return Err(LoreError::Other(
|
||||
"User cancelled config overwrite.".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// 2. Validate GitLab URL format
|
||||
if url::Url::parse(&inputs.gitlab_url).is_err() {
|
||||
return Err(GiError::Other(format!(
|
||||
return Err(LoreError::Other(format!(
|
||||
"Invalid GitLab URL: {}",
|
||||
inputs.gitlab_url
|
||||
)));
|
||||
}
|
||||
|
||||
// 3. Check token is set in environment
|
||||
let token = std::env::var(&inputs.token_env_var).map_err(|_| GiError::TokenNotSet {
|
||||
let token = std::env::var(&inputs.token_env_var).map_err(|_| LoreError::TokenNotSet {
|
||||
env_var: inputs.token_env_var.clone(),
|
||||
})?;
|
||||
|
||||
@@ -78,8 +76,8 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result<InitRe
|
||||
let client = GitLabClient::new(&inputs.gitlab_url, &token, None);
|
||||
|
||||
let gitlab_user = client.get_current_user().await.map_err(|e| {
|
||||
if matches!(e, GiError::GitLabAuthFailed) {
|
||||
GiError::Other(format!("Authentication failed for {}", inputs.gitlab_url))
|
||||
if matches!(e, LoreError::GitLabAuthFailed) {
|
||||
LoreError::Other(format!("Authentication failed for {}", inputs.gitlab_url))
|
||||
} else {
|
||||
e
|
||||
}
|
||||
@@ -95,8 +93,8 @@ pub async fn run_init(inputs: InitInputs, options: InitOptions) -> Result<InitRe
|
||||
|
||||
for project_path in &inputs.project_paths {
|
||||
let project = client.get_project(project_path).await.map_err(|e| {
|
||||
if matches!(e, GiError::GitLabNotFound { .. }) {
|
||||
GiError::Other(format!("Project not found: {project_path}"))
|
||||
if matches!(e, LoreError::GitLabNotFound { .. }) {
|
||||
LoreError::Other(format!("Project not found: {project_path}"))
|
||||
} else {
|
||||
e
|
||||
}
|
||||
|
||||
@@ -3,21 +3,33 @@
|
||||
pub mod auth_test;
|
||||
pub mod count;
|
||||
pub mod doctor;
|
||||
pub mod embed;
|
||||
pub mod generate_docs;
|
||||
pub mod ingest;
|
||||
pub mod init;
|
||||
pub mod list;
|
||||
pub mod search;
|
||||
pub mod show;
|
||||
pub mod stats;
|
||||
pub mod sync;
|
||||
pub mod sync_status;
|
||||
|
||||
pub use auth_test::run_auth_test;
|
||||
pub use count::{print_count, print_count_json, run_count};
|
||||
pub use doctor::{print_doctor_results, run_doctor};
|
||||
pub use embed::{print_embed, print_embed_json, run_embed};
|
||||
pub use generate_docs::{print_generate_docs, print_generate_docs_json, run_generate_docs};
|
||||
pub use stats::{print_stats, print_stats_json, run_stats};
|
||||
pub use search::{
|
||||
print_search_results, print_search_results_json, run_search, SearchCliFilters, SearchResponse,
|
||||
};
|
||||
pub use ingest::{print_ingest_summary, print_ingest_summary_json, run_ingest};
|
||||
pub use init::{InitInputs, InitOptions, InitResult, run_init};
|
||||
pub use list::{
|
||||
ListFilters, MrListFilters, open_issue_in_browser, open_mr_in_browser, print_list_issues,
|
||||
print_list_issues_json, print_list_mrs, print_list_mrs_json, run_list_issues, run_list_mrs,
|
||||
};
|
||||
pub use sync::{print_sync, print_sync_json, run_sync, SyncOptions, SyncResult};
|
||||
pub use show::{
|
||||
print_show_issue, print_show_issue_json, print_show_mr, print_show_mr_json, run_show_issue,
|
||||
run_show_mr,
|
||||
|
||||
402
src/cli/commands/search.rs
Normal file
402
src/cli/commands/search.rs
Normal file
@@ -0,0 +1,402 @@
|
||||
//! Search command: lexical (FTS5) search with filter support and single-query hydration.
|
||||
|
||||
use console::style;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::core::project::resolve_project;
|
||||
use crate::core::time::{ms_to_iso, parse_since};
|
||||
use crate::documents::SourceType;
|
||||
use crate::search::{
|
||||
apply_filters, get_result_snippet, rank_rrf, search_fts, FtsQueryMode, PathFilter,
|
||||
SearchFilters,
|
||||
};
|
||||
use crate::Config;
|
||||
|
||||
/// Display-ready search result with all fields hydrated.
#[derive(Debug, Serialize)]
pub struct SearchResultDisplay {
    pub document_id: i64,
    // "issue", "merge_request", or "discussion" (see print_search_results).
    pub source_type: String,
    pub title: String,
    pub url: Option<String>,
    // Author username, rendered as "@name" in the human output.
    pub author: Option<String>,
    // Timestamps converted from epoch-ms to ISO strings via ms_to_iso.
    pub created_at: Option<String>,
    pub updated_at: Option<String>,
    pub project_path: String,
    pub labels: Vec<String>,
    pub paths: Vec<String>,
    // FTS snippet; may contain <mark> tags (stripped for terminal display).
    pub snippet: String,
    // Normalized RRF score; 0.0 when the document had no RRF entry.
    pub score: f64,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub explain: Option<ExplainData>,
}

/// Ranking explanation for --explain output.
#[derive(Debug, Serialize)]
pub struct ExplainData {
    // Rank in the vector result list (always absent in lexical-only mode).
    pub vector_rank: Option<usize>,
    // Rank in the FTS result list.
    pub fts_rank: Option<usize>,
    pub rrf_score: f64,
}

/// Search response wrapper.
#[derive(Debug, Serialize)]
pub struct SearchResponse {
    pub query: String,
    // Currently always "lexical" (set by run_search).
    pub mode: String,
    pub total_results: usize,
    pub results: Vec<SearchResultDisplay>,
    // Non-fatal notices (e.g. empty index) surfaced to the user.
    pub warnings: Vec<String>,
}

/// Build SearchFilters from CLI args.
pub struct SearchCliFilters {
    pub source_type: Option<String>,
    pub author: Option<String>,
    pub project: Option<String>,
    pub labels: Vec<String>,
    // Trailing '/' selects prefix matching, otherwise exact (see run_search).
    pub path: Option<String>,
    // Human-entered time expressions, parsed with parse_since.
    pub after: Option<String>,
    pub updated_after: Option<String>,
    pub limit: usize,
}
|
||||
|
||||
/// Run a lexical search query.
///
/// Pipeline: guard against an empty index → build `SearchFilters` from CLI
/// args → FTS5 search with an adaptively widened fetch size → RRF ranking
/// (FTS list only in lexical mode) → post-retrieval filtering → single-query
/// hydration → assemble display rows.
///
/// # Errors
/// Returns an error if the database cannot be opened, project resolution
/// fails, or any of the FTS/filter/hydration queries fail.
pub fn run_search(
    config: &Config,
    query: &str,
    cli_filters: SearchCliFilters,
    fts_mode: FtsQueryMode,
    explain: bool,
) -> Result<SearchResponse> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

    // Check if any documents exist
    let doc_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM documents", [], |row| row.get(0))
        .unwrap_or(0);

    if doc_count == 0 {
        // Empty index is not an error: return a warning the caller can show.
        return Ok(SearchResponse {
            query: query.to_string(),
            mode: "lexical".to_string(),
            total_results: 0,
            results: vec![],
            warnings: vec![
                "No documents indexed. Run 'lore generate-docs' first.".to_string()
            ],
        });
    }

    // Build filters
    let source_type = cli_filters
        .source_type
        .as_deref()
        .and_then(SourceType::parse);

    // Resolve project to ID; a bad project name aborts with an error here.
    let project_id = cli_filters
        .project
        .as_deref()
        .map(|p| resolve_project(&conn, p))
        .transpose()?;

    let after = cli_filters.after.as_deref().and_then(parse_since);
    let updated_after = cli_filters.updated_after.as_deref().and_then(parse_since);

    // Trailing '/' means directory-prefix match; otherwise exact path.
    let path = cli_filters.path.as_deref().map(|p| {
        if p.ends_with('/') {
            PathFilter::Prefix(p.to_string())
        } else {
            PathFilter::Exact(p.to_string())
        }
    });

    let filters = SearchFilters {
        source_type,
        author: cli_filters.author,
        project_id,
        after,
        updated_after,
        labels: cli_filters.labels,
        path,
        limit: cli_filters.limit,
    };

    // Adaptive recall: wider initial fetch when filters applied, since
    // post-retrieval filtering will discard many candidates.
    let requested = filters.clamp_limit();
    let top_k = if filters.has_any_filter() {
        (requested * 50).max(200).min(1500)
    } else {
        (requested * 10).max(50).min(1500)
    };

    // FTS search
    let fts_results = search_fts(&conn, query, top_k, fts_mode)?;
    let fts_tuples: Vec<(i64, f64)> = fts_results
        .iter()
        .map(|r| (r.document_id, r.bm25_score))
        .collect();

    // Build snippet map before ranking
    let snippet_map: std::collections::HashMap<i64, String> = fts_results
        .iter()
        .map(|r| (r.document_id, r.snippet.clone()))
        .collect();

    // RRF ranking (single-list for lexical mode; the vector list is empty)
    let ranked = rank_rrf(&[], &fts_tuples);
    let ranked_ids: Vec<i64> = ranked.iter().map(|r| r.document_id).collect();

    // Apply post-retrieval filters
    let filtered_ids = apply_filters(&conn, &ranked_ids, &filters)?;

    if filtered_ids.is_empty() {
        return Ok(SearchResponse {
            query: query.to_string(),
            mode: "lexical".to_string(),
            total_results: 0,
            results: vec![],
            warnings: vec![],
        });
    }

    // Hydrate results in single round-trip
    let hydrated = hydrate_results(&conn, &filtered_ids)?;

    // Build display results preserving filter order
    let rrf_map: std::collections::HashMap<i64, &crate::search::RrfResult> = ranked
        .iter()
        .map(|r| (r.document_id, r))
        .collect();

    let mut results: Vec<SearchResultDisplay> = Vec::with_capacity(hydrated.len());
    for row in &hydrated {
        let rrf = rrf_map.get(&row.document_id);
        // Prefer the FTS-provided snippet; fall back to content_text inside
        // get_result_snippet.
        let fts_snippet = snippet_map.get(&row.document_id).map(|s| s.as_str());
        let snippet = get_result_snippet(fts_snippet, &row.content_text);

        let explain_data = if explain {
            rrf.map(|r| ExplainData {
                vector_rank: r.vector_rank,
                fts_rank: r.fts_rank,
                rrf_score: r.rrf_score,
            })
        } else {
            None
        };

        results.push(SearchResultDisplay {
            document_id: row.document_id,
            source_type: row.source_type.clone(),
            title: row.title.clone(),
            url: row.url.clone(),
            author: row.author.clone(),
            created_at: row.created_at.map(ms_to_iso),
            updated_at: row.updated_at.map(ms_to_iso),
            project_path: row.project_path.clone(),
            labels: row.labels.clone(),
            paths: row.paths.clone(),
            snippet,
            // Documents surviving the filter but missing from the RRF map
            // score 0.0.
            score: rrf.map(|r| r.normalized_score).unwrap_or(0.0),
            explain: explain_data,
        });
    }

    Ok(SearchResponse {
        query: query.to_string(),
        mode: "lexical".to_string(),
        total_results: results.len(),
        results,
        warnings: vec![],
    })
}
|
||||
|
||||
/// Raw row from hydration query.
struct HydratedRow {
    document_id: i64,
    source_type: String,
    title: String,
    url: Option<String>,
    author: Option<String>,
    // Epoch-ms timestamps as stored in the documents table.
    created_at: Option<i64>,
    updated_at: Option<i64>,
    content_text: String,
    project_path: String,
    labels: Vec<String>,
    paths: Vec<String>,
}

/// Hydrate document IDs into full display rows in a single query.
///
/// Uses json_each() to pass ranked IDs and preserve ordering via ORDER BY j.key.
/// Labels and paths fetched via correlated json_group_array subqueries.
///
/// # Errors
/// Fails if the ID list cannot be serialized to JSON or the query fails.
/// Column indices in the mapper (0..=10) must stay in sync with the SELECT
/// list below.
fn hydrate_results(
    conn: &rusqlite::Connection,
    document_ids: &[i64],
) -> Result<Vec<HydratedRow>> {
    if document_ids.is_empty() {
        return Ok(Vec::new());
    }

    // The ranked IDs travel as a JSON array bound to json_each(?1).
    let ids_json = serde_json::to_string(document_ids)
        .map_err(|e| LoreError::Other(e.to_string()))?;

    let sql = r#"
        SELECT d.id, d.source_type, d.title, d.url, d.author_username,
               d.created_at, d.updated_at, d.content_text,
               p.path_with_namespace AS project_path,
               (SELECT json_group_array(dl.label_name)
                FROM document_labels dl WHERE dl.document_id = d.id) AS labels_json,
               (SELECT json_group_array(dp.path)
                FROM document_paths dp WHERE dp.document_id = d.id) AS paths_json
        FROM json_each(?1) AS j
        JOIN documents d ON d.id = j.value
        JOIN projects p ON p.id = d.project_id
        ORDER BY j.key
    "#;

    let mut stmt = conn.prepare(sql)?;
    let rows = stmt
        .query_map([ids_json], |row| {
            // json_group_array always produces a JSON array (possibly of
            // nulls); parse_json_array filters those out.
            let labels_json: String = row.get(9)?;
            let paths_json: String = row.get(10)?;

            Ok(HydratedRow {
                document_id: row.get(0)?,
                source_type: row.get(1)?,
                title: row.get(2)?,
                url: row.get(3)?,
                author: row.get(4)?,
                created_at: row.get(5)?,
                updated_at: row.get(6)?,
                content_text: row.get(7)?,
                project_path: row.get(8)?,
                labels: parse_json_array(&labels_json),
                paths: parse_json_array(&paths_json),
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(rows)
}
|
||||
|
||||
/// Parse a JSON array string into a Vec<String>, filtering out null/empty.
|
||||
fn parse_json_array(json: &str) -> Vec<String> {
|
||||
serde_json::from_str::<Vec<serde_json::Value>>(json)
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|v| v.as_str().map(|s| s.to_string()))
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Print human-readable search results.
///
/// Warnings go to stderr; results to stdout. Each result renders a numbered
/// header line, then dimmed URL/project/author lines, labels (if any), a
/// tag-stripped snippet, and an optional explain line.
pub fn print_search_results(response: &SearchResponse) {
    // Warnings are diagnostics, not results: keep them off stdout.
    if !response.warnings.is_empty() {
        for w in &response.warnings {
            eprintln!("{} {}", style("Warning:").yellow(), w);
        }
    }

    if response.results.is_empty() {
        println!(
            "No results found for '{}'",
            style(&response.query).bold()
        );
        return;
    }

    println!(
        "{} results for '{}' ({})",
        response.total_results,
        style(&response.query).bold(),
        response.mode
    );
    println!();

    for (i, result) in response.results.iter().enumerate() {
        // Map internal source_type values to display labels; unknown types
        // pass through verbatim.
        let type_prefix = match result.source_type.as_str() {
            "issue" => "Issue",
            "merge_request" => "MR",
            "discussion" => "Discussion",
            _ => &result.source_type,
        };

        println!(
            "[{}] {} - {} (score: {:.2})",
            i + 1,
            style(type_prefix).cyan(),
            result.title,
            result.score
        );

        if let Some(ref url) = result.url {
            println!("  {}", style(url).dim());
        }

        println!(
            "  {} | {}",
            style(&result.project_path).dim(),
            result
                .author
                .as_deref()
                .map(|a| format!("@{}", a))
                .unwrap_or_default()
        );

        if !result.labels.is_empty() {
            println!(
                "  Labels: {}",
                result.labels.join(", ")
            );
        }

        // Strip HTML tags from snippet for terminal display
        let clean_snippet = result
            .snippet
            .replace("<mark>", "")
            .replace("</mark>", "");
        println!("  {}", style(clean_snippet).dim());

        if let Some(ref explain) = result.explain {
            // fts_rank of None renders as "-" (document absent from FTS list).
            println!(
                "  {} fts_rank={} rrf_score={:.6}",
                style("[explain]").magenta(),
                explain
                    .fts_rank
                    .map(|r| r.to_string())
                    .unwrap_or_else(|| "-".into()),
                explain.rrf_score
            );
        }

        println!();
    }
}
|
||||
|
||||
/// JSON output structures.
|
||||
#[derive(Serialize)]
|
||||
struct SearchJsonOutput<'a> {
|
||||
ok: bool,
|
||||
data: &'a SearchResponse,
|
||||
meta: SearchMeta,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct SearchMeta {
|
||||
elapsed_ms: u64,
|
||||
}
|
||||
|
||||
/// Print JSON robot-mode output.
|
||||
pub fn print_search_results_json(response: &SearchResponse, elapsed_ms: u64) {
|
||||
let output = SearchJsonOutput {
|
||||
ok: true,
|
||||
data: response,
|
||||
meta: SearchMeta { elapsed_ms },
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
@@ -6,7 +6,7 @@ use serde::Serialize;
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::core::time::ms_to_iso;
|
||||
|
||||
@@ -188,11 +188,11 @@ fn find_issue(conn: &Connection, iid: i64, project_filter: Option<&str>) -> Resu
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
match issues.len() {
|
||||
0 => Err(GiError::NotFound(format!("Issue #{} not found", iid))),
|
||||
0 => Err(LoreError::NotFound(format!("Issue #{} not found", iid))),
|
||||
1 => Ok(issues.into_iter().next().unwrap()),
|
||||
_ => {
|
||||
let projects: Vec<String> = issues.iter().map(|i| i.project_path.clone()).collect();
|
||||
Err(GiError::Ambiguous(format!(
|
||||
Err(LoreError::Ambiguous(format!(
|
||||
"Issue #{} exists in multiple projects: {}. Use --project to specify.",
|
||||
iid,
|
||||
projects.join(", ")
|
||||
@@ -386,11 +386,11 @@ fn find_mr(conn: &Connection, iid: i64, project_filter: Option<&str>) -> Result<
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
match mrs.len() {
|
||||
0 => Err(GiError::NotFound(format!("MR !{} not found", iid))),
|
||||
0 => Err(LoreError::NotFound(format!("MR !{} not found", iid))),
|
||||
1 => Ok(mrs.into_iter().next().unwrap()),
|
||||
_ => {
|
||||
let projects: Vec<String> = mrs.iter().map(|m| m.project_path.clone()).collect();
|
||||
Err(GiError::Ambiguous(format!(
|
||||
Err(LoreError::Ambiguous(format!(
|
||||
"MR !{} exists in multiple projects: {}. Use --project to specify.",
|
||||
iid,
|
||||
projects.join(", ")
|
||||
|
||||
348
src/cli/commands/stats.rs
Normal file
348
src/cli/commands/stats.rs
Normal file
@@ -0,0 +1,348 @@
|
||||
//! Stats command: document counts, embedding coverage, queue status, integrity checks.
|
||||
|
||||
use console::style;
|
||||
use rusqlite::Connection;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::core::db::create_connection;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::paths::get_db_path;
|
||||
use crate::Config;
|
||||
|
||||
/// Result of the stats command.
#[derive(Debug, Default, Serialize)]
pub struct StatsResult {
    pub documents: DocumentStats,
    pub embeddings: EmbeddingStats,
    pub fts: FtsStats,
    pub queues: QueueStats,
    // Populated only when run_stats is called with check = true.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub integrity: Option<IntegrityResult>,
}

#[derive(Debug, Default, Serialize)]
pub struct DocumentStats {
    pub total: i64,
    pub issues: i64,
    pub merge_requests: i64,
    pub discussions: i64,
    // Documents flagged is_truncated = 1.
    pub truncated: i64,
}

#[derive(Debug, Default, Serialize)]
pub struct EmbeddingStats {
    // Distinct documents with at least one error-free embedding chunk.
    pub embedded_documents: i64,
    // Error-free chunks across all documents.
    pub total_chunks: i64,
    // embedded_documents / documents.total, as a percentage (0 when empty).
    pub coverage_pct: f64,
}

#[derive(Debug, Default, Serialize)]
pub struct FtsStats {
    // Row count of the documents_fts index.
    pub indexed: i64,
}

#[derive(Debug, Default, Serialize)]
pub struct QueueStats {
    // Pending = last_error IS NULL; failed = last_error IS NOT NULL.
    pub dirty_sources: i64,
    pub dirty_sources_failed: i64,
    pub pending_discussion_fetches: i64,
    pub pending_discussion_fetches_failed: i64,
}

#[derive(Debug, Default, Serialize)]
pub struct IntegrityResult {
    // True only when all three checks below are clean.
    pub ok: bool,
    pub fts_doc_mismatch: bool,
    // embedding_metadata rows whose document no longer exists.
    pub orphan_embeddings: i64,
    // embedding_metadata rows whose document_hash no longer matches.
    pub stale_metadata: i64,
    // Present only when repair was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repair: Option<RepairResult>,
}

#[derive(Debug, Default, Serialize)]
pub struct RepairResult {
    pub fts_rebuilt: bool,
    pub orphans_deleted: i64,
    pub stale_cleared: i64,
}
|
||||
|
||||
/// Run the stats command.
///
/// Always gathers document/FTS/queue counts; embedding stats only when the
/// embedding tables exist. With `check`, runs integrity checks; with `check`
/// AND `repair`, attempts to fix each detected problem in place.
///
/// # Errors
/// Fails if the database cannot be opened or a repair statement fails.
/// (Plain count queries swallow errors via count_query and report 0.)
pub fn run_stats(
    config: &Config,
    check: bool,
    repair: bool,
) -> Result<StatsResult> {
    let db_path = get_db_path(config.storage.db_path.as_deref());
    let conn = create_connection(&db_path)?;

    let mut result = StatsResult::default();

    // Document counts
    result.documents.total = count_query(&conn, "SELECT COUNT(*) FROM documents")?;
    result.documents.issues =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'issue'")?;
    result.documents.merge_requests =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'merge_request'")?;
    result.documents.discussions =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE source_type = 'discussion'")?;
    result.documents.truncated =
        count_query(&conn, "SELECT COUNT(*) FROM documents WHERE is_truncated = 1")?;

    // Embedding stats — skip gracefully if table doesn't exist (Gate A only)
    if table_exists(&conn, "embedding_metadata") {
        let embedded = count_query(
            &conn,
            "SELECT COUNT(DISTINCT document_id) FROM embedding_metadata WHERE last_error IS NULL",
        )?;
        let chunks = count_query(
            &conn,
            "SELECT COUNT(*) FROM embedding_metadata WHERE last_error IS NULL",
        )?;
        result.embeddings.embedded_documents = embedded;
        result.embeddings.total_chunks = chunks;
        // Guard against division by zero on an empty database.
        result.embeddings.coverage_pct = if result.documents.total > 0 {
            (embedded as f64 / result.documents.total as f64) * 100.0
        } else {
            0.0
        };
    }

    // FTS stats
    result.fts.indexed = count_query(&conn, "SELECT COUNT(*) FROM documents_fts")?;

    // Queue stats
    result.queues.dirty_sources =
        count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NULL")?;
    result.queues.dirty_sources_failed =
        count_query(&conn, "SELECT COUNT(*) FROM dirty_sources WHERE last_error IS NOT NULL")?;

    if table_exists(&conn, "pending_discussion_fetches") {
        result.queues.pending_discussion_fetches = count_query(
            &conn,
            "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NULL",
        )?;
        result.queues.pending_discussion_fetches_failed = count_query(
            &conn,
            "SELECT COUNT(*) FROM pending_discussion_fetches WHERE last_error IS NOT NULL",
        )?;
    }

    // Integrity check
    if check {
        let mut integrity = IntegrityResult::default();

        // FTS/doc count mismatch
        integrity.fts_doc_mismatch = result.fts.indexed != result.documents.total;

        // Orphan embeddings (rowid/1000 should match a document ID)
        // NOTE(review): this queries embedding_metadata but is gated on the
        // "embeddings" table existing — confirm the gate; if only one of the
        // two tables exists, count_query's error-swallowing masks it as 0.
        if table_exists(&conn, "embeddings") {
            integrity.orphan_embeddings = count_query(
                &conn,
                "SELECT COUNT(*) FROM embedding_metadata em
                 WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = em.document_id)",
            )?;
        }

        // Stale metadata (document_hash != current content_hash)
        if table_exists(&conn, "embedding_metadata") {
            integrity.stale_metadata = count_query(
                &conn,
                "SELECT COUNT(*) FROM embedding_metadata em
                 JOIN documents d ON d.id = em.document_id
                 WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash",
            )?;
        }

        integrity.ok = !integrity.fts_doc_mismatch
            && integrity.orphan_embeddings == 0
            && integrity.stale_metadata == 0;

        // Repair
        if repair {
            let mut repair_result = RepairResult::default();

            if integrity.fts_doc_mismatch {
                // Rebuild the external-content FTS index from scratch.
                conn.execute(
                    "INSERT INTO documents_fts(documents_fts) VALUES('rebuild')",
                    [],
                )?;
                repair_result.fts_rebuilt = true;
            }

            if integrity.orphan_embeddings > 0 && table_exists(&conn, "embedding_metadata") {
                let deleted = conn.execute(
                    "DELETE FROM embedding_metadata
                     WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = embedding_metadata.document_id)",
                    [],
                )?;
                repair_result.orphans_deleted = deleted as i64;

                // Also clean orphaned vectors if vec0 table exists
                // (best-effort: errors deliberately ignored here).
                if table_exists(&conn, "embeddings") {
                    let _ = conn.execute(
                        "DELETE FROM embeddings
                         WHERE rowid / 1000 NOT IN (SELECT id FROM documents)",
                        [],
                    );
                }
            }

            if integrity.stale_metadata > 0 && table_exists(&conn, "embedding_metadata") {
                let cleared = conn.execute(
                    "DELETE FROM embedding_metadata
                     WHERE document_id IN (
                         SELECT em.document_id FROM embedding_metadata em
                         JOIN documents d ON d.id = em.document_id
                         WHERE em.chunk_index = 0 AND em.document_hash != d.content_hash
                     )",
                    [],
                )?;
                repair_result.stale_cleared = cleared as i64;
            }

            integrity.repair = Some(repair_result);
        }

        result.integrity = Some(integrity);
    }

    Ok(result)
}
|
||||
|
||||
/// Run a scalar COUNT(*)-style query, treating any failure as zero.
///
/// Errors (e.g. a table that does not exist yet) are deliberately
/// swallowed: callers guard with `table_exists` first, and the stats
/// report should degrade to 0 rather than abort entirely.
fn count_query(conn: &Connection, sql: &str) -> Result<i64> {
    Ok(conn
        .query_row(sql, [], |row| row.get::<_, i64>(0))
        .unwrap_or(0))
}
|
||||
|
||||
fn table_exists(conn: &Connection, table: &str) -> bool {
|
||||
conn.query_row(
|
||||
"SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
|
||||
[table],
|
||||
|row| row.get::<_, i64>(0),
|
||||
)
|
||||
.unwrap_or(0)
|
||||
> 0
|
||||
}
|
||||
|
||||
/// Print human-readable stats.
///
/// Sections: Documents, Search Index, Queues, then (only when
/// `result.integrity` is populated, i.e. `--check` was passed) an
/// Integrity section with an optional Repair sub-section.
pub fn print_stats(result: &StatsResult) {
    // Document counts broken down by source type.
    println!("{}", style("Documents").cyan().bold());
    println!(" Total: {}", result.documents.total);
    println!(" Issues: {}", result.documents.issues);
    println!(" Merge Requests: {}", result.documents.merge_requests);
    println!(" Discussions: {}", result.documents.discussions);
    if result.documents.truncated > 0 {
        // Highlighted in yellow: truncated documents lost some content.
        println!(" Truncated: {}", style(result.documents.truncated).yellow());
    }
    println!();

    // FTS and embedding coverage relative to the total document count.
    println!("{}", style("Search Index").cyan().bold());
    println!(" FTS indexed: {}", result.fts.indexed);
    println!(
        " Embedding coverage: {:.1}% ({}/{})",
        result.embeddings.coverage_pct,
        result.embeddings.embedded_documents,
        result.documents.total
    );
    if result.embeddings.total_chunks > 0 {
        println!(" Total chunks: {}", result.embeddings.total_chunks);
    }
    println!();

    // Work-queue depth: pending vs. failed entries per queue.
    println!("{}", style("Queues").cyan().bold());
    println!(" Dirty sources: {} pending, {} failed",
        result.queues.dirty_sources,
        result.queues.dirty_sources_failed
    );
    println!(" Discussion fetch: {} pending, {} failed",
        result.queues.pending_discussion_fetches,
        result.queues.pending_discussion_fetches_failed
    );

    // Integrity section only appears when checks were requested.
    if let Some(ref integrity) = result.integrity {
        println!();
        let status = if integrity.ok {
            style("OK").green().bold()
        } else {
            style("ISSUES FOUND").red().bold()
        };
        println!("{} Integrity: {}", style("Check").cyan().bold(), status);

        // One "!" line per detected problem class.
        if integrity.fts_doc_mismatch {
            println!(" {} FTS/document count mismatch", style("!").red());
        }
        if integrity.orphan_embeddings > 0 {
            println!(
                " {} {} orphan embeddings",
                style("!").red(),
                integrity.orphan_embeddings
            );
        }
        if integrity.stale_metadata > 0 {
            println!(
                " {} {} stale embedding metadata",
                style("!").red(),
                integrity.stale_metadata
            );
        }

        // Repair summary only when --repair actually ran.
        if let Some(ref repair) = integrity.repair {
            println!();
            println!("{}", style("Repair").cyan().bold());
            if repair.fts_rebuilt {
                println!(" {} FTS index rebuilt", style("fixed").green());
            }
            if repair.orphans_deleted > 0 {
                println!(
                    " {} {} orphan embeddings deleted",
                    style("fixed").green(),
                    repair.orphans_deleted
                );
            }
            if repair.stale_cleared > 0 {
                println!(
                    " {} {} stale metadata entries cleared",
                    style("fixed").green(),
                    repair.stale_cleared
                );
            }
            // Nothing was broken, so nothing was repaired.
            if !repair.fts_rebuilt && repair.orphans_deleted == 0 && repair.stale_cleared == 0 {
                println!(" No issues to repair.");
            }
        }
    }
}
|
||||
|
||||
/// JSON output structures.
///
/// Robot-mode envelope for `lore stats`: `{"ok": true, "data": {...}}`.
#[derive(Serialize)]
struct StatsJsonOutput {
    /// Always `true` on this success path; `print_stats_json` sets it
    /// unconditionally (failures never reach serialization here).
    ok: bool,
    /// Full stats payload, serialized via `StatsResult`'s own `Serialize`.
    data: StatsResult,
}
|
||||
|
||||
/// Print JSON robot-mode output.
|
||||
pub fn print_stats_json(result: &StatsResult) {
|
||||
let output = StatsJsonOutput {
|
||||
ok: true,
|
||||
data: StatsResult {
|
||||
documents: DocumentStats { ..*&result.documents },
|
||||
embeddings: EmbeddingStats { ..*&result.embeddings },
|
||||
fts: FtsStats { ..*&result.fts },
|
||||
queues: QueueStats { ..*&result.queues },
|
||||
integrity: result.integrity.as_ref().map(|i| IntegrityResult {
|
||||
ok: i.ok,
|
||||
fts_doc_mismatch: i.fts_doc_mismatch,
|
||||
orphan_embeddings: i.orphan_embeddings,
|
||||
stale_metadata: i.stale_metadata,
|
||||
repair: i.repair.as_ref().map(|r| RepairResult {
|
||||
fts_rebuilt: r.fts_rebuilt,
|
||||
orphans_deleted: r.orphans_deleted,
|
||||
stale_cleared: r.stale_cleared,
|
||||
}),
|
||||
}),
|
||||
},
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
124
src/cli/commands/sync.rs
Normal file
124
src/cli/commands/sync.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
//! Sync command: unified orchestrator for ingest -> generate-docs -> embed.
|
||||
|
||||
use console::style;
|
||||
use serde::Serialize;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::error::Result;
|
||||
|
||||
use super::embed::run_embed;
|
||||
use super::generate_docs::run_generate_docs;
|
||||
use super::ingest::run_ingest;
|
||||
|
||||
/// Options for the sync command.
///
/// Mirrors the `lore sync` CLI flags; see `SyncArgs` in the CLI layer.
#[derive(Debug, Default)]
pub struct SyncOptions {
    /// Reset cursors and re-fetch everything (CLI `--full`).
    pub full: bool,
    /// Override a stale lock (CLI `--force`).
    pub force: bool,
    /// Skip the embedding stage (stage 4 of the pipeline).
    pub no_embed: bool,
    /// Skip document regeneration (stage 3 of the pipeline).
    pub no_docs: bool,
}
|
||||
|
||||
/// Result of the sync command.
///
/// Aggregated counters from all four pipeline stages; serialized as the
/// `data` field of the robot-mode JSON envelope.
#[derive(Debug, Default, Serialize)]
pub struct SyncResult {
    /// Issues upserted during stage 1 (issue ingest).
    pub issues_updated: usize,
    /// Merge requests upserted during stage 2 (MR ingest).
    pub mrs_updated: usize,
    /// Discussions fetched across both ingest stages (summed).
    pub discussions_fetched: usize,
    /// Documents regenerated in stage 3 (0 when `--no-docs`).
    pub documents_regenerated: usize,
    /// Documents embedded in stage 4 (0 when `--no-embed` or Ollama down).
    pub documents_embedded: usize,
}
|
||||
|
||||
/// Run the full sync pipeline: ingest -> generate-docs -> embed.
|
||||
pub async fn run_sync(config: &Config, options: SyncOptions) -> Result<SyncResult> {
|
||||
let mut result = SyncResult::default();
|
||||
|
||||
// Stage 1: Ingest issues
|
||||
info!("Sync stage 1/4: ingesting issues");
|
||||
let issues_result = run_ingest(config, "issues", None, options.force, options.full, true).await?;
|
||||
result.issues_updated = issues_result.issues_upserted;
|
||||
result.discussions_fetched += issues_result.discussions_fetched;
|
||||
|
||||
// Stage 2: Ingest MRs
|
||||
info!("Sync stage 2/4: ingesting merge requests");
|
||||
let mrs_result = run_ingest(config, "mrs", None, options.force, options.full, true).await?;
|
||||
result.mrs_updated = mrs_result.mrs_upserted;
|
||||
result.discussions_fetched += mrs_result.discussions_fetched;
|
||||
|
||||
// Stage 3: Generate documents (unless --no-docs)
|
||||
if options.no_docs {
|
||||
info!("Sync stage 3/4: skipping document generation (--no-docs)");
|
||||
} else {
|
||||
info!("Sync stage 3/4: generating documents");
|
||||
let docs_result = run_generate_docs(config, false, None)?;
|
||||
result.documents_regenerated = docs_result.regenerated;
|
||||
}
|
||||
|
||||
// Stage 4: Embed documents (unless --no-embed)
|
||||
if options.no_embed {
|
||||
info!("Sync stage 4/4: skipping embedding (--no-embed)");
|
||||
} else {
|
||||
info!("Sync stage 4/4: embedding documents");
|
||||
match run_embed(config, false).await {
|
||||
Ok(embed_result) => {
|
||||
result.documents_embedded = embed_result.embedded;
|
||||
}
|
||||
Err(e) => {
|
||||
// Graceful degradation: Ollama down is a warning, not an error
|
||||
warn!(error = %e, "Embedding stage failed (Ollama may be unavailable), continuing");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
issues = result.issues_updated,
|
||||
mrs = result.mrs_updated,
|
||||
discussions = result.discussions_fetched,
|
||||
docs = result.documents_regenerated,
|
||||
embedded = result.documents_embedded,
|
||||
"Sync pipeline complete"
|
||||
);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Print human-readable sync summary.
///
/// One counter line per pipeline stage, followed by wall-clock time.
pub fn print_sync(result: &SyncResult, elapsed: std::time::Duration) {
    println!(
        "{} Sync complete:",
        style("done").green().bold(),
    );
    println!(" Issues updated: {}", result.issues_updated);
    println!(" MRs updated: {}", result.mrs_updated);
    println!(" Discussions fetched: {}", result.discussions_fetched);
    println!(" Documents regenerated: {}", result.documents_regenerated);
    println!(" Documents embedded: {}", result.documents_embedded);
    // Seconds with one decimal place, e.g. "12.3s".
    println!(
        " Elapsed: {:.1}s",
        elapsed.as_secs_f64()
    );
}
|
||||
|
||||
/// JSON output for sync.
///
/// Robot-mode envelope: `{"ok": true, "data": {...}, "meta": {...}}`.
/// Borrows the result so no copy is made for serialization.
#[derive(Serialize)]
struct SyncJsonOutput<'a> {
    /// Always `true` here; `print_sync_json` sets it unconditionally.
    ok: bool,
    /// Borrowed sync counters (see `SyncResult`).
    data: &'a SyncResult,
    /// Timing metadata attached alongside the payload.
    meta: SyncMeta,
}

/// Timing metadata for the sync envelope.
#[derive(Serialize)]
struct SyncMeta {
    /// Total pipeline wall-clock time in milliseconds.
    elapsed_ms: u64,
}
|
||||
|
||||
/// Print JSON robot-mode sync output.
|
||||
pub fn print_sync_json(result: &SyncResult, elapsed_ms: u64) {
|
||||
let output = SyncJsonOutput {
|
||||
ok: true,
|
||||
data: result,
|
||||
meta: SyncMeta { elapsed_ms },
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
138
src/cli/mod.rs
138
src/cli/mod.rs
@@ -69,6 +69,18 @@ pub enum Commands {
|
||||
/// Fail if prompts would be shown
|
||||
#[arg(long)]
|
||||
non_interactive: bool,
|
||||
|
||||
/// GitLab base URL (required in robot mode)
|
||||
#[arg(long)]
|
||||
gitlab_url: Option<String>,
|
||||
|
||||
/// Environment variable name holding GitLab token (required in robot mode)
|
||||
#[arg(long)]
|
||||
token_env_var: Option<String>,
|
||||
|
||||
/// Comma-separated project paths (required in robot mode)
|
||||
#[arg(long)]
|
||||
projects: Option<String>,
|
||||
},
|
||||
|
||||
/// Create timestamped database backup
|
||||
@@ -81,9 +93,32 @@ pub enum Commands {
|
||||
yes: bool,
|
||||
},
|
||||
|
||||
/// Search indexed documents
|
||||
Search(SearchArgs),
|
||||
|
||||
/// Show document and index statistics
|
||||
Stats(StatsArgs),
|
||||
|
||||
/// Generate searchable documents from ingested data
|
||||
#[command(name = "generate-docs")]
|
||||
GenerateDocs(GenerateDocsArgs),
|
||||
|
||||
/// Generate vector embeddings for documents via Ollama
|
||||
Embed(EmbedArgs),
|
||||
|
||||
/// Run full sync pipeline: ingest -> generate-docs -> embed
|
||||
Sync(SyncArgs),
|
||||
|
||||
/// Run pending database migrations
|
||||
Migrate,
|
||||
|
||||
/// Quick health check: config, database, schema version
|
||||
Health,
|
||||
|
||||
/// Machine-readable command manifest for agent self-discovery
|
||||
#[command(name = "robot-docs")]
|
||||
RobotDocs,
|
||||
|
||||
// --- Hidden backward-compat aliases ---
|
||||
/// List issues or MRs (deprecated: use 'lore issues' or 'lore mrs')
|
||||
#[command(hide = true)]
|
||||
@@ -299,6 +334,109 @@ pub struct IngestArgs {
|
||||
pub full: bool,
|
||||
}
|
||||
|
||||
// NOTE: the `///` doc comments below double as clap help text — they are
// runtime-visible and intentionally left unchanged.
/// Arguments for `lore stats`
#[derive(Parser)]
pub struct StatsArgs {
    /// Run integrity checks
    #[arg(long)]
    pub check: bool,

    /// Repair integrity issues (requires --check)
    // `requires = "check"` makes clap reject `--repair` without `--check`.
    #[arg(long, requires = "check")]
    pub repair: bool,
}
|
||||
|
||||
// NOTE: the `///` doc comments below double as clap help text — they are
// runtime-visible and intentionally left unchanged.
/// Arguments for `lore search <QUERY>`
#[derive(Parser)]
pub struct SearchArgs {
    // Positional argument: the only required input.
    /// Search query string
    pub query: String,

    /// Search mode (lexical, hybrid, semantic)
    #[arg(long, default_value = "hybrid")]
    pub mode: String,

    // Exposed as `--type`; field renamed because `type` is a Rust keyword.
    /// Filter by source type (issue, mr, discussion)
    #[arg(long = "type", value_name = "TYPE")]
    pub source_type: Option<String>,

    /// Filter by author username
    #[arg(long)]
    pub author: Option<String>,

    /// Filter by project path
    #[arg(short = 'p', long)]
    pub project: Option<String>,

    // ArgAction::Append: each `--label` occurrence adds one entry.
    /// Filter by label (repeatable, AND logic)
    #[arg(long, action = clap::ArgAction::Append)]
    pub label: Vec<String>,

    /// Filter by file path (trailing / for prefix match)
    #[arg(long)]
    pub path: Option<String>,

    /// Filter by created after (7d, 2w, or YYYY-MM-DD)
    #[arg(long)]
    pub after: Option<String>,

    /// Filter by updated after (7d, 2w, or YYYY-MM-DD)
    #[arg(long = "updated-after")]
    pub updated_after: Option<String>,

    // NOTE(review): the "max 100" in the help text is not enforced by this
    // struct — presumably clamped in the search handler; confirm there.
    /// Maximum results (default 20, max 100)
    #[arg(short = 'n', long = "limit", default_value = "20")]
    pub limit: usize,

    /// Show ranking explanation per result
    #[arg(long)]
    pub explain: bool,

    /// FTS query mode: safe (default) or raw
    #[arg(long = "fts-mode", default_value = "safe")]
    pub fts_mode: String,
}
|
||||
|
||||
// NOTE: the `///` doc comments below double as clap help text — they are
// runtime-visible and intentionally left unchanged.
/// Arguments for `lore generate-docs`
#[derive(Parser)]
pub struct GenerateDocsArgs {
    /// Full rebuild: seed all entities into dirty queue, then drain
    #[arg(long)]
    pub full: bool,

    /// Filter to single project
    #[arg(short = 'p', long)]
    pub project: Option<String>,
}
|
||||
|
||||
// NOTE: the `///` doc comments below double as clap help text — they are
// runtime-visible and intentionally left unchanged.
// NOTE(review): fields appear to map one-to-one onto `SyncOptions` in the
// sync command module — confirm the dispatcher wiring.
/// Arguments for `lore sync`
#[derive(Parser)]
pub struct SyncArgs {
    /// Reset cursors, fetch everything
    #[arg(long)]
    pub full: bool,

    /// Override stale lock
    #[arg(long)]
    pub force: bool,

    /// Skip embedding step
    #[arg(long)]
    pub no_embed: bool,

    /// Skip document regeneration
    #[arg(long)]
    pub no_docs: bool,
}
|
||||
|
||||
// NOTE: the `///` doc comment below doubles as clap help text — it is
// runtime-visible and intentionally left unchanged.
/// Arguments for `lore embed`
#[derive(Parser)]
pub struct EmbedArgs {
    /// Retry previously failed embeddings
    #[arg(long)]
    pub retry_failed: bool,
}
|
||||
|
||||
/// Arguments for `lore count <ENTITY>`
|
||||
#[derive(Parser)]
|
||||
pub struct CountArgs {
|
||||
|
||||
99
src/core/backoff.rs
Normal file
99
src/core/backoff.rs
Normal file
@@ -0,0 +1,99 @@
|
||||
use rand::Rng;
|
||||
|
||||
/// Compute next_attempt_at with exponential backoff and jitter.
|
||||
///
|
||||
/// Formula: now + min(3600000, 1000 * 2^attempt_count) * (0.9 to 1.1)
|
||||
/// - Capped at 1 hour to prevent runaway delays
|
||||
/// - ±10% jitter prevents synchronized retries after outages
|
||||
///
|
||||
/// Used by:
|
||||
/// - `dirty_sources` retry scheduling (document regeneration failures)
|
||||
/// - `pending_discussion_fetches` retry scheduling (API fetch failures)
|
||||
///
|
||||
/// Having one implementation prevents subtle divergence between queues
|
||||
/// (e.g., different caps or jitter ranges).
|
||||
pub fn compute_next_attempt_at(now: i64, attempt_count: i64) -> i64 {
|
||||
// Cap attempt_count to prevent overflow (2^30 > 1 hour anyway)
|
||||
let capped_attempts = attempt_count.min(30) as u32;
|
||||
let base_delay_ms = 1000_i64.saturating_mul(1 << capped_attempts);
|
||||
let capped_delay_ms = base_delay_ms.min(3_600_000); // 1 hour cap
|
||||
|
||||
// Add ±10% jitter
|
||||
let jitter_factor = rand::thread_rng().gen_range(0.9..=1.1);
|
||||
let delay_with_jitter = (capped_delay_ms as f64 * jitter_factor) as i64;
|
||||
|
||||
now + delay_with_jitter
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Mirrors the 1-hour cap inside compute_next_attempt_at.
    const MAX_DELAY_MS: i64 = 3_600_000;

    // The 0.89/1.11 bounds below are slightly wider than the 0.9..=1.1
    // jitter range to leave slack for the `as i64` truncation.
    #[test]
    fn test_exponential_curve() {
        let now = 1_000_000_000_i64;
        // Each attempt should roughly double the delay (within jitter)
        for attempt in 1..=10 {
            let result = compute_next_attempt_at(now, attempt);
            let delay = result - now;
            let expected_base = 1000_i64 * (1 << attempt);
            let min_expected = (expected_base as f64 * 0.89) as i64;
            let max_expected = (expected_base as f64 * 1.11) as i64;
            assert!(
                delay >= min_expected && delay <= max_expected,
                "attempt {attempt}: delay {delay} not in [{min_expected}, {max_expected}]"
            );
        }
    }

    // Past ~attempt 22 the exponential would exceed an hour; verify the cap.
    #[test]
    fn test_cap_at_one_hour() {
        let now = 1_000_000_000_i64;
        for attempt in [20, 25, 30, 50, 100] {
            let result = compute_next_attempt_at(now, attempt);
            let delay = result - now;
            let max_with_jitter = (MAX_DELAY_MS as f64 * 1.11) as i64;
            assert!(
                delay <= max_with_jitter,
                "attempt {attempt}: delay {delay} exceeds cap {max_with_jitter}"
            );
        }
    }

    // Statistical check: 100 samples must all land inside the jitter band.
    #[test]
    fn test_jitter_range() {
        let now = 1_000_000_000_i64;
        let attempt = 5; // base = 32000
        let base = 1000_i64 * (1 << attempt);
        let min_delay = (base as f64 * 0.89) as i64;
        let max_delay = (base as f64 * 1.11) as i64;

        for _ in 0..100 {
            let result = compute_next_attempt_at(now, attempt);
            let delay = result - now;
            assert!(
                delay >= min_delay && delay <= max_delay,
                "delay {delay} not in jitter range [{min_delay}, {max_delay}]"
            );
        }
    }

    #[test]
    fn test_first_retry_is_about_two_seconds() {
        let now = 1_000_000_000_i64;
        let result = compute_next_attempt_at(now, 1);
        let delay = result - now;
        // attempt 1: base = 2000ms, with jitter: 1800-2200ms
        assert!(delay >= 1800 && delay <= 2200, "first retry delay: {delay}ms");
    }

    // Extreme inputs must neither panic nor wrap the timestamp.
    #[test]
    fn test_overflow_safety() {
        let now = i64::MAX / 2;
        // Should not panic even with very large attempt_count
        let result = compute_next_attempt_at(now, i64::MAX);
        assert!(result > now);
    }
}
|
||||
@@ -6,7 +6,7 @@ use serde::Deserialize;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use super::error::{GiError, Result};
|
||||
use super::error::{LoreError, Result};
|
||||
use super::paths::get_config_path;
|
||||
|
||||
/// GitLab connection settings.
|
||||
@@ -130,7 +130,7 @@ impl Config {
|
||||
let config_path = get_config_path(cli_override);
|
||||
|
||||
if !config_path.exists() {
|
||||
return Err(GiError::ConfigNotFound {
|
||||
return Err(LoreError::ConfigNotFound {
|
||||
path: config_path.display().to_string(),
|
||||
});
|
||||
}
|
||||
@@ -140,25 +140,25 @@ impl Config {
|
||||
|
||||
/// Load configuration from a specific path.
|
||||
pub fn load_from_path(path: &Path) -> Result<Self> {
|
||||
let content = fs::read_to_string(path).map_err(|e| GiError::ConfigInvalid {
|
||||
let content = fs::read_to_string(path).map_err(|e| LoreError::ConfigInvalid {
|
||||
details: format!("Failed to read config file: {e}"),
|
||||
})?;
|
||||
|
||||
let config: Config =
|
||||
serde_json::from_str(&content).map_err(|e| GiError::ConfigInvalid {
|
||||
serde_json::from_str(&content).map_err(|e| LoreError::ConfigInvalid {
|
||||
details: format!("Invalid JSON: {e}"),
|
||||
})?;
|
||||
|
||||
// Validate required fields
|
||||
if config.projects.is_empty() {
|
||||
return Err(GiError::ConfigInvalid {
|
||||
return Err(LoreError::ConfigInvalid {
|
||||
details: "At least one project is required".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
for project in &config.projects {
|
||||
if project.path.is_empty() {
|
||||
return Err(GiError::ConfigInvalid {
|
||||
return Err(LoreError::ConfigInvalid {
|
||||
details: "Project path cannot be empty".to_string(),
|
||||
});
|
||||
}
|
||||
@@ -166,7 +166,7 @@ impl Config {
|
||||
|
||||
// Validate URL format
|
||||
if url::Url::parse(&config.gitlab.base_url).is_err() {
|
||||
return Err(GiError::ConfigInvalid {
|
||||
return Err(LoreError::ConfigInvalid {
|
||||
details: format!("Invalid GitLab URL: {}", config.gitlab.base_url),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::fs;
|
||||
use std::path::Path;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::error::{GiError, Result};
|
||||
use super::error::{LoreError, Result};
|
||||
|
||||
/// Embedded migrations - compiled into the binary.
|
||||
const MIGRATIONS: &[(&str, &str)] = &[
|
||||
@@ -27,6 +27,18 @@ const MIGRATIONS: &[(&str, &str)] = &[
|
||||
"006",
|
||||
include_str!("../../migrations/006_merge_requests.sql"),
|
||||
),
|
||||
(
|
||||
"007",
|
||||
include_str!("../../migrations/007_documents.sql"),
|
||||
),
|
||||
(
|
||||
"008",
|
||||
include_str!("../../migrations/008_fts5.sql"),
|
||||
),
|
||||
(
|
||||
"009",
|
||||
include_str!("../../migrations/009_embeddings.sql"),
|
||||
),
|
||||
];
|
||||
|
||||
/// Create a database connection with production-grade pragmas.
|
||||
@@ -88,13 +100,36 @@ pub fn run_migrations(conn: &Connection) -> Result<()> {
|
||||
continue;
|
||||
}
|
||||
|
||||
conn.execute_batch(sql)
|
||||
.map_err(|e| GiError::MigrationFailed {
|
||||
// Wrap each migration in a transaction to prevent partial application.
|
||||
// If the migration SQL already contains BEGIN/COMMIT, execute_batch handles
|
||||
// it, but wrapping in a savepoint ensures atomicity for those that don't.
|
||||
let savepoint_name = format!("migration_{}", version);
|
||||
conn.execute_batch(&format!("SAVEPOINT {}", savepoint_name))
|
||||
.map_err(|e| LoreError::MigrationFailed {
|
||||
version,
|
||||
message: e.to_string(),
|
||||
message: format!("Failed to create savepoint: {}", e),
|
||||
source: Some(e),
|
||||
})?;
|
||||
|
||||
match conn.execute_batch(sql) {
|
||||
Ok(()) => {
|
||||
conn.execute_batch(&format!("RELEASE {}", savepoint_name))
|
||||
.map_err(|e| LoreError::MigrationFailed {
|
||||
version,
|
||||
message: format!("Failed to release savepoint: {}", e),
|
||||
source: Some(e),
|
||||
})?;
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = conn.execute_batch(&format!("ROLLBACK TO {}", savepoint_name));
|
||||
return Err(LoreError::MigrationFailed {
|
||||
version,
|
||||
message: e.to_string(),
|
||||
source: Some(e),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
info!(version, "Migration applied");
|
||||
}
|
||||
|
||||
@@ -146,7 +181,7 @@ pub fn run_migrations_from_dir(conn: &Connection, migrations_dir: &Path) -> Resu
|
||||
let sql = fs::read_to_string(entry.path())?;
|
||||
|
||||
conn.execute_batch(&sql)
|
||||
.map_err(|e| GiError::MigrationFailed {
|
||||
.map_err(|e| LoreError::MigrationFailed {
|
||||
version,
|
||||
message: e.to_string(),
|
||||
source: Some(e),
|
||||
|
||||
@@ -21,6 +21,9 @@ pub enum ErrorCode {
|
||||
TransformError,
|
||||
IoError,
|
||||
InternalError,
|
||||
OllamaUnavailable,
|
||||
OllamaModelNotFound,
|
||||
EmbeddingFailed,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ErrorCode {
|
||||
@@ -39,6 +42,9 @@ impl std::fmt::Display for ErrorCode {
|
||||
Self::TransformError => "TRANSFORM_ERROR",
|
||||
Self::IoError => "IO_ERROR",
|
||||
Self::InternalError => "INTERNAL_ERROR",
|
||||
Self::OllamaUnavailable => "OLLAMA_UNAVAILABLE",
|
||||
Self::OllamaModelNotFound => "OLLAMA_MODEL_NOT_FOUND",
|
||||
Self::EmbeddingFailed => "EMBEDDING_FAILED",
|
||||
};
|
||||
write!(f, "{code}")
|
||||
}
|
||||
@@ -61,13 +67,16 @@ impl ErrorCode {
|
||||
Self::MigrationFailed => 11,
|
||||
Self::IoError => 12,
|
||||
Self::TransformError => 13,
|
||||
Self::OllamaUnavailable => 14,
|
||||
Self::OllamaModelNotFound => 15,
|
||||
Self::EmbeddingFailed => 16,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Main error type for gitlore.
|
||||
#[derive(Error, Debug)]
|
||||
pub enum GiError {
|
||||
pub enum LoreError {
|
||||
#[error("Config file not found at {path}. Run \"lore init\" first.")]
|
||||
ConfigNotFound { path: String },
|
||||
|
||||
@@ -129,9 +138,25 @@ pub enum GiError {
|
||||
|
||||
#[error("{0}")]
|
||||
Other(String),
|
||||
|
||||
#[error("Cannot connect to Ollama at {base_url}. Is it running?")]
|
||||
OllamaUnavailable {
|
||||
base_url: String,
|
||||
#[source]
|
||||
source: Option<reqwest::Error>,
|
||||
},
|
||||
|
||||
#[error("Ollama model '{model}' not found. Run: ollama pull {model}")]
|
||||
OllamaModelNotFound { model: String },
|
||||
|
||||
#[error("Embedding failed for document {document_id}: {reason}")]
|
||||
EmbeddingFailed { document_id: i64, reason: String },
|
||||
|
||||
#[error("No embeddings found. Run: lore embed")]
|
||||
EmbeddingsNotBuilt,
|
||||
}
|
||||
|
||||
impl GiError {
|
||||
impl LoreError {
|
||||
/// Get the error code for programmatic handling.
|
||||
pub fn code(&self) -> ErrorCode {
|
||||
match self {
|
||||
@@ -152,6 +177,10 @@ impl GiError {
|
||||
Self::NotFound(_) => ErrorCode::GitLabNotFound,
|
||||
Self::Ambiguous(_) => ErrorCode::GitLabNotFound,
|
||||
Self::Other(_) => ErrorCode::InternalError,
|
||||
Self::OllamaUnavailable { .. } => ErrorCode::OllamaUnavailable,
|
||||
Self::OllamaModelNotFound { .. } => ErrorCode::OllamaModelNotFound,
|
||||
Self::EmbeddingFailed { .. } => ErrorCode::EmbeddingFailed,
|
||||
Self::EmbeddingsNotBuilt => ErrorCode::EmbeddingFailed,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -193,7 +222,15 @@ impl GiError {
|
||||
Self::Ambiguous(_) => Some(
|
||||
"Use -p to choose a specific project.\n\n Example:\n lore issues 42 -p group/project-a\n lore mrs 99 -p group/project-b",
|
||||
),
|
||||
_ => None,
|
||||
Self::OllamaUnavailable { .. } => Some("Start Ollama: ollama serve"),
|
||||
Self::OllamaModelNotFound { .. } => {
|
||||
Some("Pull the model: ollama pull nomic-embed-text")
|
||||
}
|
||||
Self::EmbeddingFailed { .. } => {
|
||||
Some("Check Ollama logs or retry with 'lore embed --retry-failed'")
|
||||
}
|
||||
Self::EmbeddingsNotBuilt => Some("Generate embeddings first: lore embed"),
|
||||
Self::Json(_) | Self::Io(_) | Self::Transform(_) | Self::Other(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,12 +264,12 @@ pub struct RobotErrorOutput {
|
||||
pub error: RobotError,
|
||||
}
|
||||
|
||||
impl From<&GiError> for RobotErrorOutput {
|
||||
fn from(e: &GiError) -> Self {
|
||||
impl From<&LoreError> for RobotErrorOutput {
|
||||
fn from(e: &LoreError) -> Self {
|
||||
Self {
|
||||
error: e.to_robot_error(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, GiError>;
|
||||
pub type Result<T> = std::result::Result<T, LoreError>;
|
||||
|
||||
@@ -12,7 +12,7 @@ use tracing::{debug, error, info, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::db::create_connection;
|
||||
use super::error::{GiError, Result};
|
||||
use super::error::{LoreError, Result};
|
||||
use super::time::{ms_to_iso, now_ms};
|
||||
|
||||
/// Maximum consecutive heartbeat failures before signaling error.
|
||||
@@ -116,7 +116,7 @@ impl AppLock {
|
||||
} else {
|
||||
// Lock held by another active process - rollback and return error
|
||||
drop(tx);
|
||||
return Err(GiError::DatabaseLocked {
|
||||
return Err(LoreError::DatabaseLocked {
|
||||
owner: existing_owner,
|
||||
started_at: ms_to_iso(acquired_at),
|
||||
});
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
//! Core infrastructure modules.
|
||||
|
||||
pub mod backoff;
|
||||
pub mod config;
|
||||
pub mod db;
|
||||
pub mod error;
|
||||
pub mod lock;
|
||||
pub mod paths;
|
||||
pub mod payloads;
|
||||
pub mod project;
|
||||
pub mod time;
|
||||
|
||||
pub use config::Config;
|
||||
pub use error::{GiError, Result};
|
||||
pub use error::{LoreError, Result};
|
||||
|
||||
163
src/core/project.rs
Normal file
163
src/core/project.rs
Normal file
@@ -0,0 +1,163 @@
|
||||
use rusqlite::Connection;
|
||||
|
||||
use super::error::{LoreError, Result};
|
||||
|
||||
/// Resolve a project string to a project_id using cascading match:
|
||||
/// 1. Exact match on path_with_namespace
|
||||
/// 2. Case-insensitive exact match
|
||||
/// 3. Suffix match (only if unambiguous)
|
||||
/// 4. Error with available projects list
|
||||
pub fn resolve_project(conn: &Connection, project_str: &str) -> Result<i64> {
|
||||
// Step 1: Exact match
|
||||
let exact = conn.query_row(
|
||||
"SELECT id FROM projects WHERE path_with_namespace = ?1",
|
||||
rusqlite::params![project_str],
|
||||
|row| row.get::<_, i64>(0),
|
||||
);
|
||||
if let Ok(id) = exact {
|
||||
return Ok(id);
|
||||
}
|
||||
|
||||
// Step 2: Case-insensitive exact match
|
||||
let ci = conn.query_row(
|
||||
"SELECT id FROM projects WHERE LOWER(path_with_namespace) = LOWER(?1)",
|
||||
rusqlite::params![project_str],
|
||||
|row| row.get::<_, i64>(0),
|
||||
);
|
||||
if let Ok(id) = ci {
|
||||
return Ok(id);
|
||||
}
|
||||
|
||||
// Step 3: Suffix match (unambiguous)
|
||||
let mut suffix_stmt = conn.prepare(
|
||||
"SELECT id, path_with_namespace FROM projects
|
||||
WHERE path_with_namespace LIKE '%/' || ?1
|
||||
OR path_with_namespace = ?1"
|
||||
)?;
|
||||
let suffix_matches: Vec<(i64, String)> = suffix_stmt
|
||||
.query_map(rusqlite::params![project_str], |row| {
|
||||
Ok((row.get(0)?, row.get(1)?))
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
match suffix_matches.len() {
|
||||
1 => return Ok(suffix_matches[0].0),
|
||||
n if n > 1 => {
|
||||
let matching: Vec<String> = suffix_matches.iter().map(|(_, p)| p.clone()).collect();
|
||||
return Err(LoreError::Other(format!(
|
||||
"Project '{}' is ambiguous. Matching projects:\n{}\n\nHint: Use the full path, e.g., --project={}",
|
||||
project_str,
|
||||
matching.iter().map(|p| format!(" {}", p)).collect::<Vec<_>>().join("\n"),
|
||||
matching[0]
|
||||
)));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Step 4: No match — list available projects
|
||||
let mut all_stmt = conn.prepare(
|
||||
"SELECT path_with_namespace FROM projects ORDER BY path_with_namespace"
|
||||
)?;
|
||||
let all_projects: Vec<String> = all_stmt
|
||||
.query_map([], |row| row.get(0))?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
if all_projects.is_empty() {
|
||||
return Err(LoreError::Other(format!(
|
||||
"Project '{}' not found. No projects have been synced yet.\n\nHint: Run 'lore ingest' first.",
|
||||
project_str
|
||||
)));
|
||||
}
|
||||
|
||||
Err(LoreError::Other(format!(
|
||||
"Project '{}' not found.\n\nAvailable projects:\n{}\n\nHint: Use the full path, e.g., --project={}",
|
||||
project_str,
|
||||
all_projects.iter().map(|p| format!(" {}", p)).collect::<Vec<_>>().join("\n"),
|
||||
all_projects[0]
|
||||
)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn setup_db() -> Connection {
|
||||
let conn = Connection::open_in_memory().unwrap();
|
||||
conn.execute_batch("
|
||||
CREATE TABLE projects (
|
||||
id INTEGER PRIMARY KEY,
|
||||
gitlab_project_id INTEGER UNIQUE NOT NULL,
|
||||
path_with_namespace TEXT NOT NULL,
|
||||
default_branch TEXT,
|
||||
web_url TEXT,
|
||||
created_at INTEGER,
|
||||
updated_at INTEGER,
|
||||
raw_payload_id INTEGER
|
||||
);
|
||||
CREATE INDEX idx_projects_path ON projects(path_with_namespace);
|
||||
").unwrap();
|
||||
conn
|
||||
}
|
||||
|
||||
fn insert_project(conn: &Connection, id: i64, path: &str) {
|
||||
conn.execute(
|
||||
"INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (?1, ?2, ?3)",
|
||||
rusqlite::params![id, id * 100, path],
|
||||
).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exact_match() {
|
||||
let conn = setup_db();
|
||||
insert_project(&conn, 1, "backend/auth-service");
|
||||
let id = resolve_project(&conn, "backend/auth-service").unwrap();
|
||||
assert_eq!(id, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_case_insensitive() {
|
||||
let conn = setup_db();
|
||||
insert_project(&conn, 1, "backend/auth-service");
|
||||
let id = resolve_project(&conn, "Backend/Auth-Service").unwrap();
|
||||
assert_eq!(id, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_suffix_unambiguous() {
|
||||
let conn = setup_db();
|
||||
insert_project(&conn, 1, "backend/auth-service");
|
||||
insert_project(&conn, 2, "frontend/web-ui");
|
||||
let id = resolve_project(&conn, "auth-service").unwrap();
|
||||
assert_eq!(id, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_suffix_ambiguous() {
|
||||
let conn = setup_db();
|
||||
insert_project(&conn, 1, "backend/auth-service");
|
||||
insert_project(&conn, 2, "frontend/auth-service");
|
||||
let err = resolve_project(&conn, "auth-service").unwrap_err();
|
||||
let msg = err.to_string();
|
||||
assert!(msg.contains("ambiguous"), "Expected ambiguous error, got: {}", msg);
|
||||
assert!(msg.contains("backend/auth-service"));
|
||||
assert!(msg.contains("frontend/auth-service"));
|
||||
}
|
||||
|
||||
#[test]
fn test_no_match() {
    let conn = setup_db();
    insert_project(&conn, 1, "backend/auth-service");
    // Unknown name: the error lists the available projects to help the user.
    let err = resolve_project(&conn, "nonexistent").unwrap_err();
    let msg = err.to_string();
    assert!(msg.contains("not found"), "Expected not found error, got: {}", msg);
    assert!(msg.contains("backend/auth-service"));
}
|
||||
|
||||
#[test]
fn test_empty_projects() {
    let conn = setup_db();
    // Empty projects table gets a distinct "nothing synced yet" error,
    // not a generic not-found message.
    let err = resolve_project(&conn, "anything").unwrap_err();
    let msg = err.to_string();
    assert!(msg.contains("No projects have been synced"));
}
|
||||
}
|
||||
1085
src/documents/extractor.rs
Normal file
1085
src/documents/extractor.rs
Normal file
File diff suppressed because it is too large
Load Diff
17
src/documents/mod.rs
Normal file
17
src/documents/mod.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
//! Document generation and management.
|
||||
//!
|
||||
//! Extracts searchable documents from issues, MRs, and discussions.
|
||||
|
||||
mod extractor;
|
||||
mod regenerator;
|
||||
mod truncation;
|
||||
|
||||
pub use extractor::{
|
||||
compute_content_hash, compute_list_hash, extract_discussion_document,
|
||||
extract_issue_document, extract_mr_document, DocumentData, SourceType,
|
||||
};
|
||||
pub use regenerator::{regenerate_dirty_documents, RegenerateResult};
|
||||
pub use truncation::{
|
||||
truncate_discussion, truncate_hard_cap, truncate_utf8, NoteContent, TruncationReason,
|
||||
TruncationResult, MAX_DISCUSSION_BYTES, MAX_DOCUMENT_BYTES_HARD,
|
||||
};
|
||||
475
src/documents/regenerator.rs
Normal file
475
src/documents/regenerator.rs
Normal file
@@ -0,0 +1,475 @@
|
||||
use rusqlite::Connection;
|
||||
use rusqlite::OptionalExtension;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::documents::{
|
||||
extract_discussion_document, extract_issue_document, extract_mr_document, DocumentData,
|
||||
SourceType,
|
||||
};
|
||||
use crate::ingestion::dirty_tracker::{clear_dirty, get_dirty_sources, record_dirty_error};
|
||||
|
||||
/// Result of a document regeneration run.
#[derive(Debug, Default)]
pub struct RegenerateResult {
    /// Documents whose content_hash changed (includes deletions of
    /// documents whose source row disappeared).
    pub regenerated: usize,
    /// Documents reprocessed whose content_hash was unchanged.
    pub unchanged: usize,
    /// Documents whose regeneration failed; they stay queued for retry.
    pub errored: usize,
}
|
||||
|
||||
/// Drain the dirty_sources queue, regenerating documents for each entry.
///
/// Uses per-item error handling (fail-soft) and drains the queue completely
/// via a bounded batch loop. Each dirty item is processed independently.
///
/// NOTE(review): termination relies on `get_dirty_sources` not returning
/// items whose retry backoff has not yet elapsed (`record_dirty_error`
/// presumably sets next_attempt_at); if errored items stayed eligible in
/// the next fetch, this loop would spin — confirm in dirty_tracker.
pub fn regenerate_dirty_documents(conn: &Connection) -> Result<RegenerateResult> {
    let mut result = RegenerateResult::default();

    loop {
        // Fetch the current batch of dirty (source_type, source_id) pairs.
        let dirty = get_dirty_sources(conn)?;
        if dirty.is_empty() {
            break;
        }

        for (source_type, source_id) in &dirty {
            match regenerate_one(conn, *source_type, *source_id) {
                Ok(changed) => {
                    if changed {
                        result.regenerated += 1;
                    } else {
                        result.unchanged += 1;
                    }
                    // Success: remove the item from the queue.
                    clear_dirty(conn, *source_type, *source_id)?;
                }
                Err(e) => {
                    // Fail-soft: log, record the error (for backoff/retry
                    // bookkeeping), and keep processing the rest of the batch.
                    warn!(
                        source_type = %source_type,
                        source_id,
                        error = %e,
                        "Failed to regenerate document"
                    );
                    record_dirty_error(conn, *source_type, *source_id, &e.to_string())?;
                    result.errored += 1;
                }
            }
        }
    }

    debug!(
        regenerated = result.regenerated,
        unchanged = result.unchanged,
        errored = result.errored,
        "Document regeneration complete"
    );

    Ok(result)
}
|
||||
|
||||
/// Regenerate a single document. Returns true if content_hash changed.
|
||||
fn regenerate_one(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
source_id: i64,
|
||||
) -> Result<bool> {
|
||||
let doc = match source_type {
|
||||
SourceType::Issue => extract_issue_document(conn, source_id)?,
|
||||
SourceType::MergeRequest => extract_mr_document(conn, source_id)?,
|
||||
SourceType::Discussion => extract_discussion_document(conn, source_id)?,
|
||||
};
|
||||
|
||||
let Some(doc) = doc else {
|
||||
// Source was deleted — remove the document (cascade handles FTS/embeddings)
|
||||
delete_document(conn, source_type, source_id)?;
|
||||
return Ok(true);
|
||||
};
|
||||
|
||||
let existing_hash = get_existing_hash(conn, source_type, source_id)?;
|
||||
let changed = existing_hash.as_ref() != Some(&doc.content_hash);
|
||||
|
||||
// Always upsert: labels/paths can change independently of content_hash
|
||||
upsert_document(conn, &doc)?;
|
||||
|
||||
Ok(changed)
|
||||
}
|
||||
|
||||
/// Get existing content hash for a document, if it exists.
|
||||
fn get_existing_hash(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
source_id: i64,
|
||||
) -> Result<Option<String>> {
|
||||
let mut stmt =
|
||||
conn.prepare("SELECT content_hash FROM documents WHERE source_type = ?1 AND source_id = ?2")?;
|
||||
|
||||
let hash: Option<String> = stmt
|
||||
.query_row(rusqlite::params![source_type.as_str(), source_id], |row| {
|
||||
row.get(0)
|
||||
})
|
||||
.optional()?;
|
||||
|
||||
Ok(hash)
|
||||
}
|
||||
|
||||
/// Upsert a document with triple-hash write optimization.
|
||||
///
|
||||
/// Wrapped in a SAVEPOINT to ensure atomicity of the multi-statement write
|
||||
/// (document row + labels + paths). Without this, a crash between statements
|
||||
/// could leave the document with a stale labels_hash but missing label rows.
|
||||
fn upsert_document(conn: &Connection, doc: &DocumentData) -> Result<()> {
|
||||
conn.execute_batch("SAVEPOINT upsert_doc")?;
|
||||
match upsert_document_inner(conn, doc) {
|
||||
Ok(()) => {
|
||||
conn.execute_batch("RELEASE upsert_doc")?;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = conn.execute_batch("ROLLBACK TO upsert_doc");
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform the multi-statement document upsert (row + labels + paths).
/// Runs inside the `upsert_doc` savepoint held by `upsert_document`.
fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<()> {
    // Check existing hashes before writing
    let existing: Option<(i64, String, String, String)> = conn
        .query_row(
            "SELECT id, content_hash, labels_hash, paths_hash FROM documents
             WHERE source_type = ?1 AND source_id = ?2",
            rusqlite::params![doc.source_type.as_str(), doc.source_id],
            |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
        )
        .optional()?;

    // Fast path: skip ALL writes when nothing changed (prevents WAL churn)
    if let Some((_, ref old_content_hash, ref old_labels_hash, ref old_paths_hash)) = existing {
        if old_content_hash == &doc.content_hash
            && old_labels_hash == &doc.labels_hash
            && old_paths_hash == &doc.paths_hash
        {
            return Ok(());
        }
    }

    // Denormalized JSON copy of the labels kept on the document row itself;
    // serialization failure degrades to an empty list rather than aborting.
    let labels_json =
        serde_json::to_string(&doc.labels).unwrap_or_else(|_| "[]".to_string());

    // Upsert document row.
    // Note: created_at is deliberately absent from the UPDATE SET list, so
    // the original creation timestamp survives updates.
    conn.execute(
        "INSERT INTO documents
         (source_type, source_id, project_id, author_username, label_names,
          labels_hash, paths_hash,
          created_at, updated_at, url, title, content_text, content_hash,
          is_truncated, truncated_reason)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15)
         ON CONFLICT(source_type, source_id) DO UPDATE SET
           author_username = excluded.author_username,
           label_names = excluded.label_names,
           labels_hash = excluded.labels_hash,
           paths_hash = excluded.paths_hash,
           updated_at = excluded.updated_at,
           url = excluded.url,
           title = excluded.title,
           content_text = excluded.content_text,
           content_hash = excluded.content_hash,
           is_truncated = excluded.is_truncated,
           truncated_reason = excluded.truncated_reason",
        rusqlite::params![
            doc.source_type.as_str(),
            doc.source_id,
            doc.project_id,
            doc.author_username,
            labels_json,
            doc.labels_hash,
            doc.paths_hash,
            doc.created_at,
            doc.updated_at,
            doc.url,
            doc.title,
            doc.content_text,
            doc.content_hash,
            doc.is_truncated as i32,
            doc.truncated_reason,
        ],
    )?;

    // Get document ID: reuse the id from the pre-read when the row existed,
    // otherwise look up the freshly inserted row.
    let doc_id = match existing {
        Some((id, _, _, _)) => id,
        None => get_document_id(conn, doc.source_type, doc.source_id)?,
    };

    // Only update labels if hash changed
    let labels_changed = match &existing {
        Some((_, _, old_hash, _)) => old_hash != &doc.labels_hash,
        None => true,
    };
    if labels_changed {
        // Delete-then-insert rewrite of the normalized label rows.
        conn.execute(
            "DELETE FROM document_labels WHERE document_id = ?1",
            [doc_id],
        )?;
        for label in &doc.labels {
            conn.execute(
                "INSERT INTO document_labels (document_id, label_name) VALUES (?1, ?2)",
                rusqlite::params![doc_id, label],
            )?;
        }
    }

    // Only update paths if hash changed
    let paths_changed = match &existing {
        Some((_, _, _, old_hash)) => old_hash != &doc.paths_hash,
        None => true,
    };
    if paths_changed {
        conn.execute(
            "DELETE FROM document_paths WHERE document_id = ?1",
            [doc_id],
        )?;
        for path in &doc.paths {
            conn.execute(
                "INSERT INTO document_paths (document_id, path) VALUES (?1, ?2)",
                rusqlite::params![doc_id, path],
            )?;
        }
    }

    Ok(())
}
|
||||
|
||||
/// Delete a document by source identity.
|
||||
fn delete_document(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
source_id: i64,
|
||||
) -> Result<()> {
|
||||
conn.execute(
|
||||
"DELETE FROM documents WHERE source_type = ?1 AND source_id = ?2",
|
||||
rusqlite::params![source_type.as_str(), source_id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get document ID by source type and source ID.
|
||||
fn get_document_id(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
source_id: i64,
|
||||
) -> Result<i64> {
|
||||
let id: i64 = conn.query_row(
|
||||
"SELECT id FROM documents WHERE source_type = ?1 AND source_id = ?2",
|
||||
rusqlite::params![source_type.as_str(), source_id],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ingestion::dirty_tracker::mark_dirty;

    /// In-memory DB with the minimal schema the regenerator touches
    /// (projects, issues, labels, documents, document_labels/paths,
    /// dirty_sources), plus one pre-inserted project (id=1).
    fn setup_db() -> Connection {
        let conn = Connection::open_in_memory().unwrap();
        conn.execute_batch("
            CREATE TABLE projects (
                id INTEGER PRIMARY KEY,
                gitlab_project_id INTEGER UNIQUE NOT NULL,
                path_with_namespace TEXT NOT NULL,
                default_branch TEXT,
                web_url TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                raw_payload_id INTEGER
            );
            INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project');

            CREATE TABLE issues (
                id INTEGER PRIMARY KEY,
                gitlab_id INTEGER UNIQUE NOT NULL,
                project_id INTEGER NOT NULL REFERENCES projects(id),
                iid INTEGER NOT NULL,
                title TEXT,
                description TEXT,
                state TEXT NOT NULL,
                author_username TEXT,
                created_at INTEGER NOT NULL,
                updated_at INTEGER NOT NULL,
                last_seen_at INTEGER NOT NULL,
                discussions_synced_for_updated_at INTEGER,
                web_url TEXT,
                raw_payload_id INTEGER
            );
            CREATE TABLE labels (
                id INTEGER PRIMARY KEY,
                gitlab_id INTEGER,
                project_id INTEGER NOT NULL REFERENCES projects(id),
                name TEXT NOT NULL,
                color TEXT,
                description TEXT
            );
            CREATE TABLE issue_labels (
                issue_id INTEGER NOT NULL REFERENCES issues(id),
                label_id INTEGER NOT NULL REFERENCES labels(id),
                PRIMARY KEY(issue_id, label_id)
            );

            CREATE TABLE documents (
                id INTEGER PRIMARY KEY,
                source_type TEXT NOT NULL,
                source_id INTEGER NOT NULL,
                project_id INTEGER NOT NULL,
                author_username TEXT,
                label_names TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                url TEXT,
                title TEXT,
                content_text TEXT NOT NULL,
                content_hash TEXT NOT NULL,
                labels_hash TEXT NOT NULL DEFAULT '',
                paths_hash TEXT NOT NULL DEFAULT '',
                is_truncated INTEGER NOT NULL DEFAULT 0,
                truncated_reason TEXT,
                UNIQUE(source_type, source_id)
            );
            CREATE TABLE document_labels (
                document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
                label_name TEXT NOT NULL,
                PRIMARY KEY(document_id, label_name)
            );
            CREATE TABLE document_paths (
                document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
                path TEXT NOT NULL,
                PRIMARY KEY(document_id, path)
            );
            CREATE TABLE dirty_sources (
                source_type TEXT NOT NULL,
                source_id INTEGER NOT NULL,
                queued_at INTEGER NOT NULL,
                attempt_count INTEGER NOT NULL DEFAULT 0,
                last_attempt_at INTEGER,
                last_error TEXT,
                next_attempt_at INTEGER,
                PRIMARY KEY(source_type, source_id)
            );
            CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
        ").unwrap();
        conn
    }

    #[test]
    fn test_regenerate_creates_document() {
        let conn = setup_db();
        conn.execute(
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test Issue', 'Description here', 'opened', 'alice', 1000, 2000, 3000)",
            [],
        ).unwrap();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();

        // A dirty issue with no prior document counts as "regenerated".
        let result = regenerate_dirty_documents(&conn).unwrap();
        assert_eq!(result.regenerated, 1);
        assert_eq!(result.unchanged, 0);
        assert_eq!(result.errored, 0);

        // Verify document was created
        let count: i64 = conn.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0)).unwrap();
        assert_eq!(count, 1);

        let content: String = conn.query_row("SELECT content_text FROM documents", [], |r| r.get(0)).unwrap();
        assert!(content.contains("[[Issue]] #42: Test Issue"));
    }

    #[test]
    fn test_regenerate_unchanged() {
        let conn = setup_db();
        conn.execute(
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'Desc', 'opened', 'alice', 1000, 2000, 3000)",
            [],
        ).unwrap();

        // First regeneration creates the document
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        let r1 = regenerate_dirty_documents(&conn).unwrap();
        assert_eq!(r1.regenerated, 1);

        // Second regeneration — same data, should be unchanged
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        let r2 = regenerate_dirty_documents(&conn).unwrap();
        assert_eq!(r2.unchanged, 1);
        assert_eq!(r2.regenerated, 0);
    }

    #[test]
    fn test_regenerate_deleted_source() {
        let conn = setup_db();
        conn.execute(
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
            [],
        ).unwrap();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        regenerate_dirty_documents(&conn).unwrap();

        // Delete the issue and re-mark dirty
        // (FK enforcement toggled off so the bare DELETE succeeds in the test schema)
        conn.execute("PRAGMA foreign_keys = OFF", []).unwrap();
        conn.execute("DELETE FROM issues WHERE id = 1", []).unwrap();
        conn.execute("PRAGMA foreign_keys = ON", []).unwrap();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();

        let result = regenerate_dirty_documents(&conn).unwrap();
        assert_eq!(result.regenerated, 1); // Deletion counts as "changed"

        let count: i64 = conn.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0)).unwrap();
        assert_eq!(count, 0);
    }

    #[test]
    fn test_regenerate_drains_queue() {
        let conn = setup_db();
        for i in 1..=10 {
            conn.execute(
                "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (?1, ?2, 1, ?1, 'Test', 'opened', 1000, 2000, 3000)",
                rusqlite::params![i, i * 10],
            ).unwrap();
            mark_dirty(&conn, SourceType::Issue, i).unwrap();
        }

        let result = regenerate_dirty_documents(&conn).unwrap();
        assert_eq!(result.regenerated, 10);

        // Queue should be empty
        let dirty = get_dirty_sources(&conn).unwrap();
        assert!(dirty.is_empty());
    }

    #[test]
    fn test_triple_hash_fast_path() {
        let conn = setup_db();
        conn.execute(
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
            [],
        ).unwrap();
        conn.execute(
            "INSERT INTO labels (id, project_id, name) VALUES (1, 1, 'bug')",
            [],
        ).unwrap();
        conn.execute(
            "INSERT INTO issue_labels (issue_id, label_id) VALUES (1, 1)",
            [],
        ).unwrap();

        // First run creates document
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        regenerate_dirty_documents(&conn).unwrap();

        // Second run — triple hash match, should skip ALL writes
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        let result = regenerate_dirty_documents(&conn).unwrap();
        assert_eq!(result.unchanged, 1);

        // Labels should still be present (not deleted and re-inserted)
        let label_count: i64 = conn.query_row(
            "SELECT COUNT(*) FROM document_labels", [], |r| r.get(0),
        ).unwrap();
        assert_eq!(label_count, 1);
    }
}
|
||||
329
src/documents/truncation.rs
Normal file
329
src/documents/truncation.rs
Normal file
@@ -0,0 +1,329 @@
|
||||
/// Maximum byte limit for discussion documents (suitable for embedding chunking).
/// Note: uses `.len()` (byte count), not char count — consistent with `CHUNK_MAX_BYTES`.
pub const MAX_DISCUSSION_BYTES: usize = 32_000;

/// Hard safety cap (bytes) for any document type (pathological content: pasted logs, base64).
/// Truncation against this cap reserves 11 bytes for the "[truncated]" marker.
pub const MAX_DOCUMENT_BYTES_HARD: usize = 2_000_000;
|
||||
|
||||
/// A single note's content for truncation processing.
pub struct NoteContent {
    /// Note author's username (rendered as `@author`).
    pub author: String,
    /// Display date string, rendered verbatim in the note header.
    pub date: String,
    /// The note's body text.
    pub body: String,
}
|
||||
|
||||
/// Result of truncation processing.
pub struct TruncationResult {
    /// The (possibly truncated) formatted text.
    pub content: String,
    /// True when any content was dropped or cut.
    pub is_truncated: bool,
    /// Which truncation strategy fired; None when content fit as-is.
    pub reason: Option<TruncationReason>,
}
|
||||
|
||||
/// Why a document was truncated (matches DB CHECK constraint values).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncationReason {
    TokenLimitMiddleDrop,
    SingleNoteOversized,
    FirstLastOversized,
    HardCapOversized,
}

impl TruncationReason {
    /// Returns the DB-compatible string matching the CHECK constraint.
    pub fn as_str(&self) -> &'static str {
        match *self {
            TruncationReason::TokenLimitMiddleDrop => "token_limit_middle_drop",
            TruncationReason::SingleNoteOversized => "single_note_oversized",
            TruncationReason::FirstLastOversized => "first_last_oversized",
            TruncationReason::HardCapOversized => "hard_cap_oversized",
        }
    }
}
|
||||
|
||||
/// Format a single note as `@author (date):\nbody\n\n`.
|
||||
fn format_note(note: &NoteContent) -> String {
|
||||
format!("@{} ({}):\n{}\n\n", note.author, note.date, note.body)
|
||||
}
|
||||
|
||||
/// Truncate a string at a UTF-8-safe byte boundary.
/// Returns a slice no longer than `max_bytes` bytes, walking backward
/// to find the nearest char boundary if needed.
pub fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Largest index <= max_bytes that lands on a char boundary.
    // Index 0 is always a boundary, so the search cannot fail.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..cut]
}
|
||||
|
||||
/// Truncate discussion notes to fit within `max_bytes`.
|
||||
///
|
||||
/// Algorithm:
|
||||
/// 1. Format all notes
|
||||
/// 2. If total fits, return as-is
|
||||
/// 3. Single note: truncate at UTF-8 boundary, append [truncated]
|
||||
/// 4. Try to keep first N notes + last note + marker within limit
|
||||
/// 5. If first + last > limit: keep only first (truncated)
|
||||
pub fn truncate_discussion(notes: &[NoteContent], max_bytes: usize) -> TruncationResult {
|
||||
if notes.is_empty() {
|
||||
return TruncationResult {
|
||||
content: String::new(),
|
||||
is_truncated: false,
|
||||
reason: None,
|
||||
};
|
||||
}
|
||||
|
||||
let formatted: Vec<String> = notes.iter().map(format_note).collect();
|
||||
let total: String = formatted.concat();
|
||||
|
||||
// Case 1: fits within limit
|
||||
if total.len() <= max_bytes {
|
||||
return TruncationResult {
|
||||
content: total,
|
||||
is_truncated: false,
|
||||
reason: None,
|
||||
};
|
||||
}
|
||||
|
||||
// Case 2: single note — truncate it
|
||||
if notes.len() == 1 {
|
||||
let truncated = truncate_utf8(&total, max_bytes.saturating_sub(11)); // room for [truncated]
|
||||
let content = format!("{}[truncated]", truncated);
|
||||
return TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::SingleNoteOversized),
|
||||
};
|
||||
}
|
||||
|
||||
// Case 3: multiple notes — try first N + marker + last
|
||||
let last_note = &formatted[formatted.len() - 1];
|
||||
|
||||
// Binary search for max N where first N notes + marker + last note fit
|
||||
let mut best_n = 0;
|
||||
for n in 1..formatted.len() - 1 {
|
||||
let first_n: usize = formatted[..n].iter().map(|s| s.len()).sum();
|
||||
let omitted = formatted.len() - n - 1;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let candidate_len = first_n + marker.len() + last_note.len();
|
||||
if candidate_len <= max_bytes {
|
||||
best_n = n;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if best_n > 0 {
|
||||
// We can keep first best_n notes + marker + last note
|
||||
let first_part: String = formatted[..best_n].concat();
|
||||
let omitted = formatted.len() - best_n - 1;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let content = format!("{}{}{}", first_part, marker, last_note);
|
||||
return TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::TokenLimitMiddleDrop),
|
||||
};
|
||||
}
|
||||
|
||||
// Case 4: even first + last don't fit — keep only first (truncated)
|
||||
let first_note = &formatted[0];
|
||||
if first_note.len() + last_note.len() > max_bytes {
|
||||
let truncated = truncate_utf8(first_note, max_bytes.saturating_sub(11));
|
||||
let content = format!("{}[truncated]", truncated);
|
||||
return TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::FirstLastOversized),
|
||||
};
|
||||
}
|
||||
|
||||
// Fallback: first + marker + last (0 middle notes kept)
|
||||
let omitted = formatted.len() - 2;
|
||||
let marker = format!("\n\n[... {} notes omitted for length ...]\n\n", omitted);
|
||||
let content = format!("{}{}{}", formatted[0], marker, last_note);
|
||||
TruncationResult {
|
||||
content,
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::TokenLimitMiddleDrop),
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply hard cap truncation to any document type.
|
||||
/// Truncates at UTF-8-safe boundary if content exceeds 2MB.
|
||||
pub fn truncate_hard_cap(content: &str) -> TruncationResult {
|
||||
if content.len() <= MAX_DOCUMENT_BYTES_HARD {
|
||||
return TruncationResult {
|
||||
content: content.to_string(),
|
||||
is_truncated: false,
|
||||
reason: None,
|
||||
};
|
||||
}
|
||||
|
||||
let truncated = truncate_utf8(content, MAX_DOCUMENT_BYTES_HARD.saturating_sub(11));
|
||||
TruncationResult {
|
||||
content: format!("{}[truncated]", truncated),
|
||||
is_truncated: true,
|
||||
reason: Some(TruncationReason::HardCapOversized),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: note with a fixed date so formatted lengths are deterministic.
    fn make_note(author: &str, body: &str) -> NoteContent {
        NoteContent {
            author: author.to_string(),
            date: "2024-01-01".to_string(),
            body: body.to_string(),
        }
    }

    #[test]
    fn test_no_truncation_under_limit() {
        let notes = vec![
            make_note("alice", "Short note 1"),
            make_note("bob", "Short note 2"),
            make_note("carol", "Short note 3"),
        ];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.reason.is_none());
        assert!(result.content.contains("@alice"));
        assert!(result.content.contains("@bob"));
        assert!(result.content.contains("@carol"));
    }

    #[test]
    fn test_middle_notes_dropped() {
        // Create 10 notes where total exceeds limit
        let big_body = "x".repeat(4000);
        let notes: Vec<NoteContent> = (0..10)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::TokenLimitMiddleDrop));
        // First note preserved
        assert!(result.content.contains("@user0"));
        // Last note preserved
        assert!(result.content.contains("@user9"));
        // Marker present
        assert!(result.content.contains("notes omitted for length"));
    }

    #[test]
    fn test_single_note_oversized() {
        let big_body = "x".repeat(50_000);
        let notes = vec![make_note("alice", &big_body)];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::SingleNoteOversized));
        assert!(result.content.ends_with("[truncated]"));
        // Slack of 20 bytes covers the appended marker.
        assert!(result.content.len() <= MAX_DISCUSSION_BYTES + 20);
    }

    #[test]
    fn test_first_last_oversized() {
        // Two notes that individually dwarf the limit: only a truncated
        // first note survives.
        let big_body = "x".repeat(20_000);
        let notes = vec![
            make_note("alice", &big_body),
            make_note("bob", &big_body),
        ];
        let result = truncate_discussion(&notes, 10_000);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::FirstLastOversized));
        assert!(result.content.contains("@alice"));
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_one_note_under_limit() {
        let notes = vec![make_note("alice", "Short note")];
        let result = truncate_discussion(&notes, MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.contains("@alice"));
    }

    #[test]
    fn test_empty_notes() {
        let result = truncate_discussion(&[], MAX_DISCUSSION_BYTES);
        assert!(!result.is_truncated);
        assert!(result.content.is_empty());
    }

    #[test]
    fn test_utf8_boundary_safety() {
        // Emoji are 4 bytes each
        let emoji_content = "🎉".repeat(10);
        let truncated = truncate_utf8(&emoji_content, 10);
        // 10 bytes should hold 2 emoji (8 bytes) with 2 bytes left over (not enough for another)
        assert_eq!(truncated.len(), 8);
        assert_eq!(truncated, "🎉🎉");
    }

    #[test]
    fn test_utf8_boundary_cjk() {
        // CJK characters are 3 bytes each
        let cjk = "中文字符测试";
        let truncated = truncate_utf8(cjk, 7);
        // 7 bytes: 2 full chars (6 bytes), 1 byte left (not enough for another)
        assert_eq!(truncated, "中文");
        assert_eq!(truncated.len(), 6);
    }

    #[test]
    fn test_hard_cap() {
        let big_content = "x".repeat(3_000_000);
        let result = truncate_hard_cap(&big_content);
        assert!(result.is_truncated);
        assert_eq!(result.reason, Some(TruncationReason::HardCapOversized));
        assert!(result.content.len() <= MAX_DOCUMENT_BYTES_HARD + 20);
        assert!(result.content.ends_with("[truncated]"));
    }

    #[test]
    fn test_hard_cap_under_limit() {
        let content = "Short content";
        let result = truncate_hard_cap(content);
        assert!(!result.is_truncated);
        assert_eq!(result.content, content);
    }

    #[test]
    fn test_marker_count_correct() {
        // 7 notes, keep first 1 + last 1, drop middle 5
        let big_body = "x".repeat(5000);
        let notes: Vec<NoteContent> = (0..7)
            .map(|i| make_note(&format!("user{}", i), &big_body))
            .collect();
        let result = truncate_discussion(&notes, 12_000);
        assert!(result.is_truncated);
        assert!(result.content.contains("[... 5 notes omitted for length ...]"));
    }

    #[test]
    fn test_truncation_reason_as_str() {
        // Strings must match the documents.truncated_reason CHECK constraint.
        assert_eq!(
            TruncationReason::TokenLimitMiddleDrop.as_str(),
            "token_limit_middle_drop"
        );
        assert_eq!(
            TruncationReason::SingleNoteOversized.as_str(),
            "single_note_oversized"
        );
        assert_eq!(
            TruncationReason::FirstLastOversized.as_str(),
            "first_last_oversized"
        );
        assert_eq!(
            TruncationReason::HardCapOversized.as_str(),
            "hard_cap_oversized"
        );
    }
}
|
||||
79
src/embedding/change_detector.rs
Normal file
79
src/embedding/change_detector.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
//! Detect documents needing (re-)embedding based on content hash changes.
|
||||
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
|
||||
/// A document that needs embedding or re-embedding.
#[derive(Debug)]
pub struct PendingDocument {
    /// `documents.id` of the pending row.
    pub document_id: i64,
    /// Full document text to be chunked and embedded.
    pub content_text: String,
    /// Current `documents.content_hash`; compared against
    /// `embedding_metadata.document_hash` to detect staleness.
    pub content_hash: String,
}
|
||||
|
||||
/// Find documents that need embedding: new (no metadata) or changed (hash mismatch).
|
||||
///
|
||||
/// Uses keyset pagination (WHERE d.id > last_id) and returns up to `page_size` results.
|
||||
pub fn find_pending_documents(
|
||||
conn: &Connection,
|
||||
page_size: usize,
|
||||
last_id: i64,
|
||||
) -> Result<Vec<PendingDocument>> {
|
||||
// Documents that either:
|
||||
// 1. Have no embedding_metadata at all (new)
|
||||
// 2. Have metadata where document_hash != content_hash (changed)
|
||||
let sql = r#"
|
||||
SELECT d.id, d.content_text, d.content_hash
|
||||
FROM documents d
|
||||
WHERE d.id > ?1
|
||||
AND (
|
||||
NOT EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
)
|
||||
OR EXISTS (
|
||||
SELECT 1 FROM embedding_metadata em
|
||||
WHERE em.document_id = d.id AND em.chunk_index = 0
|
||||
AND em.document_hash != d.content_hash
|
||||
)
|
||||
)
|
||||
ORDER BY d.id
|
||||
LIMIT ?2
|
||||
"#;
|
||||
|
||||
let mut stmt = conn.prepare(sql)?;
|
||||
let rows = stmt
|
||||
.query_map(rusqlite::params![last_id, page_size as i64], |row| {
|
||||
Ok(PendingDocument {
|
||||
document_id: row.get(0)?,
|
||||
content_text: row.get(1)?,
|
||||
content_hash: row.get(2)?,
|
||||
})
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
/// Count total documents that need embedding.
///
/// Mirrors the pending predicate in `find_pending_documents`: a document is
/// pending when it has no chunk-0 embedding_metadata row, or its recorded
/// document_hash no longer matches the current content_hash.
pub fn count_pending_documents(conn: &Connection) -> Result<i64> {
    let count: i64 = conn.query_row(
        r#"
        SELECT COUNT(*)
        FROM documents d
        WHERE NOT EXISTS (
            SELECT 1 FROM embedding_metadata em
            WHERE em.document_id = d.id AND em.chunk_index = 0
        )
        OR EXISTS (
            SELECT 1 FROM embedding_metadata em
            WHERE em.document_id = d.id AND em.chunk_index = 0
            AND em.document_hash != d.content_hash
        )
        "#,
        [],
        |row| row.get(0),
    )?;
    Ok(count)
}
|
||||
63
src/embedding/chunk_ids.rs
Normal file
63
src/embedding/chunk_ids.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
/// Multiplier for encoding (document_id, chunk_index) into a single rowid.
/// Supports up to 1000 chunks per document (32M chars at 32k/chunk).
pub const CHUNK_ROWID_MULTIPLIER: i64 = 1000;

/// Encode (document_id, chunk_index) into a sqlite-vec rowid.
///
/// rowid = document_id * CHUNK_ROWID_MULTIPLIER + chunk_index
///
/// Debug builds assert `0 <= chunk_index < CHUNK_ROWID_MULTIPLIER`: an
/// out-of-range index would silently collide with a neighboring document's
/// rowid range and corrupt the vector index.
pub fn encode_rowid(document_id: i64, chunk_index: i64) -> i64 {
    debug_assert!(
        (0..CHUNK_ROWID_MULTIPLIER).contains(&chunk_index),
        "chunk_index {chunk_index} out of range for rowid encoding"
    );
    document_id * CHUNK_ROWID_MULTIPLIER + chunk_index
}

/// Decode a sqlite-vec rowid back into (document_id, chunk_index).
pub fn decode_rowid(rowid: i64) -> (i64, i64) {
    (rowid / CHUNK_ROWID_MULTIPLIER, rowid % CHUNK_ROWID_MULTIPLIER)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    // Unit tests for the rowid encode/decode scheme; the roundtrip test is
    // the key invariant (decode(encode(d, c)) == (d, c) for c < 1000).
    use super::*;

    #[test]
    fn test_encode_single_chunk() {
        assert_eq!(encode_rowid(1, 0), 1000);
    }

    #[test]
    fn test_encode_multi_chunk() {
        assert_eq!(encode_rowid(1, 5), 1005);
    }

    #[test]
    fn test_encode_specific_values() {
        assert_eq!(encode_rowid(42, 0), 42000);
        assert_eq!(encode_rowid(42, 5), 42005);
    }

    #[test]
    fn test_decode_zero_chunk() {
        assert_eq!(decode_rowid(42000), (42, 0));
    }

    #[test]
    fn test_decode_roundtrip() {
        for doc_id in [0, 1, 42, 100, 999, 10000] {
            for chunk_idx in [0, 1, 5, 99, 999] {
                let rowid = encode_rowid(doc_id, chunk_idx);
                let (decoded_doc, decoded_chunk) = decode_rowid(rowid);
                assert_eq!(
                    (decoded_doc, decoded_chunk),
                    (doc_id, chunk_idx),
                    "Roundtrip failed for doc_id={doc_id}, chunk_idx={chunk_idx}"
                );
            }
        }
    }

    #[test]
    fn test_multiplier_value() {
        assert_eq!(CHUNK_ROWID_MULTIPLIER, 1000);
    }
}
|
||||
207
src/embedding/chunking.rs
Normal file
207
src/embedding/chunking.rs
Normal file
@@ -0,0 +1,207 @@
|
||||
//! Text chunking for embedding: split documents at paragraph boundaries with overlap.

/// Maximum bytes per chunk.
/// Named `_BYTES` because `str::len()` returns byte count; multi-byte UTF-8
/// sequences mean byte length ≥ char count.
pub const CHUNK_MAX_BYTES: usize = 32_000;

/// Character overlap between adjacent chunks.
/// NOTE(review): despite the name, `split_into_chunks` subtracts this from a
/// *byte* offset when computing the overlap — confirm whether char-accurate
/// overlap is required.
pub const CHUNK_OVERLAP_CHARS: usize = 500;
|
||||
|
||||
/// Split document content into chunks suitable for embedding.
|
||||
///
|
||||
/// Documents <= CHUNK_MAX_BYTES produce a single chunk.
|
||||
/// Longer documents are split at paragraph boundaries (`\n\n`), falling back
|
||||
/// to sentence boundaries, then word boundaries, then hard character cut.
|
||||
/// Adjacent chunks share CHUNK_OVERLAP_CHARS of overlap.
|
||||
///
|
||||
/// Returns Vec<(chunk_index, chunk_text)>.
|
||||
pub fn split_into_chunks(content: &str) -> Vec<(usize, String)> {
|
||||
if content.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
if content.len() <= CHUNK_MAX_BYTES {
|
||||
return vec![(0, content.to_string())];
|
||||
}
|
||||
|
||||
let mut chunks: Vec<(usize, String)> = Vec::new();
|
||||
let mut start = 0;
|
||||
let mut chunk_index = 0;
|
||||
|
||||
while start < content.len() {
|
||||
let remaining = &content[start..];
|
||||
if remaining.len() <= CHUNK_MAX_BYTES {
|
||||
chunks.push((chunk_index, remaining.to_string()));
|
||||
break;
|
||||
}
|
||||
|
||||
// Find a split point within CHUNK_MAX_BYTES (char-boundary-safe)
|
||||
let end = floor_char_boundary(content, start + CHUNK_MAX_BYTES);
|
||||
let window = &content[start..end];
|
||||
|
||||
// Try paragraph boundary (\n\n) — search backward from end
|
||||
let split_at = find_paragraph_break(window)
|
||||
.or_else(|| find_sentence_break(window))
|
||||
.or_else(|| find_word_break(window))
|
||||
.unwrap_or(window.len());
|
||||
|
||||
let chunk_text = &content[start..start + split_at];
|
||||
chunks.push((chunk_index, chunk_text.to_string()));
|
||||
|
||||
// Advance with overlap, guaranteeing forward progress to prevent infinite loops.
|
||||
// If split_at <= CHUNK_OVERLAP_CHARS we skip overlap to avoid stalling.
|
||||
// The .max(1) ensures we always advance at least 1 byte.
|
||||
let advance = if split_at > CHUNK_OVERLAP_CHARS {
|
||||
split_at - CHUNK_OVERLAP_CHARS
|
||||
} else {
|
||||
split_at
|
||||
}
|
||||
.max(1);
|
||||
start += advance;
|
||||
chunk_index += 1;
|
||||
}
|
||||
|
||||
chunks
|
||||
}
|
||||
|
||||
/// Find the last paragraph break (`\n\n`) in the window, preferring the
/// last third for balanced chunks.
///
/// Returns the byte offset just past the break, or `None` when the window
/// contains no paragraph break.
fn find_paragraph_break(window: &str) -> Option<usize> {
    // Search backward from 2/3 of the way through to find a good split.
    // BUGFIX: `len * 2 / 3` is a raw byte index and can fall inside a
    // multi-byte UTF-8 sequence; slicing there panics. Back off to the
    // nearest char boundary first.
    let mut search_start = window.len() * 2 / 3;
    while search_start > 0 && !window.is_char_boundary(search_start) {
        search_start -= 1;
    }
    window[search_start..]
        .rfind("\n\n")
        .map(|pos| search_start + pos + 2)
        .or_else(|| window[..search_start].rfind("\n\n").map(|pos| pos + 2))
}
|
||||
|
||||
/// Find the last sentence boundary (`. `, `? `, `! `) in the window.
///
/// Searches the second half first, then the first half; returns the byte
/// offset just past the boundary, or `None` if none is found.
fn find_sentence_break(window: &str) -> Option<usize> {
    // BUGFIX: `len / 2` is a raw byte index and may not be a char boundary;
    // back off so the slices below cannot panic on multi-byte UTF-8 content.
    let mut search_start = window.len() / 2;
    while search_start > 0 && !window.is_char_boundary(search_start) {
        search_start -= 1;
    }
    for pat in &[". ", "? ", "! "] {
        if let Some(pos) = window[search_start..].rfind(pat) {
            return Some(search_start + pos + pat.len());
        }
    }
    // Try first half
    for pat in &[". ", "? ", "! "] {
        if let Some(pos) = window[..search_start].rfind(pat) {
            return Some(pos + pat.len());
        }
    }
    None
}
|
||||
|
||||
/// Find the last word boundary (space) in the window.
///
/// Returns the byte offset just past the space, or `None` when the window
/// contains no space.
fn find_word_break(window: &str) -> Option<usize> {
    // BUGFIX: `len / 2` is a raw byte index and may not be a char boundary;
    // back off so the slices below cannot panic on multi-byte UTF-8 content.
    let mut search_start = window.len() / 2;
    while search_start > 0 && !window.is_char_boundary(search_start) {
        search_start -= 1;
    }
    window[search_start..]
        .rfind(' ')
        .map(|pos| search_start + pos + 1)
        .or_else(|| window[..search_start].rfind(' ').map(|pos| pos + 1))
}
|
||||
|
||||
/// Find the largest byte index <= `idx` that is a valid char boundary in `s`.
/// Equivalent to `str::floor_char_boundary` (stabilized in Rust 1.82).
///
/// Indices past the end clamp to `s.len()`; index 0 is always a boundary,
/// so the search below cannot fail.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        s.len()
    } else {
        (0..=idx)
            .rev()
            .find(|&i| s.is_char_boundary(i))
            .unwrap_or(0)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    // Tests cover: empty input, single-chunk fast path, multi-chunk splitting,
    // overlap between adjacent chunks, word-boundary fallback, and sequential
    // chunk indices. All fixtures are ASCII; multi-byte input is not covered here.
    use super::*;

    #[test]
    fn test_empty_content() {
        let chunks = split_into_chunks("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_short_document_single_chunk() {
        let content = "Short document content.";
        let chunks = split_into_chunks(content);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].0, 0);
        assert_eq!(chunks[0].1, content);
    }

    #[test]
    fn test_exactly_max_chars() {
        let content = "a".repeat(CHUNK_MAX_BYTES);
        let chunks = split_into_chunks(&content);
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_long_document_multiple_chunks() {
        // Create content > CHUNK_MAX_BYTES with paragraph boundaries
        let paragraph = "This is a paragraph of text.\n\n";
        let mut content = String::new();
        while content.len() < CHUNK_MAX_BYTES * 2 {
            content.push_str(paragraph);
        }

        let chunks = split_into_chunks(&content);
        assert!(chunks.len() >= 2, "Expected multiple chunks, got {}", chunks.len());

        // Verify indices are sequential
        for (i, (idx, _)) in chunks.iter().enumerate() {
            assert_eq!(*idx, i);
        }

        // Verify all content is covered (no gaps)
        assert!(!chunks.last().unwrap().1.is_empty());
    }

    #[test]
    fn test_chunk_overlap() {
        // Create content that will produce 2+ chunks
        let paragraph = "This is paragraph content for testing chunk overlap behavior.\n\n";
        let mut content = String::new();
        while content.len() < CHUNK_MAX_BYTES + CHUNK_OVERLAP_CHARS + 1000 {
            content.push_str(paragraph);
        }

        let chunks = split_into_chunks(&content);
        assert!(chunks.len() >= 2);

        // Check that adjacent chunks share some content (overlap)
        if chunks.len() >= 2 {
            let end_of_first = &chunks[0].1;
            let start_of_second = &chunks[1].1;
            // The end of first chunk should overlap with start of second
            let overlap_region = &end_of_first[end_of_first.len().saturating_sub(CHUNK_OVERLAP_CHARS)..];
            assert!(
                start_of_second.starts_with(overlap_region)
                    || overlap_region.contains(&start_of_second[..100.min(start_of_second.len())]),
                "Expected overlap between chunks"
            );
        }
    }

    #[test]
    fn test_no_paragraph_boundary() {
        // Create content without paragraph breaks
        let content = "word ".repeat(CHUNK_MAX_BYTES / 5 * 3);
        let chunks = split_into_chunks(&content);
        assert!(chunks.len() >= 2);
        // Should still split (at word boundaries)
        for (_, chunk) in &chunks {
            assert!(!chunk.is_empty());
        }
    }

    #[test]
    fn test_chunk_indices_sequential() {
        let content = "a ".repeat(CHUNK_MAX_BYTES);
        let chunks = split_into_chunks(&content);
        for (i, (idx, _)) in chunks.iter().enumerate() {
            assert_eq!(*idx, i, "Chunk index mismatch at position {}", i);
        }
    }
}
|
||||
9
src/embedding/mod.rs
Normal file
9
src/embedding/mod.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
//! Embedding subsystem: change detection, text chunking, the Ollama HTTP
//! client, and the async pipeline that ties them together.

pub mod change_detector;
pub mod chunk_ids;
pub mod chunking;
pub mod ollama;
pub mod pipeline;

// Re-export the surface most callers need, so users can write
// `crate::embedding::embed_documents` etc. without the submodule path.
pub use change_detector::{count_pending_documents, find_pending_documents, PendingDocument};
pub use chunking::{split_into_chunks, CHUNK_MAX_BYTES, CHUNK_OVERLAP_CHARS};
pub use pipeline::{embed_documents, EmbedResult};
|
||||
201
src/embedding/ollama.rs
Normal file
201
src/embedding/ollama.rs
Normal file
@@ -0,0 +1,201 @@
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::core::error::{LoreError, Result};
|
||||
|
||||
/// Configuration for Ollama embedding service.
pub struct OllamaConfig {
    // Base URL of the Ollama server, e.g. "http://localhost:11434".
    pub base_url: String,
    // Embedding model name; matched by prefix against installed tags.
    pub model: String,
    // Per-request timeout applied to the HTTP client.
    pub timeout_secs: u64,
}
|
||||
|
||||
impl Default for OllamaConfig {
    /// Defaults: local Ollama on its standard port, `nomic-embed-text`,
    /// 60-second request timeout.
    fn default() -> Self {
        Self {
            base_url: "http://localhost:11434".to_string(),
            model: "nomic-embed-text".to_string(),
            timeout_secs: 60,
        }
    }
}
|
||||
|
||||
/// Async client for Ollama embedding API.
pub struct OllamaClient {
    // reqwest client configured with the request timeout from `config`.
    client: Client,
    config: OllamaConfig,
}
|
||||
|
||||
/// Request body for Ollama's `/api/embed` endpoint (batch input form).
#[derive(Serialize)]
struct EmbedRequest {
    model: String,
    input: Vec<String>,
}
|
||||
|
||||
/// Response body from `/api/embed`: one vector per input text, in order.
#[derive(Deserialize)]
struct EmbedResponse {
    #[allow(dead_code)]
    model: String,
    embeddings: Vec<Vec<f32>>,
}
|
||||
|
||||
/// Response body from `/api/tags` (list of installed models).
#[derive(Deserialize)]
struct TagsResponse {
    models: Vec<ModelInfo>,
}

/// One installed model entry; only the name is needed for prefix matching.
#[derive(Deserialize)]
struct ModelInfo {
    name: String,
}
|
||||
|
||||
impl OllamaClient {
    /// Create a client from `config`, applying `timeout_secs` as the
    /// per-request timeout.
    ///
    /// # Panics
    /// Panics if the underlying reqwest client cannot be built.
    pub fn new(config: OllamaConfig) -> Self {
        let client = Client::builder()
            .timeout(Duration::from_secs(config.timeout_secs))
            .build()
            .expect("Failed to create HTTP client");

        Self { client, config }
    }

    /// Health check: verifies Ollama is reachable and the configured model exists.
    ///
    /// Model matching uses `starts_with` so "nomic-embed-text" matches
    /// "nomic-embed-text:latest".
    ///
    /// # Errors
    /// - `OllamaUnavailable` when the request fails or the body cannot be
    ///   parsed as a tag list.
    /// - `OllamaModelNotFound` when no installed model name starts with the
    ///   configured model name.
    pub async fn health_check(&self) -> Result<()> {
        let url = format!("{}/api/tags", self.config.base_url);

        let response = self
            .client
            .get(&url)
            .send()
            .await
            .map_err(|e| LoreError::OllamaUnavailable {
                base_url: self.config.base_url.clone(),
                source: Some(e),
            })?;

        // NOTE(review): the HTTP status is not checked here; a non-2xx
        // response surfaces indirectly as a JSON parse failure below.
        let tags: TagsResponse =
            response
                .json()
                .await
                .map_err(|e| LoreError::OllamaUnavailable {
                    base_url: self.config.base_url.clone(),
                    source: Some(e),
                })?;

        let model_found = tags
            .models
            .iter()
            .any(|m| m.name.starts_with(&self.config.model));

        if !model_found {
            return Err(LoreError::OllamaModelNotFound {
                model: self.config.model.clone(),
            });
        }

        Ok(())
    }

    /// Embed a batch of texts using the configured model.
    ///
    /// Returns one embedding vector per input text.
    ///
    /// # Errors
    /// - `OllamaUnavailable` when the request cannot be sent.
    /// - `EmbeddingFailed` on a non-success HTTP status or an unparsable
    ///   response. `document_id` is 0 in those errors because this layer
    ///   does not know which documents the texts belong to.
    pub async fn embed_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
        let url = format!("{}/api/embed", self.config.base_url);

        let request = EmbedRequest {
            model: self.config.model.clone(),
            input: texts,
        };

        let response = self.client.post(&url).json(&request).send().await.map_err(
            |e| LoreError::OllamaUnavailable {
                base_url: self.config.base_url.clone(),
                source: Some(e),
            },
        )?;

        let status = response.status();
        if !status.is_success() {
            // Include the body in the error to aid debugging model issues.
            let body = response.text().await.unwrap_or_default();
            return Err(LoreError::EmbeddingFailed {
                document_id: 0,
                reason: format!("HTTP {}: {}", status, body),
            });
        }

        let embed_response: EmbedResponse =
            response
                .json()
                .await
                .map_err(|e| LoreError::EmbeddingFailed {
                    document_id: 0,
                    reason: format!("Failed to parse embed response: {}", e),
                })?;

        Ok(embed_response.embeddings)
    }
}
|
||||
|
||||
/// Quick health check without creating a full client.
|
||||
pub async fn check_ollama_health(base_url: &str) -> bool {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(5))
|
||||
.build()
|
||||
.ok();
|
||||
|
||||
let Some(client) = client else {
|
||||
return false;
|
||||
};
|
||||
|
||||
let url = format!("{base_url}/api/tags");
|
||||
client.get(&url).send().await.is_ok()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    // Network-free tests: config defaults, the prefix-matching rule used by
    // health_check, and serde round-trips for the /api/embed payloads.
    use super::*;

    #[test]
    fn test_config_defaults() {
        let config = OllamaConfig::default();
        assert_eq!(config.base_url, "http://localhost:11434");
        assert_eq!(config.model, "nomic-embed-text");
        assert_eq!(config.timeout_secs, 60);
    }

    #[test]
    fn test_health_check_model_starts_with() {
        // Verify the matching logic: "nomic-embed-text" should match "nomic-embed-text:latest"
        let model = "nomic-embed-text";
        let tag_name = "nomic-embed-text:latest";
        assert!(tag_name.starts_with(model));

        // Non-matching model
        let wrong_model = "llama2";
        assert!(!tag_name.starts_with(wrong_model));
    }

    #[test]
    fn test_embed_request_serialization() {
        let request = EmbedRequest {
            model: "nomic-embed-text".to_string(),
            input: vec!["hello".to_string(), "world".to_string()],
        };
        let json = serde_json::to_string(&request).unwrap();
        assert!(json.contains("\"model\":\"nomic-embed-text\""));
        assert!(json.contains("\"input\":[\"hello\",\"world\"]"));
    }

    #[test]
    fn test_embed_response_deserialization() {
        let json = r#"{"model":"nomic-embed-text","embeddings":[[0.1,0.2,0.3],[0.4,0.5,0.6]]}"#;
        let response: EmbedResponse = serde_json::from_str(json).unwrap();
        assert_eq!(response.embeddings.len(), 2);
        assert_eq!(response.embeddings[0], vec![0.1, 0.2, 0.3]);
        assert_eq!(response.embeddings[1], vec![0.4, 0.5, 0.6]);
    }
}
|
||||
251
src/embedding/pipeline.rs
Normal file
251
src/embedding/pipeline.rs
Normal file
@@ -0,0 +1,251 @@
|
||||
//! Async embedding pipeline: chunk documents, embed via Ollama, store in sqlite-vec.
|
||||
|
||||
use rusqlite::Connection;
|
||||
use sha2::{Digest, Sha256};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::change_detector::{count_pending_documents, find_pending_documents};
|
||||
use crate::embedding::chunk_ids::encode_rowid;
|
||||
use crate::embedding::chunking::split_into_chunks;
|
||||
use crate::embedding::ollama::OllamaClient;
|
||||
|
||||
// Number of texts sent per Ollama /api/embed call.
const BATCH_SIZE: usize = 32;
// Documents fetched per keyset-pagination page.
const DB_PAGE_SIZE: usize = 500;
// Required embedding dimensionality; vectors of any other length are
// rejected and recorded as errors. Presumably matches the default
// nomic-embed-text model — confirm if the model becomes configurable.
const EXPECTED_DIMS: usize = 768;

/// Result of an embedding run.
#[derive(Debug, Default)]
pub struct EmbedResult {
    // Chunks successfully embedded and stored.
    pub embedded: usize,
    // Chunks that failed (API error or dimension mismatch).
    pub failed: usize,
    // Documents skipped because their content_text was empty.
    pub skipped: usize,
}

/// Work item: a single chunk to embed.
struct ChunkWork {
    doc_id: i64,
    chunk_index: usize,
    // Hash of the whole document (documents.content_hash).
    doc_hash: String,
    // SHA-256 of this chunk's text.
    chunk_hash: String,
    text: String,
}
|
||||
|
||||
/// Run the embedding pipeline: find pending documents, chunk, embed, store.
///
/// Processes batches of BATCH_SIZE texts per Ollama API call.
/// Uses keyset pagination over documents (DB_PAGE_SIZE per page).
///
/// `progress_callback` receives (documents processed so far, total pending).
/// Counters in the returned `EmbedResult` mix units: `embedded`/`failed`
/// count chunks, `skipped` counts documents.
pub async fn embed_documents(
    conn: &Connection,
    client: &OllamaClient,
    model_name: &str,
    progress_callback: Option<Box<dyn Fn(usize, usize)>>,
) -> Result<EmbedResult> {
    let total = count_pending_documents(conn)? as usize;
    let mut result = EmbedResult::default();
    let mut last_id: i64 = 0;
    let mut processed: usize = 0;

    if total == 0 {
        return Ok(result);
    }

    info!(total, "Starting embedding pipeline");

    loop {
        let pending = find_pending_documents(conn, DB_PAGE_SIZE, last_id)?;
        if pending.is_empty() {
            break;
        }

        // Build chunk work items for this page.
        // NOTE(review): all chunk texts for the page are held in memory at
        // once (up to DB_PAGE_SIZE documents' worth) — verify acceptable.
        let mut all_chunks: Vec<ChunkWork> = Vec::new();

        for doc in &pending {
            // Always advance the cursor, even for skipped docs, to avoid re-fetching
            last_id = doc.document_id;

            if doc.content_text.is_empty() {
                result.skipped += 1;
                processed += 1;
                continue;
            }

            // Clear existing embeddings for this document before re-embedding.
            // NOTE(review): if the batch below then fails, the document is
            // left without embeddings until a retry succeeds.
            clear_document_embeddings(conn, doc.document_id)?;

            let chunks = split_into_chunks(&doc.content_text);
            for (chunk_index, text) in chunks {
                all_chunks.push(ChunkWork {
                    doc_id: doc.document_id,
                    chunk_index,
                    doc_hash: doc.content_hash.clone(),
                    chunk_hash: sha256_hash(&text),
                    text,
                });
            }

            // Track progress per document (not per chunk) to match `total`
            processed += 1;
            if let Some(ref cb) = progress_callback {
                cb(processed, total);
            }
        }

        // Process chunks in batches of BATCH_SIZE
        for batch in all_chunks.chunks(BATCH_SIZE) {
            let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();

            match client.embed_batch(texts).await {
                Ok(embeddings) => {
                    for (i, embedding) in embeddings.iter().enumerate() {
                        // Defensive: Ollama should return exactly one vector
                        // per input; ignore any surplus.
                        if i >= batch.len() {
                            break;
                        }
                        let chunk = &batch[i];

                        // Reject vectors of unexpected dimensionality rather
                        // than corrupt the fixed-width vector table.
                        if embedding.len() != EXPECTED_DIMS {
                            warn!(
                                doc_id = chunk.doc_id,
                                chunk_index = chunk.chunk_index,
                                got_dims = embedding.len(),
                                expected = EXPECTED_DIMS,
                                "Dimension mismatch, skipping"
                            );
                            record_embedding_error(
                                conn,
                                chunk.doc_id,
                                chunk.chunk_index,
                                &chunk.doc_hash,
                                &chunk.chunk_hash,
                                model_name,
                                &format!(
                                    "Dimension mismatch: got {}, expected {}",
                                    embedding.len(),
                                    EXPECTED_DIMS
                                ),
                            )?;
                            result.failed += 1;
                            continue;
                        }

                        store_embedding(
                            conn,
                            chunk.doc_id,
                            chunk.chunk_index,
                            &chunk.doc_hash,
                            &chunk.chunk_hash,
                            model_name,
                            embedding,
                        )?;
                        result.embedded += 1;
                    }
                }
                Err(e) => {
                    // Whole batch failed: record the same error for every
                    // chunk so the retry/backoff machinery can pick them up.
                    warn!(error = %e, "Batch embedding failed");
                    for chunk in batch {
                        record_embedding_error(
                            conn,
                            chunk.doc_id,
                            chunk.chunk_index,
                            &chunk.doc_hash,
                            &chunk.chunk_hash,
                            model_name,
                            &e.to_string(),
                        )?;
                        result.failed += 1;
                    }
                }
            }

        }
    }

    info!(
        embedded = result.embedded,
        failed = result.failed,
        skipped = result.skipped,
        "Embedding pipeline complete"
    );

    Ok(result)
}
|
||||
|
||||
/// Clear all embeddings and metadata for a document.
///
/// Deletes the metadata rows and the vector rows whose encoded rowids fall
/// in this document's range: [doc_id * 1000, (doc_id + 1) * 1000).
fn clear_document_embeddings(conn: &Connection, document_id: i64) -> Result<()> {
    conn.execute(
        "DELETE FROM embedding_metadata WHERE document_id = ?1",
        [document_id],
    )?;

    // Half-open rowid range covering every possible chunk of this document.
    let start_rowid = encode_rowid(document_id, 0);
    let end_rowid = encode_rowid(document_id + 1, 0);
    conn.execute(
        "DELETE FROM embeddings WHERE rowid >= ?1 AND rowid < ?2",
        rusqlite::params![start_rowid, end_rowid],
    )?;

    Ok(())
}
|
||||
|
||||
/// Store an embedding vector and its metadata.
///
/// The vector is serialized as little-endian f32 bytes for sqlite-vec.
/// `dims` is recorded as EXPECTED_DIMS — the caller has already verified
/// `embedding.len() == EXPECTED_DIMS` before calling.
fn store_embedding(
    conn: &Connection,
    doc_id: i64,
    chunk_index: usize,
    doc_hash: &str,
    chunk_hash: &str,
    model_name: &str,
    embedding: &[f32],
) -> Result<()> {
    // (document_id, chunk_index) packed into a single sqlite-vec rowid.
    let rowid = encode_rowid(doc_id, chunk_index as i64);

    let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect();

    conn.execute(
        "INSERT OR REPLACE INTO embeddings (rowid, embedding) VALUES (?1, ?2)",
        rusqlite::params![rowid, embedding_bytes],
    )?;

    // Success resets attempt_count to 1 and clears any previous error.
    let now = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT OR REPLACE INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, last_error)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, 1, NULL)",
        rusqlite::params![doc_id, chunk_index as i64, model_name, EXPECTED_DIMS as i64, doc_hash, chunk_hash, now],
    )?;

    Ok(())
}
|
||||
|
||||
/// Record an embedding error in metadata for later retry.
///
/// Upserts on (document_id, chunk_index): a first failure inserts with
/// attempt_count = 1; repeat failures increment attempt_count and refresh
/// last_error / last_attempt_at.
fn record_embedding_error(
    conn: &Connection,
    doc_id: i64,
    chunk_index: usize,
    doc_hash: &str,
    chunk_hash: &str,
    model_name: &str,
    error: &str,
) -> Result<()> {
    let now = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash,
          created_at, attempt_count, last_error, last_attempt_at)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, 1, ?8, ?7)
         ON CONFLICT(document_id, chunk_index) DO UPDATE SET
             attempt_count = embedding_metadata.attempt_count + 1,
             last_error = ?8,
             last_attempt_at = ?7",
        rusqlite::params![doc_id, chunk_index as i64, model_name, EXPECTED_DIMS as i64, doc_hash, chunk_hash, now, error],
    )?;
    Ok(())
}
|
||||
|
||||
fn sha256_hash(input: &str) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(input.as_bytes());
|
||||
format!("{:x}", hasher.finalize())
|
||||
}
|
||||
@@ -15,7 +15,7 @@ use tracing::debug;
|
||||
use super::types::{
|
||||
GitLabDiscussion, GitLabIssue, GitLabMergeRequest, GitLabProject, GitLabUser, GitLabVersion,
|
||||
};
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
|
||||
/// Simple rate limiter with jitter to prevent thundering herd.
|
||||
struct RateLimiter {
|
||||
@@ -31,17 +31,18 @@ impl RateLimiter {
|
||||
}
|
||||
}
|
||||
|
||||
async fn acquire(&mut self) {
|
||||
/// Compute how long to wait, update last_request, and return the delay.
|
||||
/// The caller sleeps *after* releasing the mutex guard.
|
||||
fn check_delay(&mut self) -> Option<Duration> {
|
||||
let elapsed = self.last_request.elapsed();
|
||||
self.last_request = Instant::now();
|
||||
|
||||
if elapsed < self.min_interval {
|
||||
// Add 0-50ms jitter to prevent synchronized requests
|
||||
let jitter = Duration::from_millis(rand_jitter());
|
||||
let wait_time = self.min_interval - elapsed + jitter;
|
||||
sleep(wait_time).await;
|
||||
Some(self.min_interval - elapsed + jitter)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
|
||||
self.last_request = Instant::now();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,7 +113,10 @@ impl GitLabClient {
|
||||
|
||||
/// Make an authenticated API request.
|
||||
async fn request<T: serde::de::DeserializeOwned>(&self, path: &str) -> Result<T> {
|
||||
self.rate_limiter.lock().await.acquire().await;
|
||||
let delay = self.rate_limiter.lock().await.check_delay();
|
||||
if let Some(d) = delay {
|
||||
sleep(d).await;
|
||||
}
|
||||
|
||||
let url = format!("{}{}", self.base_url, path);
|
||||
debug!(url = %url, "GitLab request");
|
||||
@@ -123,7 +127,7 @@ impl GitLabClient {
|
||||
.header("PRIVATE-TOKEN", &self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| GiError::GitLabNetworkError {
|
||||
.map_err(|e| LoreError::GitLabNetworkError {
|
||||
base_url: self.base_url.clone(),
|
||||
source: Some(e),
|
||||
})?;
|
||||
@@ -138,9 +142,9 @@ impl GitLabClient {
|
||||
path: &str,
|
||||
) -> Result<T> {
|
||||
match response.status() {
|
||||
StatusCode::UNAUTHORIZED => Err(GiError::GitLabAuthFailed),
|
||||
StatusCode::UNAUTHORIZED => Err(LoreError::GitLabAuthFailed),
|
||||
|
||||
StatusCode::NOT_FOUND => Err(GiError::GitLabNotFound {
|
||||
StatusCode::NOT_FOUND => Err(LoreError::GitLabNotFound {
|
||||
resource: path.to_string(),
|
||||
}),
|
||||
|
||||
@@ -152,7 +156,7 @@ impl GitLabClient {
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(60);
|
||||
|
||||
Err(GiError::GitLabRateLimited { retry_after })
|
||||
Err(LoreError::GitLabRateLimited { retry_after })
|
||||
}
|
||||
|
||||
status if status.is_success() => {
|
||||
@@ -160,7 +164,7 @@ impl GitLabClient {
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
status => Err(GiError::Other(format!(
|
||||
status => Err(LoreError::Other(format!(
|
||||
"GitLab API error: {} {}",
|
||||
status.as_u16(),
|
||||
status.canonical_reason().unwrap_or("Unknown")
|
||||
@@ -216,6 +220,7 @@ impl GitLabClient {
|
||||
match result {
|
||||
Ok((issues, headers)) => {
|
||||
let is_empty = issues.is_empty();
|
||||
let full_page = issues.len() as u32 == per_page;
|
||||
|
||||
// Yield each issue
|
||||
for issue in issues {
|
||||
@@ -233,12 +238,11 @@ impl GitLabClient {
|
||||
page = next;
|
||||
}
|
||||
_ => {
|
||||
// No next page or empty response - we're done
|
||||
if is_empty {
|
||||
if is_empty || !full_page {
|
||||
break;
|
||||
}
|
||||
// Check if current page returned less than per_page (last page)
|
||||
break;
|
||||
// Full page but no x-next-page header: try next page heuristically
|
||||
page += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -278,6 +282,7 @@ impl GitLabClient {
|
||||
match result {
|
||||
Ok((discussions, headers)) => {
|
||||
let is_empty = discussions.is_empty();
|
||||
let full_page = discussions.len() as u32 == per_page;
|
||||
|
||||
for discussion in discussions {
|
||||
yield Ok(discussion);
|
||||
@@ -293,10 +298,11 @@ impl GitLabClient {
|
||||
page = next;
|
||||
}
|
||||
_ => {
|
||||
if is_empty {
|
||||
if is_empty || !full_page {
|
||||
break;
|
||||
}
|
||||
break;
|
||||
// Full page but no x-next-page header: try next page heuristically
|
||||
page += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -462,19 +468,24 @@ impl GitLabClient {
|
||||
.and_then(|s| s.parse::<u32>().ok());
|
||||
|
||||
let should_continue = match (link_next.is_some(), x_next_page, full_page) {
|
||||
(true, _, _) => true, // Link header present: continue
|
||||
(false, Some(np), _) if np > page => {
|
||||
page = np;
|
||||
(true, _, _) => {
|
||||
page += 1; // Link header present: continue to next
|
||||
true
|
||||
}
|
||||
(false, Some(np), _) if np > page => {
|
||||
page = np; // x-next-page tells us exactly which page
|
||||
true
|
||||
}
|
||||
(false, None, true) => {
|
||||
page += 1; // Full page, no headers: try next
|
||||
true
|
||||
}
|
||||
(false, None, true) => true, // Full page, no headers: try next
|
||||
_ => false, // Otherwise we're done
|
||||
};
|
||||
|
||||
if !should_continue || is_empty {
|
||||
break;
|
||||
}
|
||||
page += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
yield Err(e);
|
||||
@@ -491,7 +502,10 @@ impl GitLabClient {
|
||||
path: &str,
|
||||
params: &[(&str, String)],
|
||||
) -> Result<(T, HeaderMap)> {
|
||||
self.rate_limiter.lock().await.acquire().await;
|
||||
let delay = self.rate_limiter.lock().await.check_delay();
|
||||
if let Some(d) = delay {
|
||||
sleep(d).await;
|
||||
}
|
||||
|
||||
let url = format!("{}{}", self.base_url, path);
|
||||
debug!(url = %url, ?params, "GitLab paginated request");
|
||||
@@ -503,7 +517,7 @@ impl GitLabClient {
|
||||
.header("PRIVATE-TOKEN", &self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| GiError::GitLabNetworkError {
|
||||
.map_err(|e| LoreError::GitLabNetworkError {
|
||||
base_url: self.base_url.clone(),
|
||||
source: Some(e),
|
||||
})?;
|
||||
|
||||
258
src/ingestion/dirty_tracker.rs
Normal file
258
src/ingestion/dirty_tracker.rs
Normal file
@@ -0,0 +1,258 @@
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::backoff::compute_next_attempt_at;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::time::now_ms;
|
||||
use crate::documents::SourceType;
|
||||
|
||||
// Max dirty entries returned per call to `get_dirty_sources`.
const DIRTY_SOURCES_BATCH_SIZE: usize = 500;
|
||||
|
||||
/// Mark a source entity as dirty INSIDE an existing transaction.
/// ON CONFLICT resets ALL backoff/error state so fresh updates are immediately eligible.
///
/// NOTE: the SQL here is duplicated in `mark_dirty`; keep the two in sync.
pub fn mark_dirty_tx(
    tx: &rusqlite::Transaction<'_>,
    source_type: SourceType,
    source_id: i64,
) -> Result<()> {
    tx.execute(
        "INSERT INTO dirty_sources (source_type, source_id, queued_at)
         VALUES (?1, ?2, ?3)
         ON CONFLICT(source_type, source_id) DO UPDATE SET
             queued_at = excluded.queued_at,
             attempt_count = 0,
             last_attempt_at = NULL,
             last_error = NULL,
             next_attempt_at = NULL",
        rusqlite::params![source_type.as_str(), source_id, now_ms()],
    )?;
    Ok(())
}
|
||||
|
||||
/// Convenience wrapper for non-transactional contexts.
///
/// Same upsert as `mark_dirty_tx`: re-marking an already-dirty source resets
/// its backoff/error state so it becomes immediately eligible again.
/// NOTE: the SQL is duplicated from `mark_dirty_tx`; keep the two in sync.
pub fn mark_dirty(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
    conn.execute(
        "INSERT INTO dirty_sources (source_type, source_id, queued_at)
         VALUES (?1, ?2, ?3)
         ON CONFLICT(source_type, source_id) DO UPDATE SET
             queued_at = excluded.queued_at,
             attempt_count = 0,
             last_attempt_at = NULL,
             last_error = NULL,
             next_attempt_at = NULL",
        rusqlite::params![source_type.as_str(), source_id, now_ms()],
    )?;
    Ok(())
}
|
||||
|
||||
/// Get dirty sources ready for processing.
|
||||
/// Returns entries where next_attempt_at is NULL or <= now.
|
||||
/// Orders by attempt_count ASC (fresh before failed), then queued_at ASC.
|
||||
pub fn get_dirty_sources(conn: &Connection) -> Result<Vec<(SourceType, i64)>> {
|
||||
let now = now_ms();
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT source_type, source_id FROM dirty_sources
|
||||
WHERE next_attempt_at IS NULL OR next_attempt_at <= ?1
|
||||
ORDER BY attempt_count ASC, queued_at ASC
|
||||
LIMIT ?2"
|
||||
)?;
|
||||
let rows = stmt
|
||||
.query_map(rusqlite::params![now, DIRTY_SOURCES_BATCH_SIZE as i64], |row| {
|
||||
let st_str: String = row.get(0)?;
|
||||
let source_id: i64 = row.get(1)?;
|
||||
Ok((st_str, source_id))
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
let mut results = Vec::with_capacity(rows.len());
|
||||
for (st_str, source_id) in rows {
|
||||
let source_type = SourceType::parse(&st_str).ok_or_else(|| {
|
||||
crate::core::error::LoreError::Other(format!(
|
||||
"Invalid source_type in dirty_sources: {}",
|
||||
st_str
|
||||
))
|
||||
})?;
|
||||
results.push((source_type, source_id));
|
||||
}
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Clear dirty entry after successful processing.
|
||||
pub fn clear_dirty(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
|
||||
conn.execute(
|
||||
"DELETE FROM dirty_sources WHERE source_type = ?1 AND source_id = ?2",
|
||||
rusqlite::params![source_type.as_str(), source_id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Record an error for a dirty source, incrementing attempt_count and setting backoff.
|
||||
pub fn record_dirty_error(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
source_id: i64,
|
||||
error: &str,
|
||||
) -> Result<()> {
|
||||
let now = now_ms();
|
||||
// Get current attempt_count first
|
||||
let attempt_count: i64 = conn.query_row(
|
||||
"SELECT attempt_count FROM dirty_sources WHERE source_type = ?1 AND source_id = ?2",
|
||||
rusqlite::params![source_type.as_str(), source_id],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
let new_attempt = attempt_count + 1;
|
||||
let next_at = compute_next_attempt_at(now, new_attempt);
|
||||
|
||||
conn.execute(
|
||||
"UPDATE dirty_sources SET
|
||||
attempt_count = ?1,
|
||||
last_attempt_at = ?2,
|
||||
last_error = ?3,
|
||||
next_attempt_at = ?4
|
||||
WHERE source_type = ?5 AND source_id = ?6",
|
||||
rusqlite::params![new_attempt, now, error, next_at, source_type.as_str(), source_id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// In-memory DB with a `dirty_sources` table covering every column this
    /// module reads/writes. NOTE(review): assumed to match the production
    /// migration schema — confirm if migrations change.
    fn setup_db() -> Connection {
        let conn = Connection::open_in_memory().unwrap();
        conn.execute_batch("
            CREATE TABLE dirty_sources (
                source_type TEXT NOT NULL CHECK (source_type IN ('issue','merge_request','discussion')),
                source_id INTEGER NOT NULL,
                queued_at INTEGER NOT NULL,
                attempt_count INTEGER NOT NULL DEFAULT 0,
                last_attempt_at INTEGER,
                last_error TEXT,
                next_attempt_at INTEGER,
                PRIMARY KEY(source_type, source_id)
            );
            CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at);
        ").unwrap();
        conn
    }

    // mark_dirty creates exactly one queue row.
    #[test]
    fn test_mark_dirty_inserts() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();

        let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap();
        assert_eq!(count, 1);
    }

    // The transactional variant inserts and the row survives commit.
    #[test]
    fn test_mark_dirty_tx_inserts() {
        let mut conn = setup_db();
        {
            let tx = conn.transaction().unwrap();
            mark_dirty_tx(&tx, SourceType::Issue, 1).unwrap();
            tx.commit().unwrap();
        }
        let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap();
        assert_eq!(count, 1);
    }

    // Re-marking an entry after a failure must reset attempt_count and
    // next_attempt_at (the ON CONFLICT clause clears all backoff state).
    #[test]
    fn test_requeue_resets_backoff() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        // Simulate error state
        record_dirty_error(&conn, SourceType::Issue, 1, "test error").unwrap();

        let attempt: i64 = conn.query_row(
            "SELECT attempt_count FROM dirty_sources WHERE source_id = 1", [], |r| r.get(0)
        ).unwrap();
        assert_eq!(attempt, 1);

        // Re-mark should reset
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        let attempt: i64 = conn.query_row(
            "SELECT attempt_count FROM dirty_sources WHERE source_id = 1", [], |r| r.get(0)
        ).unwrap();
        assert_eq!(attempt, 0);

        let next_at: Option<i64> = conn.query_row(
            "SELECT next_attempt_at FROM dirty_sources WHERE source_id = 1", [], |r| r.get(0)
        ).unwrap();
        assert!(next_at.is_none());
    }

    // Rows whose next_attempt_at is in the future are excluded from batches.
    #[test]
    fn test_get_respects_backoff() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        // Set next_attempt_at far in the future
        conn.execute(
            "UPDATE dirty_sources SET next_attempt_at = 9999999999999 WHERE source_id = 1",
            [],
        ).unwrap();

        let results = get_dirty_sources(&conn).unwrap();
        assert!(results.is_empty());
    }

    // Fresh entries (attempt_count = 0) come before previously-failed ones.
    #[test]
    fn test_get_orders_by_attempt_count() {
        let conn = setup_db();
        // Insert issue 1 (failed, attempt_count=2)
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        conn.execute(
            "UPDATE dirty_sources SET attempt_count = 2 WHERE source_id = 1",
            [],
        ).unwrap();
        // Insert issue 2 (fresh, attempt_count=0)
        mark_dirty(&conn, SourceType::Issue, 2).unwrap();

        let results = get_dirty_sources(&conn).unwrap();
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].1, 2); // Fresh first
        assert_eq!(results[1].1, 1); // Failed second
    }

    // A single fetch is capped at DIRTY_SOURCES_BATCH_SIZE (500) rows.
    #[test]
    fn test_batch_size_500() {
        let conn = setup_db();
        for i in 0..600 {
            mark_dirty(&conn, SourceType::Issue, i).unwrap();
        }
        let results = get_dirty_sources(&conn).unwrap();
        assert_eq!(results.len(), 500);
    }

    // clear_dirty deletes the queue row.
    #[test]
    fn test_clear_removes() {
        let conn = setup_db();
        mark_dirty(&conn, SourceType::Issue, 1).unwrap();
        clear_dirty(&conn, SourceType::Issue, 1).unwrap();

        let count: i64 = conn.query_row("SELECT COUNT(*) FROM dirty_sources", [], |r| r.get(0)).unwrap();
        assert_eq!(count, 0);
    }

    // Fetch-then-clear in a loop drains the whole queue across multiple
    // batches (1200 rows > 2 full batches of 500).
    #[test]
    fn test_drain_loop() {
        let conn = setup_db();
        for i in 0..1200 {
            mark_dirty(&conn, SourceType::Issue, i).unwrap();
        }

        let mut total = 0;
        loop {
            let batch = get_dirty_sources(&conn).unwrap();
            if batch.is_empty() {
                break;
            }
            for (st, id) in &batch {
                clear_dirty(&conn, *st, *id).unwrap();
            }
            total += batch.len();
        }
        assert_eq!(total, 1200);
    }
}
|
||||
265
src/ingestion/discussion_queue.rs
Normal file
265
src/ingestion/discussion_queue.rs
Normal file
@@ -0,0 +1,265 @@
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::backoff::compute_next_attempt_at;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::time::now_ms;
|
||||
|
||||
/// Noteable type for discussion queue.
///
/// The string forms match the GitLab API spelling ("Issue"/"MergeRequest")
/// and round-trip through [`NoteableType::as_str`] / [`NoteableType::parse`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NoteableType {
    Issue,
    MergeRequest,
}

impl NoteableType {
    /// Canonical string form, suitable for storage and API parameters.
    pub fn as_str(&self) -> &'static str {
        if matches!(self, Self::Issue) {
            "Issue"
        } else {
            "MergeRequest"
        }
    }

    /// Inverse of [`Self::as_str`]; returns `None` for any other input
    /// (case-sensitive).
    pub fn parse(s: &str) -> Option<Self> {
        if s == "Issue" {
            Some(Self::Issue)
        } else if s == "MergeRequest" {
            Some(Self::MergeRequest)
        } else {
            None
        }
    }
}
|
||||
|
||||
/// A pending discussion fetch entry.
pub struct PendingFetch {
    // Local DB id of the project row (pending_discussion_fetches.project_id
    // REFERENCES projects(id)) — not the GitLab project id.
    pub project_id: i64,
    // Whether the target is an issue or a merge request.
    pub noteable_type: NoteableType,
    // Per-project internal id (iid) of the issue/MR whose discussions to fetch.
    pub noteable_iid: i64,
    // Number of failed fetch attempts so far (0 for a fresh entry).
    pub attempt_count: i32,
}
|
||||
|
||||
/// Queue a discussion fetch. ON CONFLICT resets backoff (consistent with dirty_sources).
|
||||
pub fn queue_discussion_fetch(
|
||||
conn: &Connection,
|
||||
project_id: i64,
|
||||
noteable_type: NoteableType,
|
||||
noteable_iid: i64,
|
||||
) -> Result<()> {
|
||||
conn.execute(
|
||||
"INSERT INTO pending_discussion_fetches (project_id, noteable_type, noteable_iid, queued_at)
|
||||
VALUES (?1, ?2, ?3, ?4)
|
||||
ON CONFLICT(project_id, noteable_type, noteable_iid) DO UPDATE SET
|
||||
queued_at = excluded.queued_at,
|
||||
attempt_count = 0,
|
||||
last_attempt_at = NULL,
|
||||
last_error = NULL,
|
||||
next_attempt_at = NULL",
|
||||
rusqlite::params![project_id, noteable_type.as_str(), noteable_iid, now_ms()],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get next batch of pending fetches (WHERE next_attempt_at IS NULL OR <= now).
|
||||
pub fn get_pending_fetches(conn: &Connection, limit: usize) -> Result<Vec<PendingFetch>> {
|
||||
let now = now_ms();
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT project_id, noteable_type, noteable_iid, attempt_count
|
||||
FROM pending_discussion_fetches
|
||||
WHERE next_attempt_at IS NULL OR next_attempt_at <= ?1
|
||||
ORDER BY queued_at ASC
|
||||
LIMIT ?2"
|
||||
)?;
|
||||
let rows = stmt
|
||||
.query_map(rusqlite::params![now, limit as i64], |row| {
|
||||
Ok((
|
||||
row.get::<_, i64>(0)?,
|
||||
row.get::<_, String>(1)?,
|
||||
row.get::<_, i64>(2)?,
|
||||
row.get::<_, i32>(3)?,
|
||||
))
|
||||
})?
|
||||
.collect::<std::result::Result<Vec<_>, _>>()?;
|
||||
|
||||
let mut results = Vec::with_capacity(rows.len());
|
||||
for (project_id, nt_str, noteable_iid, attempt_count) in rows {
|
||||
let noteable_type = NoteableType::parse(&nt_str).ok_or_else(|| {
|
||||
crate::core::error::LoreError::Other(format!(
|
||||
"Invalid noteable_type in pending_discussion_fetches: {}",
|
||||
nt_str
|
||||
))
|
||||
})?;
|
||||
results.push(PendingFetch {
|
||||
project_id,
|
||||
noteable_type,
|
||||
noteable_iid,
|
||||
attempt_count,
|
||||
});
|
||||
}
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Mark fetch complete (remove from queue).
|
||||
pub fn complete_fetch(
|
||||
conn: &Connection,
|
||||
project_id: i64,
|
||||
noteable_type: NoteableType,
|
||||
noteable_iid: i64,
|
||||
) -> Result<()> {
|
||||
conn.execute(
|
||||
"DELETE FROM pending_discussion_fetches
|
||||
WHERE project_id = ?1 AND noteable_type = ?2 AND noteable_iid = ?3",
|
||||
rusqlite::params![project_id, noteable_type.as_str(), noteable_iid],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Record fetch error with backoff.
|
||||
pub fn record_fetch_error(
|
||||
conn: &Connection,
|
||||
project_id: i64,
|
||||
noteable_type: NoteableType,
|
||||
noteable_iid: i64,
|
||||
error: &str,
|
||||
) -> Result<()> {
|
||||
let now = now_ms();
|
||||
let attempt_count: i64 = conn.query_row(
|
||||
"SELECT attempt_count FROM pending_discussion_fetches
|
||||
WHERE project_id = ?1 AND noteable_type = ?2 AND noteable_iid = ?3",
|
||||
rusqlite::params![project_id, noteable_type.as_str(), noteable_iid],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
let new_attempt = attempt_count + 1;
|
||||
let next_at = compute_next_attempt_at(now, new_attempt);
|
||||
|
||||
conn.execute(
|
||||
"UPDATE pending_discussion_fetches SET
|
||||
attempt_count = ?1,
|
||||
last_attempt_at = ?2,
|
||||
last_error = ?3,
|
||||
next_attempt_at = ?4
|
||||
WHERE project_id = ?5 AND noteable_type = ?6 AND noteable_iid = ?7",
|
||||
rusqlite::params![new_attempt, now, error, next_at, project_id, noteable_type.as_str(), noteable_iid],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// In-memory DB with a minimal `projects` table (one seeded row, since
    /// pending_discussion_fetches.project_id has a FK to it) plus the
    /// `pending_discussion_fetches` table itself. NOTE(review): assumed to
    /// match the production migration schema — confirm if migrations change.
    fn setup_db() -> Connection {
        let conn = Connection::open_in_memory().unwrap();
        conn.execute_batch("
            CREATE TABLE projects (
                id INTEGER PRIMARY KEY,
                gitlab_project_id INTEGER UNIQUE NOT NULL,
                path_with_namespace TEXT NOT NULL,
                default_branch TEXT,
                web_url TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                raw_payload_id INTEGER
            );
            INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project');

            CREATE TABLE pending_discussion_fetches (
                project_id INTEGER NOT NULL REFERENCES projects(id),
                noteable_type TEXT NOT NULL,
                noteable_iid INTEGER NOT NULL,
                queued_at INTEGER NOT NULL,
                attempt_count INTEGER NOT NULL DEFAULT 0,
                last_attempt_at INTEGER,
                last_error TEXT,
                next_attempt_at INTEGER,
                PRIMARY KEY(project_id, noteable_type, noteable_iid)
            );
            CREATE INDEX idx_pending_discussions_next_attempt ON pending_discussion_fetches(next_attempt_at);
        ").unwrap();
        conn
    }

    // Round-trip: a queued entry comes back with all fields intact and a
    // fresh attempt_count of 0.
    #[test]
    fn test_queue_and_get() {
        let conn = setup_db();
        queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();

        let fetches = get_pending_fetches(&conn, 100).unwrap();
        assert_eq!(fetches.len(), 1);
        assert_eq!(fetches[0].project_id, 1);
        assert_eq!(fetches[0].noteable_type, NoteableType::Issue);
        assert_eq!(fetches[0].noteable_iid, 42);
        assert_eq!(fetches[0].attempt_count, 0);
    }

    // Re-queueing after a failure resets attempt_count via the ON CONFLICT
    // clause (mirrors dirty_sources behavior).
    #[test]
    fn test_requeue_resets_backoff() {
        let conn = setup_db();
        queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
        record_fetch_error(&conn, 1, NoteableType::Issue, 42, "network error").unwrap();

        let attempt: i32 = conn.query_row(
            "SELECT attempt_count FROM pending_discussion_fetches WHERE noteable_iid = 42",
            [], |r| r.get(0),
        ).unwrap();
        assert_eq!(attempt, 1);

        // Re-queue should reset
        queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
        let attempt: i32 = conn.query_row(
            "SELECT attempt_count FROM pending_discussion_fetches WHERE noteable_iid = 42",
            [], |r| r.get(0),
        ).unwrap();
        assert_eq!(attempt, 0);
    }

    // Entries with a future next_attempt_at are excluded from batches.
    #[test]
    fn test_backoff_respected() {
        let conn = setup_db();
        queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
        conn.execute(
            "UPDATE pending_discussion_fetches SET next_attempt_at = 9999999999999 WHERE noteable_iid = 42",
            [],
        ).unwrap();

        let fetches = get_pending_fetches(&conn, 100).unwrap();
        assert!(fetches.is_empty());
    }

    // complete_fetch deletes the queue row.
    #[test]
    fn test_complete_removes() {
        let conn = setup_db();
        queue_discussion_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();
        complete_fetch(&conn, 1, NoteableType::Issue, 42).unwrap();

        let count: i64 = conn.query_row(
            "SELECT COUNT(*) FROM pending_discussion_fetches", [], |r| r.get(0),
        ).unwrap();
        assert_eq!(count, 0);
    }

    // record_fetch_error bumps attempt_count, stores the message, and sets a
    // non-NULL next_attempt_at for backoff.
    #[test]
    fn test_error_increments_attempts() {
        let conn = setup_db();
        queue_discussion_fetch(&conn, 1, NoteableType::MergeRequest, 10).unwrap();
        record_fetch_error(&conn, 1, NoteableType::MergeRequest, 10, "timeout").unwrap();

        let (attempt, error): (i32, Option<String>) = conn.query_row(
            "SELECT attempt_count, last_error FROM pending_discussion_fetches WHERE noteable_iid = 10",
            [], |r| Ok((r.get(0)?, r.get(1)?)),
        ).unwrap();
        assert_eq!(attempt, 1);
        assert_eq!(error, Some("timeout".to_string()));

        let next_at: Option<i64> = conn.query_row(
            "SELECT next_attempt_at FROM pending_discussion_fetches WHERE noteable_iid = 10",
            [], |r| r.get(0),
        ).unwrap();
        assert!(next_at.is_some());
    }

    // parse accepts exactly the two canonical spellings and nothing else.
    #[test]
    fn test_noteable_type_parse() {
        assert_eq!(NoteableType::parse("Issue"), Some(NoteableType::Issue));
        assert_eq!(NoteableType::parse("MergeRequest"), Some(NoteableType::MergeRequest));
        assert_eq!(NoteableType::parse("invalid"), None);
    }
}
|
||||
@@ -8,11 +8,13 @@
|
||||
|
||||
use futures::StreamExt;
|
||||
use rusqlite::Connection;
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::payloads::{StorePayloadOptions, store_payload};
|
||||
use crate::documents::SourceType;
|
||||
use crate::ingestion::dirty_tracker;
|
||||
use crate::gitlab::GitLabClient;
|
||||
use crate::gitlab::transformers::{NoteableRef, transform_discussion, transform_notes};
|
||||
|
||||
@@ -55,7 +57,7 @@ pub async fn ingest_issue_discussions(
|
||||
total_result.stale_discussions_removed += result.stale_discussions_removed;
|
||||
}
|
||||
|
||||
info!(
|
||||
debug!(
|
||||
issues_processed = issues.len(),
|
||||
discussions_fetched = total_result.discussions_fetched,
|
||||
discussions_upserted = total_result.discussions_upserted,
|
||||
@@ -90,7 +92,7 @@ async fn ingest_discussions_for_issue(
|
||||
// Track discussions we've seen for stale removal
|
||||
let mut seen_discussion_ids: Vec<String> = Vec::new();
|
||||
// Track if any error occurred during pagination
|
||||
let mut pagination_error: Option<crate::core::error::GiError> = None;
|
||||
let mut pagination_error: Option<crate::core::error::LoreError> = None;
|
||||
|
||||
while let Some(disc_result) = discussions_stream.next().await {
|
||||
|
||||
@@ -141,6 +143,9 @@ async fn ingest_discussions_for_issue(
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
// Mark dirty for document regeneration (inside transaction)
|
||||
dirty_tracker::mark_dirty_tx(&tx, SourceType::Discussion, local_discussion_id)?;
|
||||
|
||||
// Transform and store notes
|
||||
let notes = transform_notes(&gitlab_discussion, local_project_id);
|
||||
let notes_count = notes.len();
|
||||
|
||||
@@ -14,9 +14,11 @@ use rusqlite::{Connection, Transaction};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::payloads::{StorePayloadOptions, store_payload};
|
||||
use crate::core::time::now_ms;
|
||||
use crate::documents::SourceType;
|
||||
use crate::ingestion::dirty_tracker;
|
||||
use crate::gitlab::GitLabClient;
|
||||
use crate::gitlab::transformers::{MilestoneRow, transform_issue};
|
||||
use crate::gitlab::types::GitLabIssue;
|
||||
@@ -297,6 +299,9 @@ fn process_issue_in_transaction(
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
// Mark dirty for document regeneration (inside transaction)
|
||||
dirty_tracker::mark_dirty_tx(tx, SourceType::Issue, local_issue_id)?;
|
||||
|
||||
// Clear existing label links (stale removal)
|
||||
tx.execute(
|
||||
"DELETE FROM issue_labels WHERE issue_id = ?",
|
||||
@@ -470,7 +475,7 @@ fn get_issues_needing_discussion_sync(
|
||||
fn parse_timestamp(ts: &str) -> Result<i64> {
|
||||
chrono::DateTime::parse_from_rfc3339(ts)
|
||||
.map(|dt| dt.timestamp_millis())
|
||||
.map_err(|e| GiError::Other(format!("Failed to parse timestamp '{}': {}", ts, e)))
|
||||
.map_err(|e| LoreError::Other(format!("Failed to parse timestamp '{}': {}", ts, e)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -13,9 +13,11 @@ use rusqlite::{Connection, Transaction, params};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::Config;
|
||||
use crate::core::error::{GiError, Result};
|
||||
use crate::core::error::{LoreError, Result};
|
||||
use crate::core::payloads::{StorePayloadOptions, store_payload};
|
||||
use crate::core::time::now_ms;
|
||||
use crate::documents::SourceType;
|
||||
use crate::ingestion::dirty_tracker;
|
||||
use crate::gitlab::GitLabClient;
|
||||
use crate::gitlab::transformers::merge_request::transform_merge_request;
|
||||
use crate::gitlab::types::GitLabMergeRequest;
|
||||
@@ -166,7 +168,7 @@ fn process_single_mr(
|
||||
// Transform MR first (outside transaction - no DB access)
|
||||
let payload_json = serde_json::to_value(mr)?;
|
||||
let transformed = transform_merge_request(mr, project_id)
|
||||
.map_err(|e| GiError::Other(format!("MR transform failed: {}", e)))?;
|
||||
.map_err(|e| LoreError::Other(format!("MR transform failed: {}", e)))?;
|
||||
|
||||
// Wrap all DB operations in a transaction for atomicity
|
||||
let tx = conn.unchecked_transaction()?;
|
||||
@@ -263,6 +265,9 @@ fn process_mr_in_transaction(
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
// Mark dirty for document regeneration (inside transaction)
|
||||
dirty_tracker::mark_dirty_tx(tx, SourceType::MergeRequest, local_mr_id)?;
|
||||
|
||||
// Clear-and-relink labels
|
||||
tx.execute(
|
||||
"DELETE FROM mr_labels WHERE merge_request_id = ?",
|
||||
@@ -448,7 +453,7 @@ pub fn get_mrs_needing_discussion_sync(
|
||||
fn parse_timestamp(ts: &str) -> Result<i64> {
|
||||
chrono::DateTime::parse_from_rfc3339(ts)
|
||||
.map(|dt| dt.timestamp_millis())
|
||||
.map_err(|e| GiError::Other(format!("Failed to parse timestamp '{}': {}", ts, e)))
|
||||
.map_err(|e| LoreError::Other(format!("Failed to parse timestamp '{}': {}", ts, e)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
//! This module handles fetching and storing issues, discussions, and notes
|
||||
//! from GitLab with cursor-based incremental sync.
|
||||
|
||||
pub mod dirty_tracker;
|
||||
pub mod discussion_queue;
|
||||
pub mod discussions;
|
||||
pub mod issues;
|
||||
pub mod merge_requests;
|
||||
|
||||
@@ -18,6 +18,8 @@ use crate::Config;
|
||||
use crate::core::error::Result;
|
||||
use crate::core::payloads::{StorePayloadOptions, store_payload};
|
||||
use crate::core::time::now_ms;
|
||||
use crate::documents::SourceType;
|
||||
use crate::ingestion::dirty_tracker;
|
||||
use crate::gitlab::GitLabClient;
|
||||
use crate::gitlab::transformers::{
|
||||
NormalizedDiscussion, NormalizedNote, transform_mr_discussion,
|
||||
@@ -189,6 +191,9 @@ pub fn write_prefetched_mr_discussions(
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
// Mark dirty for document regeneration (inside transaction)
|
||||
dirty_tracker::mark_dirty_tx(&tx, SourceType::Discussion, local_discussion_id)?;
|
||||
|
||||
// Upsert notes
|
||||
for note in &disc.notes {
|
||||
let should_store_payload = !note.is_system
|
||||
@@ -402,6 +407,9 @@ async fn ingest_discussions_for_mr(
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
|
||||
// Mark dirty for document regeneration (inside transaction)
|
||||
dirty_tracker::mark_dirty_tx(&tx, SourceType::Discussion, local_discussion_id)?;
|
||||
|
||||
// Upsert notes (not delete-all-then-insert)
|
||||
for note in ¬es {
|
||||
// Selective payload storage: skip system notes without position
|
||||
|
||||
@@ -5,7 +5,10 @@
|
||||
|
||||
pub mod cli;
|
||||
pub mod core;
|
||||
pub mod documents;
|
||||
pub mod embedding;
|
||||
pub mod gitlab;
|
||||
pub mod ingestion;
|
||||
pub mod search;
|
||||
|
||||
pub use core::{Config, GiError, Result};
|
||||
pub use core::{Config, LoreError, Result};
|
||||
|
||||
646
src/main.rs
646
src/main.rs
@@ -10,17 +10,23 @@ use tracing_subscriber::util::SubscriberInitExt;
|
||||
|
||||
use lore::Config;
|
||||
use lore::cli::commands::{
|
||||
InitInputs, InitOptions, ListFilters, MrListFilters, open_issue_in_browser, open_mr_in_browser,
|
||||
print_count, print_count_json, print_doctor_results, print_ingest_summary,
|
||||
print_ingest_summary_json, print_list_issues, print_list_issues_json, print_list_mrs,
|
||||
print_list_mrs_json, print_show_issue, print_show_issue_json, print_show_mr,
|
||||
InitInputs, InitOptions, InitResult, ListFilters, MrListFilters, SearchCliFilters, open_issue_in_browser,
|
||||
open_mr_in_browser, print_count, print_count_json, print_doctor_results, print_generate_docs,
|
||||
print_generate_docs_json, print_ingest_summary, print_ingest_summary_json, print_list_issues,
|
||||
print_list_issues_json, print_list_mrs, print_list_mrs_json, print_search_results,
|
||||
print_search_results_json, print_show_issue, print_show_issue_json, print_show_mr, print_stats,
|
||||
print_stats_json,
|
||||
print_embed, print_embed_json, print_sync, print_sync_json,
|
||||
print_show_mr_json, print_sync_status, print_sync_status_json, run_auth_test, run_count,
|
||||
run_doctor, run_ingest, run_init, run_list_issues, run_list_mrs, run_show_issue, run_show_mr,
|
||||
run_sync_status,
|
||||
run_doctor, run_embed, run_generate_docs, run_ingest, run_init, run_list_issues, run_list_mrs,
|
||||
run_search, run_show_issue, run_show_mr, run_stats, run_sync, run_sync_status, SyncOptions,
|
||||
};
|
||||
use lore::cli::{
|
||||
Cli, Commands, CountArgs, EmbedArgs, GenerateDocsArgs, IngestArgs, IssuesArgs, MrsArgs,
|
||||
SearchArgs, StatsArgs, SyncArgs,
|
||||
};
|
||||
use lore::cli::{Cli, Commands, CountArgs, IngestArgs, IssuesArgs, MrsArgs};
|
||||
use lore::core::db::{create_connection, get_schema_version, run_migrations};
|
||||
use lore::core::error::{GiError, RobotErrorOutput};
|
||||
use lore::core::error::{LoreError, RobotErrorOutput};
|
||||
use lore::core::paths::get_config_path;
|
||||
use lore::core::paths::get_db_path;
|
||||
|
||||
@@ -49,6 +55,10 @@ async fn main() {
|
||||
let result = match cli.command {
|
||||
Commands::Issues(args) => handle_issues(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Mrs(args) => handle_mrs(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Search(args) => handle_search(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Stats(args) => handle_stats(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Embed(args) => handle_embed(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Sync(args) => handle_sync_cmd(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Ingest(args) => handle_ingest(cli.config.as_deref(), args, robot_mode).await,
|
||||
Commands::Count(args) => {
|
||||
handle_count(cli.config.as_deref(), args, robot_mode).await
|
||||
@@ -60,10 +70,29 @@ async fn main() {
|
||||
Commands::Init {
|
||||
force,
|
||||
non_interactive,
|
||||
} => handle_init(cli.config.as_deref(), force, non_interactive, robot_mode).await,
|
||||
gitlab_url,
|
||||
token_env_var,
|
||||
projects,
|
||||
} => {
|
||||
handle_init(
|
||||
cli.config.as_deref(),
|
||||
force,
|
||||
non_interactive,
|
||||
robot_mode,
|
||||
gitlab_url,
|
||||
token_env_var,
|
||||
projects,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Commands::GenerateDocs(args) => {
|
||||
handle_generate_docs(cli.config.as_deref(), args, robot_mode).await
|
||||
}
|
||||
Commands::Backup => handle_backup(robot_mode),
|
||||
Commands::Reset { yes: _ } => handle_reset(robot_mode),
|
||||
Commands::Migrate => handle_migrate(cli.config.as_deref(), robot_mode).await,
|
||||
Commands::Health => handle_health(cli.config.as_deref(), robot_mode).await,
|
||||
Commands::RobotDocs => handle_robot_docs(robot_mode),
|
||||
|
||||
// --- Backward-compat: deprecated aliases ---
|
||||
Commands::List {
|
||||
@@ -159,7 +188,7 @@ async fn main() {
|
||||
}
|
||||
}
|
||||
|
||||
/// Fallback error output for non-GiError errors in robot mode.
|
||||
/// Fallback error output for non-LoreError errors in robot mode.
|
||||
#[derive(Serialize)]
|
||||
struct FallbackErrorOutput {
|
||||
error: FallbackError,
|
||||
@@ -172,8 +201,8 @@ struct FallbackError {
|
||||
}
|
||||
|
||||
fn handle_error(e: Box<dyn std::error::Error>, robot_mode: bool) -> ! {
|
||||
// Try to downcast to GiError for structured output
|
||||
if let Some(gi_error) = e.downcast_ref::<GiError>() {
|
||||
// Try to downcast to LoreError for structured output
|
||||
if let Some(gi_error) = e.downcast_ref::<LoreError>() {
|
||||
if robot_mode {
|
||||
let output = RobotErrorOutput::from(gi_error);
|
||||
// Use serde_json for safe serialization; fallback constructs JSON safely
|
||||
@@ -201,7 +230,7 @@ fn handle_error(e: Box<dyn std::error::Error>, robot_mode: bool) -> ! {
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback for non-GiError errors - use serde for proper JSON escaping
|
||||
// Fallback for non-LoreError errors - use serde for proper JSON escaping
|
||||
if robot_mode {
|
||||
let output = FallbackErrorOutput {
|
||||
error: FallbackError {
|
||||
@@ -473,22 +502,123 @@ async fn handle_sync_status_cmd(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// JSON output for init command.
#[derive(Serialize)]
struct InitOutput {
    // Always true on this path; failures are reported via the error output
    // types instead of an ok=false payload.
    ok: bool,
    data: InitOutputData,
}

/// Payload of a successful `init`: where config/data live, the authenticated
/// user, and the configured projects.
#[derive(Serialize)]
struct InitOutputData {
    config_path: String,
    data_dir: String,
    user: InitOutputUser,
    projects: Vec<InitOutputProject>,
}

/// The GitLab user the token authenticated as.
#[derive(Serialize)]
struct InitOutputUser {
    username: String,
    name: String,
}

/// One configured project. `path` is presumably the namespaced path
/// (e.g. "group/project") — TODO confirm against InitResult.
#[derive(Serialize)]
struct InitOutputProject {
    path: String,
    name: String,
}
|
||||
|
||||
fn print_init_json(result: &InitResult) {
|
||||
let output = InitOutput {
|
||||
ok: true,
|
||||
data: InitOutputData {
|
||||
config_path: result.config_path.clone(),
|
||||
data_dir: result.data_dir.clone(),
|
||||
user: InitOutputUser {
|
||||
username: result.user.username.clone(),
|
||||
name: result.user.name.clone(),
|
||||
},
|
||||
projects: result
|
||||
.projects
|
||||
.iter()
|
||||
.map(|p| InitOutputProject {
|
||||
path: p.path.clone(),
|
||||
name: p.name.clone(),
|
||||
})
|
||||
.collect(),
|
||||
},
|
||||
};
|
||||
println!("{}", serde_json::to_string(&output).unwrap());
|
||||
}
|
||||
|
||||
async fn handle_init(
|
||||
config_override: Option<&str>,
|
||||
force: bool,
|
||||
non_interactive: bool,
|
||||
_robot_mode: bool, // TODO: Add robot mode support for init (requires non-interactive implementation)
|
||||
robot_mode: bool,
|
||||
gitlab_url_flag: Option<String>,
|
||||
token_env_var_flag: Option<String>,
|
||||
projects_flag: Option<String>,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Robot mode: require all inputs via flags, skip interactive prompts
|
||||
if robot_mode {
|
||||
let missing: Vec<&str> = [
|
||||
gitlab_url_flag.is_none().then_some("--gitlab-url"),
|
||||
token_env_var_flag.is_none().then_some("--token-env-var"),
|
||||
projects_flag.is_none().then_some("--projects"),
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect();
|
||||
|
||||
if !missing.is_empty() {
|
||||
let output = RobotErrorWithSuggestion {
|
||||
error: RobotErrorSuggestionData {
|
||||
code: "MISSING_FLAGS".to_string(),
|
||||
message: format!("Robot mode requires flags: {}", missing.join(", ")),
|
||||
suggestion: "lore --robot init --gitlab-url https://gitlab.com --token-env-var GITLAB_TOKEN --projects group/project".to_string(),
|
||||
},
|
||||
};
|
||||
eprintln!("{}", serde_json::to_string(&output)?);
|
||||
std::process::exit(2);
|
||||
}
|
||||
|
||||
let project_paths: Vec<String> = projects_flag
|
||||
.unwrap()
|
||||
.split(',')
|
||||
.map(|p| p.trim().to_string())
|
||||
.filter(|p| !p.is_empty())
|
||||
.collect();
|
||||
|
||||
let result = run_init(
|
||||
InitInputs {
|
||||
gitlab_url: gitlab_url_flag.unwrap(),
|
||||
token_env_var: token_env_var_flag.unwrap(),
|
||||
project_paths,
|
||||
},
|
||||
InitOptions {
|
||||
config_path: config_override.map(String::from),
|
||||
force: true,
|
||||
non_interactive: true,
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
print_init_json(&result);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Human mode: interactive prompts
|
||||
let config_path = get_config_path(config_override);
|
||||
let mut confirmed_overwrite = force;
|
||||
|
||||
// Check if config exists and handle overwrite
|
||||
if config_path.exists() {
|
||||
if config_path.exists() && !force {
|
||||
if non_interactive {
|
||||
eprintln!(
|
||||
"{}",
|
||||
style(format!(
|
||||
"Config file exists at {}. Cannot proceed in non-interactive mode.",
|
||||
"Config file exists at {}. Use --force to overwrite.",
|
||||
config_path.display()
|
||||
))
|
||||
.red()
|
||||
@@ -496,59 +626,70 @@ async fn handle_init(
|
||||
std::process::exit(2);
|
||||
}
|
||||
|
||||
if !force {
|
||||
let confirm = Confirm::new()
|
||||
.with_prompt(format!(
|
||||
"Config file exists at {}. Overwrite?",
|
||||
config_path.display()
|
||||
))
|
||||
.default(false)
|
||||
.interact()?;
|
||||
let confirm = Confirm::new()
|
||||
.with_prompt(format!(
|
||||
"Config file exists at {}. Overwrite?",
|
||||
config_path.display()
|
||||
))
|
||||
.default(false)
|
||||
.interact()?;
|
||||
|
||||
if !confirm {
|
||||
println!("{}", style("Cancelled.").yellow());
|
||||
std::process::exit(2);
|
||||
}
|
||||
confirmed_overwrite = true;
|
||||
if !confirm {
|
||||
println!("{}", style("Cancelled.").yellow());
|
||||
std::process::exit(2);
|
||||
}
|
||||
confirmed_overwrite = true;
|
||||
}
|
||||
|
||||
// Prompt for GitLab URL
|
||||
let gitlab_url: String = Input::new()
|
||||
.with_prompt("GitLab URL")
|
||||
.default("https://gitlab.com".to_string())
|
||||
.validate_with(|input: &String| -> Result<(), &str> {
|
||||
if url::Url::parse(input).is_ok() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err("Please enter a valid URL")
|
||||
}
|
||||
})
|
||||
.interact_text()?;
|
||||
let gitlab_url: String = if let Some(url) = gitlab_url_flag {
|
||||
url
|
||||
} else {
|
||||
Input::new()
|
||||
.with_prompt("GitLab URL")
|
||||
.default("https://gitlab.com".to_string())
|
||||
.validate_with(|input: &String| -> Result<(), &str> {
|
||||
if url::Url::parse(input).is_ok() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err("Please enter a valid URL")
|
||||
}
|
||||
})
|
||||
.interact_text()?
|
||||
};
|
||||
|
||||
// Prompt for token env var
|
||||
let token_env_var: String = Input::new()
|
||||
.with_prompt("Token environment variable name")
|
||||
.default("GITLAB_TOKEN".to_string())
|
||||
.interact_text()?;
|
||||
let token_env_var: String = if let Some(var) = token_env_var_flag {
|
||||
var
|
||||
} else {
|
||||
Input::new()
|
||||
.with_prompt("Token environment variable name")
|
||||
.default("GITLAB_TOKEN".to_string())
|
||||
.interact_text()?
|
||||
};
|
||||
|
||||
// Prompt for project paths
|
||||
let project_paths_input: String = Input::new()
|
||||
.with_prompt("Project paths (comma-separated, e.g., group/project)")
|
||||
.validate_with(|input: &String| -> Result<(), &str> {
|
||||
if input.trim().is_empty() {
|
||||
Err("Please enter at least one project path")
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.interact_text()?;
|
||||
let project_paths: Vec<String> = if let Some(projects) = projects_flag {
|
||||
projects
|
||||
.split(',')
|
||||
.map(|p| p.trim().to_string())
|
||||
.filter(|p| !p.is_empty())
|
||||
.collect()
|
||||
} else {
|
||||
let project_paths_input: String = Input::new()
|
||||
.with_prompt("Project paths (comma-separated, e.g., group/project)")
|
||||
.validate_with(|input: &String| -> Result<(), &str> {
|
||||
if input.trim().is_empty() {
|
||||
Err("Please enter at least one project path")
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.interact_text()?;
|
||||
|
||||
let project_paths: Vec<String> = project_paths_input
|
||||
.split(',')
|
||||
.map(|p| p.trim().to_string())
|
||||
.filter(|p| !p.is_empty())
|
||||
.collect();
|
||||
project_paths_input
|
||||
.split(',')
|
||||
.map(|p| p.trim().to_string())
|
||||
.filter(|p| !p.is_empty())
|
||||
.collect()
|
||||
};
|
||||
|
||||
println!("{}", style("\nValidating configuration...").blue());
|
||||
|
||||
@@ -840,6 +981,385 @@ async fn handle_migrate(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_stats(
|
||||
config_override: Option<&str>,
|
||||
args: StatsArgs,
|
||||
robot_mode: bool,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let config = Config::load(config_override)?;
|
||||
let result = run_stats(&config, args.check, args.repair)?;
|
||||
if robot_mode {
|
||||
print_stats_json(&result);
|
||||
} else {
|
||||
print_stats(&result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle the `search` command.
///
/// Loads configuration, translates CLI flags into search filters, runs the
/// search pipeline, and prints results (JSON with latency in robot mode,
/// pretty output otherwise).
async fn handle_search(
    config_override: Option<&str>,
    args: SearchArgs,
    robot_mode: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let config = Config::load(config_override)?;

    // Map --fts-mode onto the FTS query mode. Anything other than "raw"
    // (including typos) silently falls back to the safe quoting mode.
    let fts_mode = match args.fts_mode.as_str() {
        "raw" => lore::search::FtsQueryMode::Raw,
        _ => lore::search::FtsQueryMode::Safe,
    };

    // Bundle the raw CLI filter flags; resolving them to concrete filter
    // values is the responsibility of run_search.
    let cli_filters = SearchCliFilters {
        source_type: args.source_type,
        author: args.author,
        project: args.project,
        labels: args.label,
        path: args.path,
        after: args.after,
        updated_after: args.updated_after,
        limit: args.limit,
    };

    // Time the whole search call so robot output can report latency.
    let start = std::time::Instant::now();
    let response = run_search(&config, &args.query, cli_filters, fts_mode, args.explain)?;
    let elapsed_ms = start.elapsed().as_millis() as u64;

    if robot_mode {
        print_search_results_json(&response, elapsed_ms);
    } else {
        print_search_results(&response);
    }
    Ok(())
}
|
||||
|
||||
async fn handle_generate_docs(
|
||||
config_override: Option<&str>,
|
||||
args: GenerateDocsArgs,
|
||||
robot_mode: bool,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let config = Config::load(config_override)?;
|
||||
|
||||
let result = run_generate_docs(&config, args.full, args.project.as_deref())?;
|
||||
if robot_mode {
|
||||
print_generate_docs_json(&result);
|
||||
} else {
|
||||
print_generate_docs(&result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_embed(
|
||||
config_override: Option<&str>,
|
||||
args: EmbedArgs,
|
||||
robot_mode: bool,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let config = Config::load(config_override)?;
|
||||
let result = run_embed(&config, args.retry_failed).await?;
|
||||
if robot_mode {
|
||||
print_embed_json(&result);
|
||||
} else {
|
||||
print_embed(&result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle the `sync` command: the full pipeline (ingest -> generate-docs ->
/// embed), with stages optionally skipped via flags.
async fn handle_sync_cmd(
    config_override: Option<&str>,
    args: SyncArgs,
    robot_mode: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let config = Config::load(config_override)?;
    // Translate CLI flags into pipeline options; --no-embed / --no-docs
    // skip the corresponding stages.
    let options = SyncOptions {
        full: args.full,
        force: args.force,
        no_embed: args.no_embed,
        no_docs: args.no_docs,
    };

    // Measure wall-clock duration of the whole pipeline for reporting.
    let start = std::time::Instant::now();
    let result = run_sync(&config, options).await?;
    let elapsed = start.elapsed();

    if robot_mode {
        print_sync_json(&result, elapsed.as_millis() as u64);
    } else {
        print_sync(&result, elapsed);
    }
    Ok(())
}
|
||||
|
||||
// ============================================================================
|
||||
// Health + Robot-docs handlers
|
||||
// ============================================================================
|
||||
|
||||
/// JSON output for health command.
/// Top-level envelope emitted on stdout in robot mode.
#[derive(Serialize)]
struct HealthOutput {
    // Always set to true by handle_health when output is produced; the
    // actual health verdict lives in `data.healthy`.
    ok: bool,
    data: HealthData,
}

/// Individual check results reported by `lore health`.
#[derive(Serialize)]
struct HealthData {
    // Overall verdict: config_found && db_found && schema_current.
    healthy: bool,
    // Config file exists at the resolved config path.
    config_found: bool,
    // Database file exists (true even if it could not be opened).
    db_found: bool,
    // Schema version is at least the expected number of migrations.
    schema_current: bool,
    // Version read from the database; 0 when unknown or unreadable.
    schema_version: i32,
}
|
||||
|
||||
/// Handle the `health` command: a quick pre-flight check covering config
/// file presence, database presence, and schema currency.
///
/// Prints JSON in robot mode, colored pass/FAIL lines otherwise, and exits
/// with status 1 when any check fails.
async fn handle_health(
    config_override: Option<&str>,
    robot_mode: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let config_path = get_config_path(config_override);
    let config_found = config_path.exists();

    // Probe the database only when the config exists and loads. Any failure
    // along the way degrades to (not found, version 0, not current) instead
    // of propagating an error — this command reports status, it doesn't fail.
    let (db_found, schema_version, schema_current) = if config_found {
        match Config::load(config_override) {
            Ok(config) => {
                let db_path = get_db_path(config.storage.db_path.as_deref());
                if db_path.exists() {
                    match create_connection(&db_path) {
                        Ok(conn) => {
                            let version = get_schema_version(&conn);
                            // NOTE(review): hard-coded migration count; must be
                            // kept in sync with the embedded migrations list.
                            let latest = 9; // Number of embedded migrations
                            (true, version, version >= latest)
                        }
                        // DB file exists but could not be opened: report it
                        // as found with an unknown (0) schema version.
                        Err(_) => (true, 0, false),
                    }
                } else {
                    (false, 0, false)
                }
            }
            Err(_) => (false, 0, false),
        }
    } else {
        (false, 0, false)
    };

    let healthy = config_found && db_found && schema_current;

    if robot_mode {
        // Machine-readable report. `ok` means the command itself ran;
        // the verdict is `data.healthy`.
        let output = HealthOutput {
            ok: true,
            data: HealthData {
                healthy,
                config_found,
                db_found,
                schema_current,
                schema_version,
            },
        };
        println!("{}", serde_json::to_string(&output)?);
    } else {
        // One colored pass/FAIL line per check, then an overall verdict.
        let status = |ok: bool| {
            if ok {
                style("pass").green()
            } else {
                style("FAIL").red()
            }
        };
        println!("Config: {} ({})", status(config_found), config_path.display());
        println!("DB: {}", status(db_found));
        println!(
            "Schema: {} (v{})",
            status(schema_current),
            schema_version
        );
        println!();
        if healthy {
            println!("{}", style("Healthy").green().bold());
        } else {
            println!("{}", style("Unhealthy - run 'lore doctor' for details").red().bold());
        }
    }

    // Non-zero exit signals failure to scripts and agents.
    if !healthy {
        std::process::exit(1);
    }

    Ok(())
}
|
||||
|
||||
/// JSON output for robot-docs command.
|
||||
#[derive(Serialize)]
|
||||
struct RobotDocsOutput {
|
||||
ok: bool,
|
||||
data: RobotDocsData,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct RobotDocsData {
|
||||
name: String,
|
||||
version: String,
|
||||
description: String,
|
||||
activation: RobotDocsActivation,
|
||||
commands: serde_json::Value,
|
||||
exit_codes: serde_json::Value,
|
||||
error_format: String,
|
||||
workflows: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct RobotDocsActivation {
|
||||
flags: Vec<String>,
|
||||
env: String,
|
||||
auto: String,
|
||||
}
|
||||
|
||||
fn handle_robot_docs(robot_mode: bool) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let version = env!("CARGO_PKG_VERSION").to_string();
|
||||
|
||||
let commands = serde_json::json!({
|
||||
"init": {
|
||||
"description": "Initialize configuration and database",
|
||||
"flags": ["--force", "--non-interactive", "--gitlab-url <URL>", "--token-env-var <VAR>", "--projects <paths>"],
|
||||
"robot_flags": ["--gitlab-url", "--token-env-var", "--projects"],
|
||||
"example": "lore --robot init --gitlab-url https://gitlab.com --token-env-var GITLAB_TOKEN --projects group/project"
|
||||
},
|
||||
"health": {
|
||||
"description": "Quick pre-flight check: config, database, schema version",
|
||||
"flags": [],
|
||||
"example": "lore --robot health"
|
||||
},
|
||||
"auth": {
|
||||
"description": "Verify GitLab authentication",
|
||||
"flags": [],
|
||||
"example": "lore --robot auth"
|
||||
},
|
||||
"doctor": {
|
||||
"description": "Full environment health check (config, auth, DB, Ollama)",
|
||||
"flags": [],
|
||||
"example": "lore --robot doctor"
|
||||
},
|
||||
"ingest": {
|
||||
"description": "Sync data from GitLab",
|
||||
"flags": ["--project <path>", "--force", "--full", "<entity: issues|mrs>"],
|
||||
"example": "lore --robot ingest issues --project group/repo"
|
||||
},
|
||||
"sync": {
|
||||
"description": "Full sync pipeline: ingest -> generate-docs -> embed",
|
||||
"flags": ["--full", "--force", "--no-embed", "--no-docs"],
|
||||
"example": "lore --robot sync"
|
||||
},
|
||||
"issues": {
|
||||
"description": "List or show issues",
|
||||
"flags": ["<IID>", "--limit", "--state", "--project", "--author", "--assignee", "--label", "--milestone", "--since", "--due-before", "--has-due", "--sort", "--asc"],
|
||||
"example": "lore --robot issues --state opened --limit 10"
|
||||
},
|
||||
"mrs": {
|
||||
"description": "List or show merge requests",
|
||||
"flags": ["<IID>", "--limit", "--state", "--project", "--author", "--assignee", "--reviewer", "--label", "--since", "--draft", "--no-draft", "--target", "--source", "--sort", "--asc"],
|
||||
"example": "lore --robot mrs --state opened"
|
||||
},
|
||||
"search": {
|
||||
"description": "Search indexed documents (lexical, hybrid, semantic)",
|
||||
"flags": ["<QUERY>", "--mode", "--type", "--author", "--project", "--label", "--path", "--after", "--updated-after", "--limit", "--explain", "--fts-mode"],
|
||||
"example": "lore --robot search 'authentication bug' --mode hybrid --limit 10"
|
||||
},
|
||||
"count": {
|
||||
"description": "Count entities in local database",
|
||||
"flags": ["<entity: issues|mrs|discussions|notes>", "--for <issue|mr>"],
|
||||
"example": "lore --robot count issues"
|
||||
},
|
||||
"stats": {
|
||||
"description": "Show document and index statistics",
|
||||
"flags": ["--check", "--repair"],
|
||||
"example": "lore --robot stats"
|
||||
},
|
||||
"status": {
|
||||
"description": "Show sync state (cursors, last sync times)",
|
||||
"flags": [],
|
||||
"example": "lore --robot status"
|
||||
},
|
||||
"generate-docs": {
|
||||
"description": "Generate searchable documents from ingested data",
|
||||
"flags": ["--full", "--project <path>"],
|
||||
"example": "lore --robot generate-docs --full"
|
||||
},
|
||||
"embed": {
|
||||
"description": "Generate vector embeddings for documents via Ollama",
|
||||
"flags": ["--retry-failed"],
|
||||
"example": "lore --robot embed"
|
||||
},
|
||||
"migrate": {
|
||||
"description": "Run pending database migrations",
|
||||
"flags": [],
|
||||
"example": "lore --robot migrate"
|
||||
},
|
||||
"version": {
|
||||
"description": "Show version information",
|
||||
"flags": [],
|
||||
"example": "lore --robot version"
|
||||
},
|
||||
"robot-docs": {
|
||||
"description": "This command (agent self-discovery manifest)",
|
||||
"flags": [],
|
||||
"example": "lore robot-docs"
|
||||
}
|
||||
});
|
||||
|
||||
let exit_codes = serde_json::json!({
|
||||
"0": "Success",
|
||||
"1": "Internal error / health check failed",
|
||||
"2": "Config not found / missing flags",
|
||||
"3": "Config invalid",
|
||||
"4": "Token not set",
|
||||
"5": "GitLab auth failed",
|
||||
"6": "Resource not found",
|
||||
"7": "Rate limited",
|
||||
"8": "Network error",
|
||||
"9": "Database locked",
|
||||
"10": "Database error",
|
||||
"11": "Migration failed",
|
||||
"12": "I/O error",
|
||||
"13": "Transform error"
|
||||
});
|
||||
|
||||
let workflows = serde_json::json!({
|
||||
"first_setup": [
|
||||
"lore --robot init --gitlab-url https://gitlab.com --token-env-var GITLAB_TOKEN --projects group/project",
|
||||
"lore --robot doctor",
|
||||
"lore --robot sync"
|
||||
],
|
||||
"daily_sync": [
|
||||
"lore --robot sync"
|
||||
],
|
||||
"search": [
|
||||
"lore --robot search 'query' --mode hybrid"
|
||||
],
|
||||
"pre_flight": [
|
||||
"lore --robot health"
|
||||
]
|
||||
});
|
||||
|
||||
let output = RobotDocsOutput {
|
||||
ok: true,
|
||||
data: RobotDocsData {
|
||||
name: "lore".to_string(),
|
||||
version,
|
||||
description: "Local GitLab data management with semantic search".to_string(),
|
||||
activation: RobotDocsActivation {
|
||||
flags: vec!["--robot".to_string(), "-J".to_string(), "--json".to_string()],
|
||||
env: "LORE_ROBOT=1".to_string(),
|
||||
auto: "Non-TTY stdout".to_string(),
|
||||
},
|
||||
commands,
|
||||
exit_codes,
|
||||
error_format: "stderr JSON: {\"error\":{\"code\":\"...\",\"message\":\"...\",\"suggestion\":\"...\"}}".to_string(),
|
||||
workflows,
|
||||
},
|
||||
};
|
||||
|
||||
if robot_mode {
|
||||
println!("{}", serde_json::to_string(&output)?);
|
||||
} else {
|
||||
println!("{}", serde_json::to_string_pretty(&output)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Backward-compat handlers (deprecated, delegate to new handlers)
|
||||
// ============================================================================
|
||||
|
||||
227
src/search/filters.rs
Normal file
227
src/search/filters.rs
Normal file
@@ -0,0 +1,227 @@
|
||||
use crate::core::error::Result;
|
||||
use crate::documents::SourceType;
|
||||
use rusqlite::Connection;
|
||||
|
||||
const DEFAULT_LIMIT: usize = 20;
|
||||
const MAX_LIMIT: usize = 100;
|
||||
|
||||
/// Path filter: exact match or prefix match (trailing `/`).
#[derive(Debug, Clone)]
pub enum PathFilter {
    // Document path must equal this string exactly.
    Exact(String),
    // Document path must start with this string (LIKE 'prefix%').
    Prefix(String),
}

/// Filters applied to search results post-retrieval.
///
/// All fields are optional (or empty). `limit` of 0 means "use the default";
/// see `clamp_limit`. `after` / `updated_after` are compared with `>=`
/// against `documents.created_at` / `documents.updated_at` — presumably
/// Unix timestamps; confirm against the documents schema.
#[derive(Debug, Clone, Default)]
pub struct SearchFilters {
    pub source_type: Option<SourceType>,
    pub author: Option<String>,
    pub project_id: Option<i64>,
    // Only documents created at or after this value.
    pub after: Option<i64>,
    // Only documents updated at or after this value.
    pub updated_after: Option<i64>,
    // Conjunctive: a document must carry ALL listed labels.
    pub labels: Vec<String>,
    pub path: Option<PathFilter>,
    pub limit: usize,
}
|
||||
|
||||
impl SearchFilters {
|
||||
/// Returns true if any filter (besides limit) is set.
|
||||
pub fn has_any_filter(&self) -> bool {
|
||||
self.source_type.is_some()
|
||||
|| self.author.is_some()
|
||||
|| self.project_id.is_some()
|
||||
|| self.after.is_some()
|
||||
|| self.updated_after.is_some()
|
||||
|| !self.labels.is_empty()
|
||||
|| self.path.is_some()
|
||||
}
|
||||
|
||||
/// Clamp limit to [1, 100], defaulting 0 to 20.
|
||||
pub fn clamp_limit(&self) -> usize {
|
||||
if self.limit == 0 {
|
||||
DEFAULT_LIMIT
|
||||
} else {
|
||||
self.limit.min(MAX_LIMIT)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Escape backslash and the SQL LIKE wildcards `%` / `_` so the string
/// matches literally when used in a LIKE pattern with `ESCAPE '\'`.
fn escape_like(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        // Prefix each special character with a backslash; the backslash
        // itself must be escaped first and is handled by the same branch.
        if matches!(ch, '\\' | '%' | '_') {
            out.push('\\');
        }
        out.push(ch);
    }
    out
}
|
||||
|
||||
/// Apply filters to a ranked list of document IDs, preserving rank order.
///
/// Uses json_each() to pass ranked IDs efficiently and maintain ordering
/// via ORDER BY j.key.
///
/// The SQL is assembled dynamically, but only placeholder numbers are
/// interpolated into the string — every user-supplied value is bound as a
/// parameter, never spliced into the SQL.
pub fn apply_filters(
    conn: &Connection,
    document_ids: &[i64],
    filters: &SearchFilters,
) -> Result<Vec<i64>> {
    // No candidates -> nothing to filter.
    if document_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Encode the ranked IDs as a JSON array; json_each() expands it into
    // rows whose j.key is the original array index (i.e. the rank).
    let ids_json = serde_json::to_string(document_ids)
        .map_err(|e| crate::core::error::LoreError::Other(e.to_string()))?;

    // "WHERE 1=1" lets each optional condition append uniformly as " AND ...".
    let mut sql = String::from(
        "SELECT d.id FROM json_each(?1) AS j JOIN documents d ON d.id = j.value WHERE 1=1",
    );
    let mut params: Vec<Box<dyn rusqlite::types::ToSql>> = vec![Box::new(ids_json)];
    // ?1 holds the JSON array, so dynamic parameters start at ?2. The index
    // must advance in lockstep with every params.push below.
    let mut param_idx = 2;

    if let Some(ref st) = filters.source_type {
        sql.push_str(&format!(" AND d.source_type = ?{}", param_idx));
        params.push(Box::new(st.as_str().to_string()));
        param_idx += 1;
    }

    if let Some(ref author) = filters.author {
        sql.push_str(&format!(" AND d.author_username = ?{}", param_idx));
        params.push(Box::new(author.clone()));
        param_idx += 1;
    }

    if let Some(pid) = filters.project_id {
        sql.push_str(&format!(" AND d.project_id = ?{}", param_idx));
        params.push(Box::new(pid));
        param_idx += 1;
    }

    if let Some(after) = filters.after {
        sql.push_str(&format!(" AND d.created_at >= ?{}", param_idx));
        params.push(Box::new(after));
        param_idx += 1;
    }

    if let Some(updated_after) = filters.updated_after {
        sql.push_str(&format!(" AND d.updated_at >= ?{}", param_idx));
        params.push(Box::new(updated_after));
        param_idx += 1;
    }

    // Label filtering is conjunctive: one EXISTS subquery per label, so a
    // document must carry every requested label.
    for label in &filters.labels {
        sql.push_str(&format!(
            " AND EXISTS (SELECT 1 FROM document_labels dl WHERE dl.document_id = d.id AND dl.label_name = ?{})",
            param_idx
        ));
        params.push(Box::new(label.clone()));
        param_idx += 1;
    }

    if let Some(ref path_filter) = filters.path {
        match path_filter {
            // Exact path equality against document_paths.
            PathFilter::Exact(p) => {
                sql.push_str(&format!(
                    " AND EXISTS (SELECT 1 FROM document_paths dp WHERE dp.document_id = d.id AND dp.path = ?{})",
                    param_idx
                ));
                params.push(Box::new(p.clone()));
                param_idx += 1;
            }
            // Prefix match via LIKE; the user's string is escaped first so
            // any embedded % or _ matches literally, then "%" is appended.
            PathFilter::Prefix(p) => {
                let escaped = escape_like(p);
                sql.push_str(&format!(
                    " AND EXISTS (SELECT 1 FROM document_paths dp WHERE dp.document_id = d.id AND dp.path LIKE ?{} ESCAPE '\\')",
                    param_idx
                ));
                params.push(Box::new(format!("{}%", escaped)));
                param_idx += 1;
            }
        }
    }

    // ORDER BY j.key restores the caller's rank order after filtering;
    // the limit is clamped (default 20, max 100) before binding.
    let limit = filters.clamp_limit();
    sql.push_str(&format!(
        " ORDER BY j.key LIMIT ?{}",
        param_idx
    ));
    params.push(Box::new(limit as i64));

    // rusqlite takes &dyn ToSql; borrow out of the boxed parameter list.
    let param_refs: Vec<&dyn rusqlite::types::ToSql> = params.iter().map(|p| p.as_ref()).collect();

    let mut stmt = conn.prepare(&sql)?;
    let ids = stmt
        .query_map(param_refs.as_slice(), |row| row.get::<_, i64>(0))?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(ids)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_has_any_filter_default() {
|
||||
let f = SearchFilters::default();
|
||||
assert!(!f.has_any_filter());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_has_any_filter_with_source_type() {
|
||||
let f = SearchFilters {
|
||||
source_type: Some(SourceType::Issue),
|
||||
..Default::default()
|
||||
};
|
||||
assert!(f.has_any_filter());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_has_any_filter_with_labels() {
|
||||
let f = SearchFilters {
|
||||
labels: vec!["bug".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
assert!(f.has_any_filter());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_limit_clamping_zero() {
|
||||
let f = SearchFilters {
|
||||
limit: 0,
|
||||
..Default::default()
|
||||
};
|
||||
assert_eq!(f.clamp_limit(), 20);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_limit_clamping_over_max() {
|
||||
let f = SearchFilters {
|
||||
limit: 200,
|
||||
..Default::default()
|
||||
};
|
||||
assert_eq!(f.clamp_limit(), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_limit_clamping_normal() {
|
||||
let f = SearchFilters {
|
||||
limit: 50,
|
||||
..Default::default()
|
||||
};
|
||||
assert_eq!(f.clamp_limit(), 50);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_like() {
|
||||
assert_eq!(escape_like("src/%.rs"), "src/\\%.rs");
|
||||
assert_eq!(escape_like("file_name"), "file\\_name");
|
||||
assert_eq!(escape_like("normal"), "normal");
|
||||
assert_eq!(escape_like("a\\b"), "a\\\\b");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_ids() {
|
||||
// Cannot test apply_filters without DB, but we can verify empty returns empty
|
||||
// by testing the early return path logic
|
||||
let f = SearchFilters::default();
|
||||
assert!(!f.has_any_filter());
|
||||
}
|
||||
}
|
||||
228
src/search/fts.rs
Normal file
228
src/search/fts.rs
Normal file
@@ -0,0 +1,228 @@
|
||||
use crate::core::error::Result;
|
||||
use rusqlite::Connection;
|
||||
|
||||
/// How raw user input is turned into an FTS5 MATCH expression.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FtsQueryMode {
    /// Safe mode: each token wrapped in quotes, trailing * preserved on alphanumeric tokens.
    Safe,
    /// Raw mode: query passed directly to FTS5 (for advanced users).
    Raw,
}

/// A single FTS5 search result.
#[derive(Debug)]
pub struct FtsResult {
    pub document_id: i64,
    /// BM25 relevance score; lower means a better match.
    pub bm25_score: f64,
    /// Contextual snippet with matches wrapped in <mark> tags.
    pub snippet: String,
}

/// Convert raw user input into a safe FTS5 query.
///
/// Safe mode:
/// - Splits on whitespace
/// - Wraps each token in double quotes (escaping internal quotes)
/// - Preserves trailing `*` on alphanumeric-only tokens (prefix search)
///
/// Raw mode: passes through unchanged. A blank input yields an empty
/// string in safe mode, which callers treat as "no query".
pub fn to_fts_query(raw: &str, mode: FtsQueryMode) -> String {
    if mode == FtsQueryMode::Raw {
        return raw.to_string();
    }
    // split_whitespace() yields nothing for blank input, so the join
    // naturally produces "" without an explicit emptiness check.
    raw.split_whitespace()
        .map(quote_token)
        .collect::<Vec<_>>()
        .join(" ")
}

/// Quote one token for FTS5. Tokens shaped like `stem*` with a purely
/// alphanumeric/underscore stem keep their prefix-search star outside the
/// quotes; everything else is quoted whole, with `"` doubled per FTS5 rules.
fn quote_token(token: &str) -> String {
    if let Some(stem) = token.strip_suffix('*') {
        let prefixable =
            !stem.is_empty() && stem.chars().all(|c| c.is_alphanumeric() || c == '_');
        if prefixable {
            return format!("\"{}\"*", stem.replace('"', "\"\""));
        }
    }
    format!("\"{}\"", token.replace('"', "\"\""))
}
|
||||
|
||||
/// Execute an FTS5 search query.
///
/// Returns results ranked by BM25 score (lower = better match) with
/// contextual snippets highlighting matches.
pub fn search_fts(
    conn: &Connection,
    query: &str,
    limit: usize,
    mode: FtsQueryMode,
) -> Result<Vec<FtsResult>> {
    // Normalize the user query first; an empty result (e.g. a
    // whitespace-only query in safe mode) short-circuits to no matches
    // instead of sending an empty MATCH expression to SQLite.
    let fts_query = to_fts_query(query, mode);
    if fts_query.is_empty() {
        return Ok(Vec::new());
    }

    // bm25() scores ascending with better matches first, so ORDER BY score
    // ranks best-first. snippet() args: column index 1, <mark>/</mark>
    // highlight markers, "..." ellipsis, up to 64 tokens of context.
    let sql = r#"
        SELECT d.id, bm25(documents_fts) AS score,
               snippet(documents_fts, 1, '<mark>', '</mark>', '...', 64) AS snip
        FROM documents_fts
        JOIN documents d ON d.id = documents_fts.rowid
        WHERE documents_fts MATCH ?1
        ORDER BY score
        LIMIT ?2
    "#;

    let mut stmt = conn.prepare(sql)?;
    let results = stmt
        .query_map(rusqlite::params![fts_query, limit as i64], |row| {
            Ok(FtsResult {
                document_id: row.get(0)?,
                bm25_score: row.get(1)?,
                snippet: row.get(2)?,
            })
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    Ok(results)
}
|
||||
|
||||
/// Generate a fallback snippet for results without FTS snippets.
///
/// Returns the text unchanged when it fits within `max_chars` characters;
/// otherwise truncates at a word boundary (last space) when one exists and
/// appends "...". Operates on char counts, so multi-byte text is cut at a
/// valid UTF-8 boundary.
pub fn generate_fallback_snippet(content_text: &str, max_chars: usize) -> String {
    // The byte offset of char number `max_chars` (0-based) exists only when
    // the text has more than `max_chars` characters; otherwise it all fits.
    let Some((cut, _)) = content_text.char_indices().nth(max_chars) else {
        return content_text.to_string();
    };

    let head = &content_text[..cut];
    // Prefer ending at the last space so we don't cut mid-word; with no
    // space at all, keep the whole truncated head.
    let end = head.rfind(' ').unwrap_or(head.len());
    format!("{}...", &head[..end])
}

/// Get the best snippet: prefer FTS snippet, fall back to truncated content.
pub fn get_result_snippet(fts_snippet: Option<&str>, content_text: &str) -> String {
    fts_snippet
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .unwrap_or_else(|| generate_fallback_snippet(content_text, 200))
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_safe_query_basic() {
|
||||
let result = to_fts_query("auth error", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"auth\" \"error\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_safe_query_prefix() {
|
||||
let result = to_fts_query("auth*", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"auth\"*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_safe_query_special_chars() {
|
||||
let result = to_fts_query("C++", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"C++\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_safe_query_dash() {
|
||||
let result = to_fts_query("-DWITH_SSL", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"-DWITH_SSL\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_safe_query_quotes() {
|
||||
let result = to_fts_query("he said \"hello\"", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"he\" \"said\" \"\"\"hello\"\"\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_mode_passthrough() {
|
||||
let result = to_fts_query("auth OR error", FtsQueryMode::Raw);
|
||||
assert_eq!(result, "auth OR error");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_query() {
|
||||
let result = to_fts_query("", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "");
|
||||
|
||||
let result = to_fts_query(" ", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prefix_only_alphanumeric() {
|
||||
// Non-alphanumeric prefix: C++* should NOT be treated as prefix search
|
||||
let result = to_fts_query("C++*", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"C++*\"");
|
||||
|
||||
// Pure alphanumeric prefix: auth* should be prefix search
|
||||
let result = to_fts_query("auth*", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"auth\"*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prefix_with_underscore() {
|
||||
let result = to_fts_query("jwt_token*", FtsQueryMode::Safe);
|
||||
assert_eq!(result, "\"jwt_token\"*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fallback_snippet_short() {
|
||||
let result = generate_fallback_snippet("Short content", 200);
|
||||
assert_eq!(result, "Short content");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fallback_snippet_word_boundary() {
|
||||
let content = "This is a moderately long piece of text that should be truncated at a word boundary for readability purposes";
|
||||
let result = generate_fallback_snippet(content, 50);
|
||||
assert!(result.ends_with("..."));
|
||||
assert!(result.len() <= 55); // 50 + "..."
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_result_snippet_prefers_fts() {
|
||||
let result = get_result_snippet(Some("FTS <mark>match</mark>"), "full content text");
|
||||
assert_eq!(result, "FTS <mark>match</mark>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_result_snippet_fallback() {
|
||||
let result = get_result_snippet(None, "full content text");
|
||||
assert_eq!(result, "full content text");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_result_snippet_empty_fts() {
|
||||
let result = get_result_snippet(Some(""), "full content text");
|
||||
assert_eq!(result, "full content text");
|
||||
}
|
||||
}
|
||||
258
src/search/hybrid.rs
Normal file
258
src/search/hybrid.rs
Normal file
@@ -0,0 +1,258 @@
|
||||
//! Hybrid search orchestrator combining FTS5 + sqlite-vec via RRF.
|
||||
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::ollama::OllamaClient;
|
||||
use crate::search::{rank_rrf, search_fts, search_vector, FtsQueryMode};
|
||||
use crate::search::filters::{apply_filters, SearchFilters};
|
||||
|
||||
const BASE_RECALL_MIN: usize = 50;
|
||||
const FILTERED_RECALL_MIN: usize = 200;
|
||||
const RECALL_CAP: usize = 1500;
|
||||
|
||||
/// Which retrieval strategy to run.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchMode {
    Hybrid,
    Lexical,
    Semantic,
}

impl SearchMode {
    /// Parse a user-supplied mode string, case-insensitively.
    /// Accepts the aliases "fts" (lexical) and "vector" (semantic);
    /// returns None for anything unrecognized.
    pub fn parse(s: &str) -> Option<Self> {
        let normalized = s.to_lowercase();
        match normalized.as_str() {
            "hybrid" => Some(SearchMode::Hybrid),
            "lexical" | "fts" => Some(SearchMode::Lexical),
            "semantic" | "vector" => Some(SearchMode::Semantic),
            _ => None,
        }
    }

    /// Canonical lowercase name for display and serialization.
    pub fn as_str(&self) -> &'static str {
        match *self {
            SearchMode::Hybrid => "hybrid",
            SearchMode::Lexical => "lexical",
            SearchMode::Semantic => "semantic",
        }
    }
}
|
||||
|
||||
/// Combined search result with provenance from both retrieval lists.
pub struct HybridResult {
    /// Row id of the matched document.
    pub document_id: i64,
    /// Normalized RRF score in [0, 1]; the best result is 1.0.
    pub score: f64,
    /// 1-indexed rank in the vector (semantic) list, if the doc appeared there.
    pub vector_rank: Option<usize>,
    /// 1-indexed rank in the FTS (lexical) list, if the doc appeared there.
    pub fts_rank: Option<usize>,
    /// Raw RRF score: sum of 1/(k + rank) over the lists containing the doc.
    pub rrf_score: f64,
}
|
||||
|
||||
/// Execute hybrid search, returning ranked results + any warnings.
///
/// `client` is `Option` to enable graceful degradation: when Ollama is
/// unavailable, the caller passes `None` and hybrid mode falls back to
/// FTS-only with a warning.
///
/// Flow:
/// 1. Compute an adaptive recall depth (`top_k`) from the requested limit.
/// 2. Run the retriever(s) selected by `mode`; hybrid mode degrades to
///    FTS-only (with a warning) when an embedding cannot be produced.
/// 3. Fuse the two candidate lists with Reciprocal Rank Fusion.
/// 4. Apply post-retrieval filters and truncate to the requested limit.
///
/// # Errors
/// Semantic mode without an Ollama client, or with an empty query
/// embedding, returns an error (no lexical fallback exists there).
/// Database errors from the underlying FTS/vector queries propagate.
pub async fn search_hybrid(
    conn: &Connection,
    client: Option<&OllamaClient>,
    query: &str,
    mode: SearchMode,
    filters: &SearchFilters,
    fts_mode: FtsQueryMode,
) -> Result<(Vec<HybridResult>, Vec<String>)> {
    let mut warnings: Vec<String> = Vec::new();

    // Adaptive recall: over-fetch relative to the requested limit so that
    // RRF fusion and post-retrieval filtering still leave enough results.
    // Filtered queries over-fetch more aggressively (x50 vs x10).
    let requested = filters.clamp_limit();
    let top_k = if filters.has_any_filter() {
        (requested * 50).max(FILTERED_RECALL_MIN).min(RECALL_CAP)
    } else {
        (requested * 10).max(BASE_RECALL_MIN).min(RECALL_CAP)
    };

    // Build the two candidate lists as (document_id, score) tuples; either
    // list may be empty depending on mode and degradation path.
    let (fts_tuples, vec_tuples) = match mode {
        SearchMode::Lexical => {
            let fts_results = search_fts(conn, query, top_k, fts_mode)?;
            let fts_tuples: Vec<(i64, f64)> = fts_results
                .iter()
                .map(|r| (r.document_id, r.bm25_score))
                .collect();
            (fts_tuples, Vec::new())
        }

        SearchMode::Semantic => {
            // Semantic-only mode cannot degrade: no client is a hard error.
            let Some(client) = client else {
                return Err(crate::core::error::LoreError::Other(
                    "Semantic search requires Ollama. Start Ollama or use --mode=lexical.".into(),
                ));
            };

            let query_embedding = client.embed_batch(vec![query.to_string()]).await?;
            let embedding = query_embedding
                .into_iter()
                .next()
                .unwrap_or_default();

            if embedding.is_empty() {
                return Err(crate::core::error::LoreError::Other(
                    "Ollama returned empty embedding for query.".into(),
                ));
            }

            let vec_results = search_vector(conn, &embedding, top_k)?;
            let vec_tuples: Vec<(i64, f64)> = vec_results
                .iter()
                .map(|r| (r.document_id, r.distance))
                .collect();
            (Vec::new(), vec_tuples)
        }

        SearchMode::Hybrid => {
            // FTS runs first unconditionally; it is the fallback list for
            // every degradation branch below.
            let fts_results = search_fts(conn, query, top_k, fts_mode)?;
            let fts_tuples: Vec<(i64, f64)> = fts_results
                .iter()
                .map(|r| (r.document_id, r.bm25_score))
                .collect();

            match client {
                Some(client) => {
                    match client.embed_batch(vec![query.to_string()]).await {
                        Ok(query_embedding) => {
                            let embedding = query_embedding
                                .into_iter()
                                .next()
                                .unwrap_or_default();

                            let vec_tuples = if embedding.is_empty() {
                                // Embedding request succeeded but produced
                                // nothing usable: warn, keep FTS results.
                                warnings.push(
                                    "Ollama returned empty embedding, using FTS only.".into(),
                                );
                                Vec::new()
                            } else {
                                let vec_results = search_vector(conn, &embedding, top_k)?;
                                vec_results
                                    .iter()
                                    .map(|r| (r.document_id, r.distance))
                                    .collect()
                            };

                            (fts_tuples, vec_tuples)
                        }
                        Err(e) => {
                            // Embedding call failed: degrade to lexical.
                            warnings.push(
                                format!("Embedding failed ({}), falling back to lexical search.", e),
                            );
                            (fts_tuples, Vec::new())
                        }
                    }
                }
                None => {
                    // No client at all: degrade to lexical.
                    warnings.push(
                        "Ollama unavailable, falling back to lexical search.".into(),
                    );
                    (fts_tuples, Vec::new())
                }
            }
        }
    };

    // RRF handles empty lists gracefully, so degraded modes flow through
    // the same fusion path.
    let ranked = rank_rrf(&vec_tuples, &fts_tuples);

    let results: Vec<HybridResult> = ranked
        .into_iter()
        .map(|r| HybridResult {
            document_id: r.document_id,
            score: r.normalized_score,
            vector_rank: r.vector_rank,
            fts_rank: r.fts_rank,
            rrf_score: r.rrf_score,
        })
        .collect();

    // Apply post-retrieval filters and limit
    let limit = filters.clamp_limit();
    let results = if filters.has_any_filter() {
        let all_ids: Vec<i64> = results.iter().map(|r| r.document_id).collect();
        let filtered_ids = apply_filters(conn, &all_ids, filters)?;
        let filtered_set: std::collections::HashSet<i64> = filtered_ids.iter().copied().collect();
        // Keep only filtered ids, preserving RRF order, then truncate.
        results
            .into_iter()
            .filter(|r| filtered_set.contains(&r.document_id))
            .take(limit)
            .collect()
    } else {
        results.into_iter().take(limit).collect()
    };

    Ok((results, warnings))
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_search_mode_from_str() {
        assert_eq!(SearchMode::parse("hybrid"), Some(SearchMode::Hybrid));
        assert_eq!(SearchMode::parse("lexical"), Some(SearchMode::Lexical));
        assert_eq!(SearchMode::parse("fts"), Some(SearchMode::Lexical));
        assert_eq!(SearchMode::parse("semantic"), Some(SearchMode::Semantic));
        assert_eq!(SearchMode::parse("vector"), Some(SearchMode::Semantic));
        // Parsing is case-insensitive.
        assert_eq!(SearchMode::parse("HYBRID"), Some(SearchMode::Hybrid));
        assert_eq!(SearchMode::parse("invalid"), None);
        assert_eq!(SearchMode::parse(""), None);
    }

    #[test]
    fn test_search_mode_as_str() {
        assert_eq!(SearchMode::Hybrid.as_str(), "hybrid");
        assert_eq!(SearchMode::Lexical.as_str(), "lexical");
        assert_eq!(SearchMode::Semantic.as_str(), "semantic");
    }

    // The recall tests below re-derive the formula used inside
    // `search_hybrid`; they pin the constants, not the function itself.

    #[test]
    fn test_adaptive_recall_unfiltered() {
        let filters = SearchFilters {
            limit: 20,
            ..Default::default()
        };
        let requested = filters.clamp_limit();
        let top_k = (requested * 10).max(BASE_RECALL_MIN).min(RECALL_CAP);
        assert_eq!(top_k, 200);
    }

    #[test]
    fn test_adaptive_recall_filtered() {
        let filters = SearchFilters {
            limit: 20,
            author: Some("alice".to_string()),
            ..Default::default()
        };
        let requested = filters.clamp_limit();
        let top_k = (requested * 50).max(FILTERED_RECALL_MIN).min(RECALL_CAP);
        assert_eq!(top_k, 1000);
    }

    #[test]
    fn test_adaptive_recall_cap() {
        let filters = SearchFilters {
            limit: 100,
            author: Some("alice".to_string()),
            ..Default::default()
        };
        let requested = filters.clamp_limit();
        let top_k = (requested * 50).max(FILTERED_RECALL_MIN).min(RECALL_CAP);
        assert_eq!(top_k, RECALL_CAP); // 5000 capped to 1500
    }

    #[test]
    fn test_adaptive_recall_minimum() {
        // Tiny limits are raised to the base recall floor.
        let filters = SearchFilters {
            limit: 1,
            ..Default::default()
        };
        let requested = filters.clamp_limit();
        let top_k = (requested * 10).max(BASE_RECALL_MIN).min(RECALL_CAP);
        assert_eq!(top_k, BASE_RECALL_MIN); // 10 -> 50
    }
}
|
||||
14
src/search/mod.rs
Normal file
14
src/search/mod.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
mod filters;
|
||||
mod fts;
|
||||
mod hybrid;
|
||||
mod rrf;
|
||||
mod vector;
|
||||
|
||||
pub use fts::{
|
||||
generate_fallback_snippet, get_result_snippet, search_fts, to_fts_query, FtsQueryMode,
|
||||
FtsResult,
|
||||
};
|
||||
pub use filters::{apply_filters, PathFilter, SearchFilters};
|
||||
pub use rrf::{rank_rrf, RrfResult};
|
||||
pub use vector::{search_vector, VectorResult};
|
||||
pub use hybrid::{search_hybrid, HybridResult, SearchMode};
|
||||
178
src/search/rrf.rs
Normal file
178
src/search/rrf.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// RRF smoothing constant; 60 is the value from the original RRF paper.
const RRF_K: f64 = 60.0;

/// A single result from Reciprocal Rank Fusion, containing both raw and
/// normalized scores plus per-list rank provenance for --explain output.
pub struct RrfResult {
    pub document_id: i64,
    /// Raw RRF score: sum of 1/(k + rank) across all lists.
    pub rrf_score: f64,
    /// Normalized to [0, 1] where the best result is 1.0.
    pub normalized_score: f64,
    /// 1-indexed rank in the vector results list, if present.
    pub vector_rank: Option<usize>,
    /// 1-indexed rank in the FTS results list, if present.
    pub fts_rank: Option<usize>,
}

/// Combine vector and FTS retrieval results using Reciprocal Rank Fusion.
///
/// Input tuples are `(document_id, score/distance)` — already sorted by each retriever.
/// Ranks are 1-indexed (first result = rank 1).
///
/// Score = sum of 1/(k + rank) for each list containing the document.
/// Output is sorted by descending RRF score; ties break on ascending
/// `document_id` so the ordering is deterministic.
pub fn rank_rrf(
    vector_results: &[(i64, f64)],
    fts_results: &[(i64, f64)],
) -> Vec<RrfResult> {
    if vector_results.is_empty() && fts_results.is_empty() {
        return Vec::new();
    }

    // (rrf_score, vector_rank, fts_rank)
    let mut scores: HashMap<i64, (f64, Option<usize>, Option<usize>)> = HashMap::new();

    for (i, &(doc_id, _)) in vector_results.iter().enumerate() {
        let rank = i + 1; // 1-indexed
        let entry = scores.entry(doc_id).or_insert((0.0, None, None));
        entry.0 += 1.0 / (RRF_K + rank as f64);
        // Record only the first (best) rank if a retriever repeats a doc.
        if entry.1.is_none() {
            entry.1 = Some(rank);
        }
    }

    for (i, &(doc_id, _)) in fts_results.iter().enumerate() {
        let rank = i + 1; // 1-indexed
        let entry = scores.entry(doc_id).or_insert((0.0, None, None));
        entry.0 += 1.0 / (RRF_K + rank as f64);
        if entry.2.is_none() {
            entry.2 = Some(rank);
        }
    }

    let mut results: Vec<RrfResult> = scores
        .into_iter()
        .map(|(doc_id, (rrf_score, vector_rank, fts_rank))| RrfResult {
            document_id: doc_id,
            rrf_score,
            normalized_score: 0.0, // filled in below
            vector_rank,
            fts_rank,
        })
        .collect();

    // Sort descending by rrf_score. Tie-break on document_id: `scores` is a
    // HashMap, so without it the relative order of equal-score results would
    // vary run to run, breaking reproducibility of search output.
    results.sort_by(|a, b| {
        b.rrf_score
            .partial_cmp(&a.rrf_score)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.document_id.cmp(&b.document_id))
    });

    // Normalize: best = 1.0
    if let Some(max_score) = results.first().map(|r| r.rrf_score) {
        if max_score > 0.0 {
            for result in &mut results {
                result.normalized_score = result.rrf_score / max_score;
            }
        }
    }

    results
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dual_list_ranks_higher() {
        let vector = vec![(1, 0.1), (2, 0.2)];
        let fts = vec![(1, 5.0), (3, 3.0)];
        let results = rank_rrf(&vector, &fts);

        // Doc 1 appears in both lists, should rank highest
        assert_eq!(results[0].document_id, 1);

        // Doc 1 score should be higher than doc 2 and doc 3
        let doc1 = &results[0];
        let doc2_score = results.iter().find(|r| r.document_id == 2).unwrap().rrf_score;
        let doc3_score = results.iter().find(|r| r.document_id == 3).unwrap().rrf_score;
        assert!(doc1.rrf_score > doc2_score);
        assert!(doc1.rrf_score > doc3_score);
    }

    #[test]
    fn test_single_list_included() {
        // A document from only one list still appears in the fused output.
        let vector = vec![(1, 0.1)];
        let fts = vec![(2, 5.0)];
        let results = rank_rrf(&vector, &fts);

        assert_eq!(results.len(), 2);
        let doc_ids: Vec<i64> = results.iter().map(|r| r.document_id).collect();
        assert!(doc_ids.contains(&1));
        assert!(doc_ids.contains(&2));
    }

    #[test]
    fn test_normalization() {
        let vector = vec![(1, 0.1), (2, 0.2)];
        let fts = vec![(1, 5.0), (3, 3.0)];
        let results = rank_rrf(&vector, &fts);

        // Best result should have normalized_score = 1.0
        assert!((results[0].normalized_score - 1.0).abs() < f64::EPSILON);

        // All scores in [0, 1]
        for r in &results {
            assert!(r.normalized_score >= 0.0);
            assert!(r.normalized_score <= 1.0);
        }
    }

    #[test]
    fn test_empty_inputs() {
        let results = rank_rrf(&[], &[]);
        assert!(results.is_empty());
    }

    #[test]
    fn test_ranks_are_1_indexed() {
        let vector = vec![(10, 0.1), (20, 0.2)];
        let fts = vec![(10, 5.0), (30, 3.0)];
        let results = rank_rrf(&vector, &fts);

        // First position in each list is rank 1, not rank 0.
        let doc10 = results.iter().find(|r| r.document_id == 10).unwrap();
        assert_eq!(doc10.vector_rank, Some(1));
        assert_eq!(doc10.fts_rank, Some(1));

        let doc20 = results.iter().find(|r| r.document_id == 20).unwrap();
        assert_eq!(doc20.vector_rank, Some(2));
        assert_eq!(doc20.fts_rank, None);

        let doc30 = results.iter().find(|r| r.document_id == 30).unwrap();
        assert_eq!(doc30.vector_rank, None);
        assert_eq!(doc30.fts_rank, Some(2));
    }

    #[test]
    fn test_raw_and_normalized_scores() {
        let vector = vec![(1, 0.1)];
        let fts = vec![(1, 5.0)];
        let results = rank_rrf(&vector, &fts);

        assert_eq!(results.len(), 1);
        let r = &results[0];

        // RRF score = 1/(60+1) + 1/(60+1) = 2/61
        let expected = 2.0 / 61.0;
        assert!((r.rrf_score - expected).abs() < 1e-10);
        assert!((r.normalized_score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_one_empty_list() {
        let vector = vec![(1, 0.1), (2, 0.2)];
        let results = rank_rrf(&vector, &[]);

        assert_eq!(results.len(), 2);
        // Single result should still have normalized_score = 1.0
        assert!((results[0].normalized_score - 1.0).abs() < f64::EPSILON);
    }
}
|
||||
139
src/search/vector.rs
Normal file
139
src/search/vector.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use rusqlite::Connection;
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::embedding::chunk_ids::decode_rowid;
|
||||
|
||||
/// A single vector search result (document-level, deduplicated).
#[derive(Debug)]
pub struct VectorResult {
    /// Row id of the matched document (chunk index already stripped).
    pub document_id: i64,
    /// Best (lowest) KNN distance across this document's chunks.
    pub distance: f64,
}
|
||||
|
||||
/// Search documents using sqlite-vec KNN query.
///
/// Over-fetches 3x limit to handle chunk deduplication (multiple chunks per
/// document produce multiple KNN results for the same document_id).
/// Returns deduplicated results with best (lowest) distance per document.
///
/// # Arguments
/// * `query_embedding` - query vector, serialized to raw little-endian f32
///   bytes as sqlite-vec expects.
/// * `limit` - maximum number of (deduplicated) documents returned.
///
/// # Errors
/// Propagates any SQLite error from preparing or running the KNN query.
pub fn search_vector(
    conn: &Connection,
    query_embedding: &[f32],
    limit: usize,
) -> Result<Vec<VectorResult>> {
    // Nothing to search for: empty query vector or zero limit.
    if query_embedding.is_empty() || limit == 0 {
        return Ok(Vec::new());
    }

    // Convert to raw little-endian bytes for sqlite-vec
    let embedding_bytes: Vec<u8> = query_embedding
        .iter()
        .flat_map(|f| f.to_le_bytes())
        .collect();

    let k = limit * 3; // Over-fetch for dedup

    // sqlite-vec KNN syntax: `embedding MATCH ?` with `k = ?` selects the k
    // nearest rows; rowid encodes (document_id, chunk_index).
    let mut stmt = conn.prepare(
        "SELECT rowid, distance
         FROM embeddings
         WHERE embedding MATCH ?1
         AND k = ?2
         ORDER BY distance"
    )?;

    let rows: Vec<(i64, f64)> = stmt
        .query_map(rusqlite::params![embedding_bytes, k as i64], |row| {
            Ok((row.get(0)?, row.get(1)?))
        })?
        .collect::<std::result::Result<Vec<_>, _>>()?;

    // Dedup by document_id, keeping best (lowest) distance
    let mut best: HashMap<i64, f64> = HashMap::new();
    for (rowid, distance) in rows {
        let (document_id, _chunk_index) = decode_rowid(rowid);
        best.entry(document_id)
            .and_modify(|d| {
                if distance < *d {
                    *d = distance;
                }
            })
            .or_insert(distance);
    }

    // Sort by distance ascending, take limit
    let mut results: Vec<VectorResult> = best
        .into_iter()
        .map(|(document_id, distance)| VectorResult {
            document_id,
            distance,
        })
        .collect();
    results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
    results.truncate(limit);

    Ok(results)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Note: Full integration tests require sqlite-vec loaded, which happens via
    // create_connection in db.rs. These are basic unit tests for the dedup logic.

    #[test]
    fn test_empty_returns_empty() {
        // Can't test KNN without sqlite-vec, but we can test edge cases
        let result = search_vector_dedup(vec![], 10);
        assert!(result.is_empty());
    }

    #[test]
    fn test_dedup_keeps_best_distance() {
        // Simulate: doc 1 has chunks at rowid 1000 (idx 0) and 1001 (idx 1)
        let rows = vec![
            (1000_i64, 0.5_f64), // doc 1, chunk 0
            (1001, 0.3), // doc 1, chunk 1 (better)
            (2000, 0.4), // doc 2, chunk 0
        ];
        let results = search_vector_dedup(rows, 10);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].document_id, 1); // doc 1 best = 0.3
        assert!((results[0].distance - 0.3).abs() < f64::EPSILON);
        assert_eq!(results[1].document_id, 2); // doc 2 = 0.4
    }

    #[test]
    fn test_dedup_respects_limit() {
        let rows = vec![
            (1000_i64, 0.1_f64),
            (2000, 0.2),
            (3000, 0.3),
        ];
        let results = search_vector_dedup(rows, 2);
        assert_eq!(results.len(), 2);
    }

    /// Helper for testing dedup logic without sqlite-vec
    ///
    /// Mirrors the dedup/sort/truncate tail of `search_vector` so the logic
    /// can be exercised without running a KNN query.
    fn search_vector_dedup(rows: Vec<(i64, f64)>, limit: usize) -> Vec<VectorResult> {
        let mut best: HashMap<i64, f64> = HashMap::new();
        for (rowid, distance) in rows {
            // rowid encodes (document_id, chunk_index); only the doc matters here.
            let (document_id, _) = decode_rowid(rowid);
            best.entry(document_id)
                .and_modify(|d| {
                    if distance < *d {
                        *d = distance;
                    }
                })
                .or_insert(distance);
        }
        let mut results: Vec<VectorResult> = best
            .into_iter()
            .map(|(document_id, distance)| VectorResult { document_id, distance })
            .collect();
        results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(std::cmp::Ordering::Equal));
        results.truncate(limit);
        results
    }
}
|
||||
183
tests/embedding.rs
Normal file
183
tests/embedding.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
//! Integration tests for embedding storage and vector search.
|
||||
//!
|
||||
//! These tests create an in-memory SQLite database with sqlite-vec loaded,
|
||||
//! apply all migrations through 009 (embeddings), and verify KNN search
|
||||
//! and metadata operations.
|
||||
|
||||
use lore::core::db::create_connection;
|
||||
use rusqlite::Connection;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Create a test DB on disk (required for sqlite-vec which needs the extension loaded).
/// Uses create_connection to get the sqlite-vec extension registered.
///
/// Applies migrations 001..=009 and seeds one project row (id = 1) that
/// `insert_document` references. The `TempDir` is returned so the on-disk
/// database file outlives the test body.
fn create_test_db() -> (TempDir, Connection) {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");
    let conn = create_connection(&db_path).unwrap();

    let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");

    // Each migration file is located by its zero-padded numeric prefix
    // (e.g. "003_..."), then executed in version order.
    for version in 1..=9 {
        let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
            .unwrap()
            .filter_map(|e| e.ok())
            .filter(|e| {
                e.file_name()
                    .to_string_lossy()
                    .starts_with(&format!("{:03}", version))
            })
            .collect();

        assert!(!entries.is_empty(), "Migration {} not found", version);
        let sql = std::fs::read_to_string(entries[0].path()).unwrap();
        conn.execute_batch(&sql)
            .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
    }

    // Seed a project
    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
        [],
    )
    .unwrap();

    (tmp, conn)
}
|
||||
|
||||
/// Insert a minimal issue-type document row with the given id, title, and
/// body text; content_hash and url are derived from the id.
fn insert_document(conn: &Connection, id: i64, title: &str, content: &str) {
    conn.execute(
        "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
         VALUES (?1, 'issue', ?1, 1, ?2, ?3, 'hash_' || ?1, 'https://example.com/' || ?1)",
        rusqlite::params![id, title, content],
    )
    .unwrap();
}
|
||||
|
||||
/// Build a 768-dimensional unit vector along axis `dim`: all zeros except
/// a single 1.0 at index `dim`. Panics if `dim >= 768`.
fn axis_vector(dim: usize) -> Vec<f32> {
    let mut basis: Vec<f32> = std::iter::repeat(0.0f32).take(768).collect();
    basis[dim] = 1.0;
    basis
}
|
||||
|
||||
/// Insert one embedding row plus its metadata row for (doc_id, chunk_index).
///
/// The rowid encoding mirrors `decode_rowid`'s inverse:
/// document_id * 1000 + chunk_index.
fn insert_embedding(conn: &Connection, doc_id: i64, chunk_index: i64, embedding: &[f32]) {
    let rowid = doc_id * 1000 + chunk_index;
    // sqlite-vec stores vectors as raw little-endian f32 bytes.
    let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect();

    conn.execute(
        "INSERT INTO embeddings (rowid, embedding) VALUES (?1, ?2)",
        rusqlite::params![rowid, embedding_bytes],
    )
    .unwrap();

    let now = chrono::Utc::now().timestamp_millis();
    conn.execute(
        "INSERT INTO embedding_metadata
         (document_id, chunk_index, model, dims, document_hash, chunk_hash, created_at, attempt_count)
         VALUES (?1, ?2, 'nomic-embed-text', 768, 'hash_' || ?1, 'chunk_hash', ?3, 1)",
        rusqlite::params![doc_id, chunk_index, now],
    )
    .unwrap();
}
|
||||
|
||||
#[test]
fn knn_search_returns_nearest_neighbors() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Doc A", "Content about authentication.");
    insert_document(&conn, 2, "Doc B", "Content about database optimization.");
    insert_document(&conn, 3, "Doc C", "Content about logging infrastructure.");

    // Doc 1: axis 0, Doc 2: axis 1, Doc 3: axis 2
    insert_embedding(&conn, 1, 0, &axis_vector(0));
    insert_embedding(&conn, 2, 0, &axis_vector(1));
    insert_embedding(&conn, 3, 0, &axis_vector(2));

    // Query vector close to axis 0 (should match doc 1)
    let mut query = vec![0.0f32; 768];
    query[0] = 0.9;
    query[1] = 0.1;

    let results = lore::search::search_vector(&conn, &query, 10).unwrap();

    assert!(!results.is_empty(), "Should return at least one result");
    assert_eq!(results[0].document_id, 1, "Nearest neighbor should be doc 1");
}

#[test]
fn knn_search_respects_limit() {
    let (_tmp, conn) = create_test_db();

    // Ten documents, each on a distinct axis.
    for i in 1..=10 {
        insert_document(&conn, i, &format!("Doc {}", i), "Some content.");
        insert_embedding(&conn, i, 0, &axis_vector(i as usize));
    }

    let results = lore::search::search_vector(&conn, &axis_vector(0), 3).unwrap();
    assert!(results.len() <= 3, "Results should be capped at limit");
}

#[test]
fn knn_search_deduplicates_chunks() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Multi-chunk doc", "Very long content that was chunked.");

    // Same document, two chunks, both similar to query
    let mut v1 = vec![0.0f32; 768];
    v1[0] = 1.0;
    let mut v2 = vec![0.0f32; 768];
    v2[0] = 0.95;
    v2[1] = 0.05;

    insert_embedding(&conn, 1, 0, &v1);
    insert_embedding(&conn, 1, 1, &v2);

    let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();

    // Should deduplicate: same document_id appears at most once
    let unique_docs: std::collections::HashSet<i64> = results.iter().map(|r| r.document_id).collect();
    assert_eq!(
        unique_docs.len(),
        results.len(),
        "Each document should appear at most once in results"
    );
}

#[test]
fn orphan_trigger_deletes_embeddings_on_document_delete() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "Will be deleted", "Content.");
    insert_embedding(&conn, 1, 0, &axis_vector(0));

    // Verify embedding exists (rowid 1000 = doc 1, chunk 0)
    let count: i64 = conn
        .query_row("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0))
        .unwrap();
    assert_eq!(count, 1, "Embedding should exist before delete");

    // Delete the document
    conn.execute("DELETE FROM documents WHERE id = 1", []).unwrap();

    // Verify embedding was cascade-deleted via trigger
    let count: i64 = conn
        .query_row("SELECT COUNT(*) FROM embeddings WHERE rowid = 1000", [], |r| r.get(0))
        .unwrap();
    assert_eq!(count, 0, "Trigger should delete embeddings when document is deleted");

    // Verify metadata was cascade-deleted via FK
    let meta_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM embedding_metadata WHERE document_id = 1", [], |r| r.get(0))
        .unwrap();
    assert_eq!(meta_count, 0, "Metadata should be cascade-deleted");
}

#[test]
fn empty_database_returns_no_results() {
    let (_tmp, conn) = create_test_db();

    let results = lore::search::search_vector(&conn, &axis_vector(0), 10).unwrap();
    assert!(results.is_empty(), "Empty DB should return no results");
}
|
||||
65
tests/fixtures/golden_queries.json
vendored
Normal file
65
tests/fixtures/golden_queries.json
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"query": "authentication login",
|
||||
"mode": "lexical",
|
||||
"filters": {},
|
||||
"expected_doc_ids": [1],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "Basic auth keywords should find the OAuth login issue"
|
||||
},
|
||||
{
|
||||
"query": "database migration",
|
||||
"mode": "lexical",
|
||||
"filters": {},
|
||||
"expected_doc_ids": [3],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "Database migration terms should find the migration issue"
|
||||
},
|
||||
{
|
||||
"query": "user profile",
|
||||
"mode": "lexical",
|
||||
"filters": {},
|
||||
"expected_doc_ids": [2],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "User profile keywords should find the profile MR"
|
||||
},
|
||||
{
|
||||
"query": "API rate limiting",
|
||||
"mode": "lexical",
|
||||
"filters": {},
|
||||
"expected_doc_ids": [5],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "Rate limiting query should find the discussion document"
|
||||
},
|
||||
{
|
||||
"query": "performance optimization",
|
||||
"mode": "lexical",
|
||||
"filters": {},
|
||||
"expected_doc_ids": [4],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "Performance terms should find the performance MR"
|
||||
},
|
||||
{
|
||||
"query": "token refresh",
|
||||
"mode": "lexical",
|
||||
"filters": {"source_type": "issue"},
|
||||
"expected_doc_ids": [1],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "Token refresh with issue filter should find auth issue only"
|
||||
},
|
||||
{
|
||||
"query": "CSS styling frontend",
|
||||
"mode": "lexical",
|
||||
"filters": {},
|
||||
"expected_doc_ids": [6],
|
||||
"min_results": 1,
|
||||
"max_rank": 10,
|
||||
"description": "Frontend CSS query should find the UI improvements issue"
|
||||
}
|
||||
]
|
||||
198
tests/fts_search.rs
Normal file
198
tests/fts_search.rs
Normal file
@@ -0,0 +1,198 @@
|
||||
//! Integration tests for FTS5 search.
|
||||
//!
|
||||
//! These tests create an in-memory SQLite database, apply migrations through 008 (FTS5),
|
||||
//! seed documents, and verify search behavior.
|
||||
|
||||
use rusqlite::Connection;
|
||||
|
||||
/// Create an in-memory DB with migrations 001..=008 applied (through FTS5)
/// and one seeded project row (id = 1) that `insert_document` references.
fn create_test_db() -> Connection {
    let conn = Connection::open_in_memory().unwrap();
    conn.pragma_update(None, "foreign_keys", "ON").unwrap();

    let migrations_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");

    // Each migration file is located by its zero-padded numeric prefix
    // (e.g. "003_..."), then executed in version order.
    for version in 1..=8 {
        let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
            .unwrap()
            .filter_map(|e| e.ok())
            .filter(|e| {
                e.file_name()
                    .to_string_lossy()
                    .starts_with(&format!("{:03}", version))
            })
            .collect();

        assert!(!entries.is_empty(), "Migration {} not found", version);
        let sql = std::fs::read_to_string(entries[0].path()).unwrap();
        conn.execute_batch(&sql)
            .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
    }

    // Seed a project
    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
        [],
    )
    .unwrap();

    conn
}
|
||||
|
||||
/// Insert a minimal document row with explicit source_type so tests can mix
/// issues and merge requests; content_hash and url are derived from the id.
fn insert_document(conn: &Connection, id: i64, source_type: &str, title: &str, content: &str) {
    conn.execute(
        "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
         VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://example.com/' || ?1)",
        rusqlite::params![id, source_type, title, content],
    )
    .unwrap();
}
|
||||
|
||||
#[test]
|
||||
fn fts_basic_search() {
|
||||
let conn = create_test_db();
|
||||
|
||||
insert_document(&conn, 1, "issue", "Authentication bug", "Users cannot login when using OAuth tokens. The JWT refresh fails silently.");
|
||||
insert_document(&conn, 2, "merge_request", "Add user profile page", "This MR adds a new user profile page with avatar upload support.");
|
||||
insert_document(&conn, 3, "issue", "Database migration failing", "The migration script crashes on PostgreSQL 14 due to deprecated syntax.");
|
||||
|
||||
let results = lore::search::search_fts(&conn, "authentication login", 10, lore::search::FtsQueryMode::Safe).unwrap();
|
||||
|
||||
assert!(!results.is_empty(), "Expected at least one result for 'authentication login'");
|
||||
assert_eq!(results[0].document_id, 1, "Authentication issue should be top result");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fts_stemming_matches() {
|
||||
let conn = create_test_db();
|
||||
|
||||
insert_document(&conn, 1, "issue", "Running tests", "The test runner is executing integration tests.");
|
||||
insert_document(&conn, 2, "issue", "Deployment config", "Deployment configuration for production servers.");
|
||||
|
||||
// "running" should match "runner" and "executing" via porter stemmer
|
||||
let results = lore::search::search_fts(&conn, "running", 10, lore::search::FtsQueryMode::Safe).unwrap();
|
||||
assert!(!results.is_empty(), "Stemming should match 'running' to 'runner'");
|
||||
assert_eq!(results[0].document_id, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fts_empty_results() {
|
||||
let conn = create_test_db();
|
||||
|
||||
insert_document(&conn, 1, "issue", "Bug fix", "Fixed a null pointer dereference in the parser.");
|
||||
|
||||
let results = lore::search::search_fts(&conn, "kubernetes deployment helm", 10, lore::search::FtsQueryMode::Safe).unwrap();
|
||||
assert!(results.is_empty(), "No documents should match unrelated query");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fts_special_characters_handled() {
|
||||
let conn = create_test_db();
|
||||
|
||||
insert_document(&conn, 1, "issue", "C++ compiler", "The C++ compiler segfaults on template metaprogramming.");
|
||||
|
||||
// Special characters should not crash the search
|
||||
let results = lore::search::search_fts(&conn, "C++ compiler", 10, lore::search::FtsQueryMode::Safe).unwrap();
|
||||
// Safe mode sanitizes the query — it should still return results or at least not crash
|
||||
assert!(results.len() <= 1);
|
||||
}
|
||||
|
||||
/// BM25 relevance ordering: a document with the query term in both the title
/// and (twice in) the body must outrank one mentioning it only once.
#[test]
fn fts_result_ordering_by_relevance() {
    let db = create_test_db();

    // Heavy hitter: term in title plus two body occurrences.
    insert_document(&db, 1, "issue", "Authentication system redesign", "The authentication system needs a complete redesign. Authentication flows are broken.");
    // Single body mention only.
    insert_document(&db, 2, "issue", "Login page update", "Updated the login page with better authentication error messages.");
    // Control document with no occurrences at all.
    insert_document(&db, 3, "issue", "Database optimization", "Optimize database queries for faster response times.");

    let ranked = lore::search::search_fts(&db, "authentication", 10, lore::search::FtsQueryMode::Safe).unwrap();

    assert!(ranked.len() >= 2, "Should match at least 2 documents");
    assert_eq!(ranked[0].document_id, 1, "Document with more term occurrences should rank first");
}
|
||||
|
||||
/// The `limit` argument caps the number of returned hits even when many
/// more documents match.
#[test]
fn fts_respects_limit() {
    let db = create_test_db();

    // Seed four times as many matching documents as the requested limit.
    (1..=20).for_each(|n| {
        insert_document(
            &db,
            n,
            "issue",
            &format!("Bug report {}", n),
            &format!("This is bug report number {} about the login system.", n),
        );
    });

    let hits = lore::search::search_fts(&db, "bug login", 5, lore::search::FtsQueryMode::Safe).unwrap();
    assert!(hits.len() <= 5, "Results should be capped at limit");
}
|
||||
|
||||
/// Every hit carries a non-empty snippet taken from the matched text.
#[test]
fn fts_snippet_generated() {
    let db = create_test_db();

    insert_document(&db, 1, "issue", "Performance issue", "The application performance degrades significantly when more than 100 users are connected simultaneously. Memory usage spikes to 4GB.");

    let hits = lore::search::search_fts(&db, "performance", 10, lore::search::FtsQueryMode::Safe).unwrap();

    assert!(!hits.is_empty());
    // The snippet may include FTS5 highlight markers; only non-emptiness is pinned.
    assert!(!hits[0].snippet.is_empty(), "Snippet should be generated");
}
|
||||
|
||||
/// Inserting into `documents` must populate the FTS shadow table without any
/// explicit sync step (the insert-side trigger installed by the migrations).
#[test]
fn fts_triggers_sync_on_insert() {
    let db = create_test_db();

    insert_document(&db, 1, "issue", "Test document", "This is test content for FTS trigger verification.");

    // Bypass the search API and query the FTS table directly.
    let indexed: i64 = db
        .query_row(
            "SELECT COUNT(*) FROM documents_fts WHERE documents_fts MATCH 'test'",
            [],
            |row| row.get(0),
        )
        .unwrap();

    assert_eq!(indexed, 1, "FTS trigger should auto-index on INSERT");
}
|
||||
|
||||
/// Deleting a row from `documents` must also evict it from the FTS index.
#[test]
fn fts_triggers_sync_on_delete() {
    let db = create_test_db();

    insert_document(&db, 1, "issue", "Deletable document", "This content will be deleted from the index.");

    // Counts FTS rows matching 'deletable', querying the shadow table directly.
    let match_count = || -> i64 {
        db.query_row(
            "SELECT COUNT(*) FROM documents_fts WHERE documents_fts MATCH 'deletable'",
            [],
            |row| row.get(0),
        )
        .unwrap()
    };

    // Indexed right after the insert...
    assert_eq!(match_count(), 1);

    // ...and gone again once the source row is deleted.
    db.execute("DELETE FROM documents WHERE id = 1", []).unwrap();
    assert_eq!(match_count(), 0, "FTS trigger should remove entry on DELETE");
}
|
||||
|
||||
/// Discussion rows carry a NULL title; the FTS index must still cover their body.
#[test]
fn fts_null_title_handled() {
    let db = create_test_db();

    // Insert directly (not via the helper) so `title` can be NULL.
    db.execute(
        "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
         VALUES (1, 'discussion', 1, 1, NULL, 'Discussion about API rate limiting strategies.', 'hash1', 'https://example.com/1')",
        [],
    )
    .unwrap();

    let hits = lore::search::search_fts(&db, "rate limiting", 10, lore::search::FtsQueryMode::Safe).unwrap();
    assert!(!hits.is_empty(), "Should find documents with NULL title");
}
|
||||
279
tests/golden_query_tests.rs
Normal file
279
tests/golden_query_tests.rs
Normal file
@@ -0,0 +1,279 @@
|
||||
//! Golden query test suite.
|
||||
//!
|
||||
//! Verifies end-to-end search quality with known-good expected results.
|
||||
//! Uses a seeded SQLite DB with deterministic fixture data and no external
|
||||
//! dependencies (no Ollama, no GitLab).
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use rusqlite::Connection;
|
||||
use serde::Deserialize;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use lore::search::{self, FtsQueryMode, SearchFilters, SearchMode, search_fts, apply_filters};
|
||||
|
||||
/// A golden query test case.
///
/// Deserialized from `tests/fixtures/golden_queries.json`; each entry pins an
/// expected ranking outcome for a known query against the seeded fixture DB.
#[derive(Debug, Deserialize)]
struct GoldenQuery {
    /// Raw search string fed to the FTS engine.
    query: String,
    /// Search mode name; must be accepted by `SearchMode::parse`.
    mode: String,
    /// Optional result filters; absent in JSON means "no filtering".
    #[serde(default)]
    filters: GoldenFilters,
    /// Document IDs that must each appear among the first `max_rank` results.
    expected_doc_ids: Vec<i64>,
    /// Minimum number of (post-filter) results required for the query to pass.
    min_results: usize,
    /// Rank cutoff: every expected ID must appear at 0-based position < max_rank.
    max_rank: usize,
    /// Human-readable intent of the query, echoed in failure messages.
    description: String,
}
|
||||
|
||||
/// Optional filters attached to a golden query (all default to "no filter").
#[derive(Debug, Default, Deserialize)]
struct GoldenFilters {
    /// Restrict to one source type: "issue", "merge_request", or "discussion".
    source_type: Option<String>,
    /// Restrict to documents by this author username.
    author: Option<String>,
    /// Project path filter.
    /// NOTE(review): currently not mapped by `build_search_filters` — confirm
    /// whether project filtering is intentionally unsupported in golden tests.
    project: Option<String>,
    /// Require these labels on matching documents.
    #[serde(default)]
    labels: Vec<String>,
}
|
||||
|
||||
fn load_golden_queries() -> Vec<GoldenQuery> {
|
||||
let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests/fixtures/golden_queries.json");
|
||||
let content = std::fs::read_to_string(&path)
|
||||
.unwrap_or_else(|_| panic!("Failed to read golden queries fixture"));
|
||||
serde_json::from_str(&content)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse golden queries: {}", e))
|
||||
}
|
||||
|
||||
/// Create an in-memory database with FTS5 schema and seed deterministic fixture data.
///
/// Applies migrations 001 through 008 from the crate's `migrations/` directory
/// (up to and including the FTS5 migration; no vector/embedding tables), then
/// inserts one project, eight documents with fixed IDs and known vocabulary,
/// and a label set so golden queries can exercise both ranking and filtering.
/// Panics if a migration file is missing or fails to apply.
fn create_seeded_db() -> Connection {
    let conn = Connection::open_in_memory().unwrap();
    conn.pragma_update(None, "foreign_keys", "ON").unwrap();

    // Apply migrations 001-008 (FTS5)
    let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
    for version in 1..=8 {
        // Migration files are located by their zero-padded numeric prefix
        // (e.g. "003_..."); the first match in directory order is applied.
        let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
            .unwrap()
            .filter_map(|e| e.ok())
            .filter(|e| {
                e.file_name()
                    .to_string_lossy()
                    .starts_with(&format!("{:03}", version))
            })
            .collect();
        assert!(!entries.is_empty(), "Migration {} not found", version);
        let sql = std::fs::read_to_string(entries[0].path()).unwrap();
        conn.execute_batch(&sql)
            .unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
    }

    // Seed project
    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url)
         VALUES (1, 100, 'group/project', 'https://gitlab.example.com/group/project')",
        [],
    )
    .unwrap();

    // Seed deterministic documents.
    // Tuple layout: (id, source_type, title, content, author); each document's
    // vocabulary is chosen so golden queries have unambiguous expected hits.
    let documents = vec![
        // id=1: Auth issue (matches: authentication, login, OAuth, JWT, token, refresh)
        (1, "issue", "Authentication and login broken with OAuth",
         "Users cannot login when using OAuth tokens. The JWT token refresh fails silently, \
          causing authentication errors. When the access token expires, the refresh flow returns \
          a 401 instead of fetching new credentials. Login page shows a generic error. \
          Multiple users reported authentication failures across all OAuth providers.",
         "testuser"),

        // id=2: User profile MR (matches: user, profile, avatar, upload)
        (2, "merge_request", "Add user profile page with avatar upload",
         "This merge request adds a new user profile page. Users can now upload their avatar, \
          edit their display name, and manage notification preferences. The profile page includes \
          responsive design for mobile and desktop viewports.",
         "developer1"),

        // id=3: Database migration issue (matches: database, migration, PostgreSQL, schema)
        (3, "issue", "Database migration failing on PostgreSQL 14",
         "The database migration script crashes on PostgreSQL 14 due to deprecated syntax. \
          The ALTER TABLE command uses a syntax removed in PG14. Migration 042 needs to be \
          rewritten to use the new schema modification syntax. All staging environments affected.",
         "dba_admin"),

        // id=4: Performance MR (matches: performance, optimization, caching, query)
        (4, "merge_request", "Performance optimization for dashboard queries",
         "Optimized the dashboard query performance by adding database indexes and implementing \
          Redis caching for frequently accessed reports. Query execution time reduced from 3.2s \
          to 180ms. Added connection pooling and prepared statement caching.",
         "senior_dev"),

        // id=5: API rate limiting discussion (matches: API, rate, limiting, throttle)
        (5, "discussion", "API rate limiting strategies for public endpoints",
         "Discussion about implementing API rate limiting on public-facing endpoints. \
          Proposed approaches: token bucket with sliding window, fixed window counters, \
          or leaky bucket algorithm. Rate limits should be configurable per API key tier. \
          Need to handle burst traffic during peak hours without throttling legitimate users.",
         "architect"),

        // id=6: UI/CSS issue (matches: CSS, styling, frontend, responsive, UI)
        (6, "issue", "CSS styling issues on mobile frontend",
         "Multiple CSS styling problems on the mobile frontend. The navigation menu overlaps \
          content on screens smaller than 768px. Button text truncates on compact viewports. \
          Frontend responsive breakpoints need adjustment. The UI components library has \
          conflicting CSS specificity with the theme system.",
         "frontend_dev"),

        // id=7: CI/CD MR (matches: CI, CD, pipeline, deployment, Docker)
        (7, "merge_request", "Revamp CI/CD pipeline with Docker caching",
         "Complete overhaul of the CI/CD pipeline. Added Docker layer caching to speed up \
          builds. Deployment stages now run in parallel where possible. Added rollback \
          support for failed deployments. Pipeline runtime reduced from 45min to 12min.",
         "devops_lead"),

        // id=8: Security issue (matches: security, vulnerability, XSS, injection)
        (8, "issue", "Security vulnerability in form submission",
         "A cross-site scripting (XSS) vulnerability was found in the comment submission form. \
          User input is not properly sanitized before rendering. The security scanner also flagged \
          potential SQL injection in the search endpoint. Both vulnerabilities need immediate patching.",
         "security_team"),
    ];

    // content_hash and url are derived from the id/source_type inside SQL so
    // the fixture stays fully deterministic.
    for (id, source_type, title, content, author) in &documents {
        conn.execute(
            "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url, author_username)
             VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://gitlab.example.com/group/project/-/' || ?2 || 's/' || ?1, ?5)",
            rusqlite::params![id, source_type, title, content, author],
        )
        .unwrap();
    }

    // Seed labels for filtered queries
    conn.execute_batch(
        "INSERT INTO document_labels (document_id, label_name) VALUES (1, 'bug');
         INSERT INTO document_labels (document_id, label_name) VALUES (1, 'authentication');
         INSERT INTO document_labels (document_id, label_name) VALUES (3, 'bug');
         INSERT INTO document_labels (document_id, label_name) VALUES (3, 'database');
         INSERT INTO document_labels (document_id, label_name) VALUES (6, 'bug');
         INSERT INTO document_labels (document_id, label_name) VALUES (6, 'frontend');
         INSERT INTO document_labels (document_id, label_name) VALUES (8, 'security');
         INSERT INTO document_labels (document_id, label_name) VALUES (8, 'critical');",
    )
    .unwrap();

    conn
}
|
||||
|
||||
fn build_search_filters(golden: &GoldenFilters) -> SearchFilters {
|
||||
let source_type = golden.source_type.as_deref().and_then(|s| match s {
|
||||
"issue" => Some(lore::documents::SourceType::Issue),
|
||||
"merge_request" => Some(lore::documents::SourceType::MergeRequest),
|
||||
"discussion" => Some(lore::documents::SourceType::Discussion),
|
||||
_ => None,
|
||||
});
|
||||
|
||||
SearchFilters {
|
||||
source_type,
|
||||
author: golden.author.clone(),
|
||||
labels: golden.labels.clone(),
|
||||
limit: 100,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Run every golden query against the seeded fixture DB, collecting all
/// failures before panicking so one broken query does not hide the rest.
#[test]
fn golden_queries_all_pass() {
    let queries = load_golden_queries();
    let conn = create_seeded_db();

    let mut failures: Vec<String> = Vec::new();

    for (i, gq) in queries.iter().enumerate() {
        let mode = SearchMode::parse(&gq.mode).unwrap_or(SearchMode::Lexical);

        // For lexical-only golden queries (no Ollama needed)
        assert_eq!(
            mode,
            SearchMode::Lexical,
            "Golden query {} uses non-lexical mode '{}' which requires Ollama — not supported in CI",
            i,
            gq.mode
        );

        // Run FTS search
        let fts_results = search_fts(&conn, &gq.query, 50, FtsQueryMode::Safe).unwrap();
        let doc_ids: Vec<i64> = fts_results.iter().map(|r| r.document_id).collect();

        // Apply filters if any. `doc_ids` is not used again after this point,
        // so the unfiltered branch moves it instead of cloning (was a
        // redundant clone).
        let filters = build_search_filters(&gq.filters);
        let filtered_ids = if filters.has_any_filter() {
            apply_filters(&conn, &doc_ids, &filters).unwrap()
        } else {
            doc_ids
        };

        // Check min_results
        if filtered_ids.len() < gq.min_results {
            failures.push(format!(
                "FAIL [{}] \"{}\": expected >= {} results, got {} (description: {})",
                i, gq.query, gq.min_results, filtered_ids.len(), gq.description
            ));
            continue;
        }

        // Check each expected doc_id is in top max_rank
        for expected_id in &gq.expected_doc_ids {
            match filtered_ids.iter().position(|id| id == expected_id) {
                Some(pos) if pos < gq.max_rank => {
                    // Pass
                }
                Some(pos) => {
                    failures.push(format!(
                        "FAIL [{}] \"{}\": expected doc_id {} in top {}, found at rank {} (description: {})",
                        i, gq.query, expected_id, gq.max_rank, pos + 1, gq.description
                    ));
                }
                None => {
                    failures.push(format!(
                        "FAIL [{}] \"{}\": expected doc_id {} not found in results {:?} (description: {})",
                        i, gq.query, expected_id, filtered_ids, gq.description
                    ));
                }
            }
        }
    }

    if !failures.is_empty() {
        panic!(
            "Golden query failures ({}/{}):\n{}",
            failures.len(),
            queries.len(),
            failures.join("\n")
        );
    }
}
|
||||
|
||||
/// Sanity-check the fixture file itself, so a malformed entry fails loudly
/// here rather than producing confusing results in the main golden suite.
#[test]
fn golden_queries_fixture_is_valid() {
    let queries = load_golden_queries();

    assert!(
        queries.len() >= 5,
        "Golden queries fixture should have at least 5 queries, got {}",
        queries.len()
    );

    for (idx, q) in queries.iter().enumerate() {
        assert!(!q.query.is_empty(), "Query {} has empty query string", idx);
        assert!(!q.expected_doc_ids.is_empty(), "Query {} has no expected doc IDs", idx);
        assert!(q.min_results > 0, "Query {} has min_results=0", idx);
        assert!(q.max_rank > 0, "Query {} has max_rank=0", idx);
        assert!(
            SearchMode::parse(&q.mode).is_some(),
            "Query {} has invalid mode '{}'",
            idx,
            q.mode
        );
    }
}
|
||||
206
tests/hybrid_search.rs
Normal file
206
tests/hybrid_search.rs
Normal file
@@ -0,0 +1,206 @@
|
||||
//! Integration tests for hybrid search combining FTS + vector.
|
||||
//!
|
||||
//! Tests all three search modes (lexical, semantic, hybrid) and
|
||||
//! verifies graceful degradation when embeddings are unavailable.
|
||||
|
||||
use lore::core::db::create_connection;
|
||||
use lore::search::{FtsQueryMode, SearchFilters, SearchMode, search_fts, search_hybrid};
|
||||
use rusqlite::Connection;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn create_test_db() -> (TempDir, Connection) {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let db_path = tmp.path().join("test.db");
|
||||
let conn = create_connection(&db_path).unwrap();
|
||||
|
||||
let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
|
||||
|
||||
for version in 1..=9 {
|
||||
let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.filter(|e| {
|
||||
e.file_name()
|
||||
.to_string_lossy()
|
||||
.starts_with(&format!("{:03}", version))
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert!(!entries.is_empty(), "Migration {} not found", version);
|
||||
let sql = std::fs::read_to_string(entries[0].path()).unwrap();
|
||||
conn.execute_batch(&sql)
|
||||
.unwrap_or_else(|e| panic!("Migration {} failed: {}", version, e));
|
||||
}
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
|
||||
[],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
(tmp, conn)
|
||||
}
|
||||
|
||||
fn insert_document(conn: &Connection, id: i64, source_type: &str, title: &str, content: &str) {
|
||||
conn.execute(
|
||||
"INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url, author_username)
|
||||
VALUES (?1, ?2, ?1, 1, ?3, ?4, 'hash_' || ?1, 'https://example.com/' || ?1, 'testuser')",
|
||||
rusqlite::params![id, source_type, title, content],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
|
||||
/// In lexical mode the hybrid entry point must answer from FTS alone: it
/// needs no embedding client and emits no Ollama-related warnings.
#[test]
fn lexical_mode_uses_fts_only() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "issue", "Authentication bug", "OAuth token refresh fails silently.");
    insert_document(&conn, 2, "issue", "Database migration", "Migration script crashes on PostgreSQL.");

    let filters = SearchFilters { limit: 10, ..Default::default() };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let search = search_hybrid(
        &conn,
        None, // no embedding client needed for lexical mode
        "authentication",
        SearchMode::Lexical,
        &filters,
        FtsQueryMode::Safe,
    );
    let (results, warnings) = runtime.block_on(search).unwrap();

    assert!(!results.is_empty(), "Lexical search should find results");
    assert_eq!(results[0].document_id, 1);
    assert!(
        warnings.iter().all(|w| !w.contains("Ollama")),
        "Lexical mode should not warn about Ollama"
    );
}
|
||||
|
||||
/// FTS search must work on a database migrated only through 008 (FTS5),
/// i.e. without the embeddings/vector tables ever being created.
#[test]
fn lexical_mode_no_embeddings_required() {
    // Use in-memory DB without sqlite-vec for pure FTS
    let conn = Connection::open_in_memory().unwrap();
    conn.pragma_update(None, "foreign_keys", "ON").unwrap();

    let migrations_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("migrations");
    // Only apply through migration 008 (FTS5, no embeddings)
    for version in 1..=8 {
        let entries: Vec<_> = std::fs::read_dir(&migrations_dir)
            .unwrap()
            .filter_map(|e| e.ok())
            .filter(|e| {
                e.file_name()
                    .to_string_lossy()
                    .starts_with(&format!("{:03}", version))
            })
            .collect();
        // Fail with a clear message instead of an opaque index-out-of-bounds
        // panic when a migration file is missing (matches create_test_db).
        assert!(!entries.is_empty(), "Migration {} not found", version);
        let sql = std::fs::read_to_string(entries[0].path()).unwrap();
        conn.execute_batch(&sql).unwrap();
    }

    conn.execute(
        "INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project')",
        [],
    )
    .unwrap();

    conn.execute(
        "INSERT INTO documents (id, source_type, source_id, project_id, title, content_text, content_hash, url)
         VALUES (1, 'issue', 1, 1, 'Test issue', 'Content about testing and verification.', 'h1', 'https://example.com/1')",
        [],
    )
    .unwrap();

    let results = search_fts(&conn, "testing", 10, FtsQueryMode::Safe).unwrap();
    assert!(!results.is_empty(), "FTS should work without embeddings tables");
}
|
||||
|
||||
/// Hybrid mode without an embedding client must degrade gracefully: it still
/// returns FTS results and surfaces a warning about the missing vector leg.
#[test]
fn hybrid_mode_degrades_to_fts_without_client() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "issue", "Performance issue", "Application is slow under load.");

    let filters = SearchFilters { limit: 10, ..Default::default() };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let (results, warnings) = runtime
        .block_on(search_hybrid(
            &conn,
            None, // No Ollama client
            "performance slow",
            SearchMode::Hybrid,
            &filters,
            FtsQueryMode::Safe,
        ))
        .unwrap();

    assert!(!results.is_empty(), "Should fall back to FTS results");

    // Accept any of the phrasings the degradation warning might use.
    let degradation_terms = ["vector", "ollama", "client", "fallback", "fts"];
    let has_degradation_warning = warnings.iter().any(|w| {
        let lowered = w.to_lowercase();
        degradation_terms.iter().any(|term| lowered.contains(term))
    });
    assert!(
        has_degradation_warning,
        "Should produce a degradation warning, got: {:?}",
        warnings
    );
}
|
||||
|
||||
/// Reciprocal-rank fusion must assign a positive score to every document
/// that appears in either input signal.
#[test]
fn rrf_ranking_combines_signals() {
    use lore::search::rank_rrf;

    // Doc 1 wins the vector signal (smaller distance); doc 2 wins BM25.
    let by_vector = vec![(1_i64, 0.1), (2, 0.5)];
    let by_bm25 = vec![(2_i64, -5.0), (1, -3.0)];

    let fused = rank_rrf(&by_vector, &by_bm25);

    assert_eq!(fused.len(), 2, "Should return both documents");
    // Both docs appear in both signals, so both get RRF scores.
    for entry in &fused {
        assert!(entry.rrf_score > 0.0, "RRF score should be positive");
    }
}
|
||||
|
||||
/// `apply_filters` with a source-type filter must keep only matching documents.
#[test]
fn filters_by_source_type() {
    let (_tmp, conn) = create_test_db();

    insert_document(&conn, 1, "issue", "Bug report", "Authentication bug in login flow.");
    insert_document(&conn, 2, "merge_request", "Fix auth", "Fixed authentication issue.");

    let filters = SearchFilters {
        source_type: Some(lore::documents::SourceType::Issue),
        limit: 10,
        ..Default::default()
    };

    let candidates = vec![1, 2];
    let kept = lore::search::apply_filters(&conn, &candidates, &filters).unwrap();

    assert_eq!(kept.len(), 1, "Filter should remove non-issue documents");
    assert_eq!(kept[0], 1, "Only issue document should remain");
}
|
||||
|
||||
/// Compile-time/equality smoke test: all three `SearchMode` variants exist
/// and compare pairwise distinct.
#[test]
fn search_mode_variants_exist() {
    let modes = [SearchMode::Hybrid, SearchMode::Lexical, SearchMode::Semantic];

    for (i, a) in modes.iter().enumerate() {
        for b in &modes[i + 1..] {
            assert_ne!(a, b);
        }
    }
}
|
||||
Reference in New Issue
Block a user