feat: implement per-note search and document pipeline

- Add SourceType::Note with extract_note_document() and ParentMetadataCache
- Migration 022: composite indexes for notes queries + author_id column
- Migration 024: table rebuild adding 'note' to CHECK constraints, defense triggers
- Migration 025: backfill existing non-system notes into dirty queue
- Add lore notes CLI command with 17 filter options (author, path, resolution, etc.)
- Support table/json/jsonl/csv output formats with field selection
- Wire note dirty tracking through discussion and MR discussion ingestion
- Fix test_migration_024_preserves_existing_data off-by-one (tested wrong migration)
- Fix upsert_document_inner returning false for label/path-only changes
This commit is contained in:
teernisse
2026-02-12 12:37:11 -05:00
parent fda9cd8835
commit 83cd16c918
21 changed files with 5345 additions and 126 deletions

View File

@@ -39,6 +39,7 @@ pub fn run_generate_docs(
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
result.seeded += seed_dirty_notes(&conn, project_filter)?;
}
let regen =
@@ -67,6 +68,10 @@ fn seed_dirty(
SourceType::Issue => "issues",
SourceType::MergeRequest => "merge_requests",
SourceType::Discussion => "discussions",
SourceType::Note => {
// NOTE-2E will implement seed_dirty_notes separately (needs is_system filter)
unreachable!("Note seeding handled by seed_dirty_notes, not seed_dirty")
}
};
let type_str = source_type.as_str();
let now = chrono::Utc::now().timestamp_millis();
@@ -125,6 +130,55 @@ fn seed_dirty(
Ok(total_seeded)
}
fn seed_dirty_notes(conn: &Connection, project_filter: Option<&str>) -> Result<usize> {
let now = chrono::Utc::now().timestamp_millis();
let mut total_seeded: usize = 0;
let mut last_id: i64 = 0;
loop {
let inserted = if let Some(project) = project_filter {
let project_id = resolve_project(conn, project)?;
conn.execute(
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
SELECT 'note', id, ?1, 0, NULL, NULL, NULL
FROM notes WHERE id > ?2 AND project_id = ?3 AND is_system = 0 ORDER BY id LIMIT ?4
ON CONFLICT(source_type, source_id) DO NOTHING",
rusqlite::params![now, last_id, project_id, FULL_MODE_CHUNK_SIZE],
)?
} else {
conn.execute(
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
SELECT 'note', id, ?1, 0, NULL, NULL, NULL
FROM notes WHERE id > ?2 AND is_system = 0 ORDER BY id LIMIT ?3
ON CONFLICT(source_type, source_id) DO NOTHING",
rusqlite::params![now, last_id, FULL_MODE_CHUNK_SIZE],
)?
};
if inserted == 0 {
break;
}
let max_id: i64 = conn.query_row(
"SELECT MAX(id) FROM (SELECT id FROM notes WHERE id > ?1 AND is_system = 0 ORDER BY id LIMIT ?2)",
rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
|row| row.get(0),
)?;
total_seeded += inserted;
last_id = max_id;
}
info!(
source_type = "note",
seeded = total_seeded,
"Seeded dirty_sources"
);
Ok(total_seeded)
}
pub fn print_generate_docs(result: &GenerateDocsResult) {
let mode = if result.full_mode {
"full"
@@ -186,3 +240,81 @@ pub fn print_generate_docs_json(result: &GenerateDocsResult, elapsed_ms: u64) {
};
println!("{}", serde_json::to_string(&output).unwrap());
}
#[cfg(test)]
mod tests {
use std::path::Path;
use crate::core::db::{create_connection, run_migrations};
use super::*;
fn setup_db() -> Connection {
let conn = create_connection(Path::new(":memory:")).unwrap();
run_migrations(&conn).unwrap();
conn.execute(
"INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url) VALUES (1, 100, 'group/project', 'https://gitlab.com/group/project')",
[],
).unwrap();
conn.execute(
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 1, 'Test', 'opened', 1000, 2000, 3000)",
[],
).unwrap();
conn.execute(
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
[],
).unwrap();
conn
}
fn insert_note(conn: &Connection, id: i64, gitlab_id: i64, is_system: bool) {
conn.execute(
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (?1, ?2, 1, 1, 'alice', 'note body', 1000, 2000, 3000, ?3)",
rusqlite::params![id, gitlab_id, is_system as i32],
).unwrap();
}
#[test]
fn test_full_seed_includes_notes() {
let conn = setup_db();
insert_note(&conn, 1, 101, false);
insert_note(&conn, 2, 102, false);
insert_note(&conn, 3, 103, false);
insert_note(&conn, 4, 104, true); // system note — should be excluded
let seeded = seed_dirty_notes(&conn, None).unwrap();
assert_eq!(seeded, 3);
let count: i64 = conn
.query_row(
"SELECT COUNT(*) FROM dirty_sources WHERE source_type = 'note'",
[],
|row| row.get(0),
)
.unwrap();
assert_eq!(count, 3);
}
#[test]
fn test_note_document_count_stable_after_second_generate_docs_full() {
let conn = setup_db();
insert_note(&conn, 1, 101, false);
insert_note(&conn, 2, 102, false);
let first = seed_dirty_notes(&conn, None).unwrap();
assert_eq!(first, 2);
// Second run should be idempotent (ON CONFLICT DO NOTHING)
let second = seed_dirty_notes(&conn, None).unwrap();
assert_eq!(second, 0);
let count: i64 = conn
.query_row(
"SELECT COUNT(*) FROM dirty_sources WHERE source_type = 'note'",
[],
|row| row.get(0),
)
.unwrap();
assert_eq!(count, 2);
}
}