feat: implement per-note search and document pipeline

- Add SourceType::Note with extract_note_document() and ParentMetadataCache
- Migration 022: composite indexes for notes queries + author_id column
- Migration 024: table rebuild adding 'note' to CHECK constraints, defense triggers
- Migration 025: backfill existing non-system notes into dirty queue
- Add lore notes CLI command with 17 filter options (author, path, resolution, etc.)
- Support table/json/jsonl/csv output formats with field selection
- Wire note dirty tracking through discussion and MR discussion ingestion
- Fix test_migration_024_preserves_existing_data off-by-one (tested wrong migration)
- Fix upsert_document_inner returning false for label/path-only changes
This commit is contained in:
teernisse
2026-02-12 12:37:11 -05:00
parent fda9cd8835
commit 83cd16c918
21 changed files with 5345 additions and 126 deletions

View File

@@ -186,6 +186,31 @@ const COMMAND_FLAGS: &[(&str, &[&str])] = &[
],
),
("drift", &["--threshold", "--project"]),
(
"notes",
&[
"--limit",
"--fields",
"--format",
"--author",
"--note-type",
"--contains",
"--note-id",
"--gitlab-note-id",
"--discussion-id",
"--include-system",
"--for-issue",
"--for-mr",
"--project",
"--since",
"--until",
"--path",
"--resolution",
"--sort",
"--asc",
"--open",
],
),
(
"init",
&[

View File

@@ -39,6 +39,7 @@ pub fn run_generate_docs(
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
result.seeded += seed_dirty_notes(&conn, project_filter)?;
}
let regen =
@@ -67,6 +68,10 @@ fn seed_dirty(
SourceType::Issue => "issues",
SourceType::MergeRequest => "merge_requests",
SourceType::Discussion => "discussions",
SourceType::Note => {
// NOTE-2E will implement seed_dirty_notes separately (needs is_system filter)
unreachable!("Note seeding handled by seed_dirty_notes, not seed_dirty")
}
};
let type_str = source_type.as_str();
let now = chrono::Utc::now().timestamp_millis();
@@ -125,6 +130,55 @@ fn seed_dirty(
Ok(total_seeded)
}
/// Seed `dirty_sources` with every non-system note, walking the `notes`
/// table in id-ordered chunks of `FULL_MODE_CHUNK_SIZE` to bound statement
/// size. Rows already queued are left untouched (`ON CONFLICT ... DO
/// NOTHING`), so re-running is idempotent.
///
/// Returns the number of rows newly inserted into `dirty_sources`.
fn seed_dirty_notes(conn: &Connection, project_filter: Option<&str>) -> Result<usize> {
    let now = chrono::Utc::now().timestamp_millis();
    // Resolve the project path to its id once, not once per chunk.
    let project_id = project_filter
        .map(|project| resolve_project(conn, project))
        .transpose()?;
    let mut total_seeded: usize = 0;
    let mut last_id: i64 = 0;
    loop {
        // Insert the next chunk, then compute the high-water mark of the rows
        // that were *scanned* (not merely inserted). Advancing on the scanned
        // max id — and terminating only when the scan comes back empty — keeps
        // the walk moving even when an entire chunk already exists in
        // dirty_sources; breaking on `inserted == 0` would end seeding
        // prematurely and skip any not-yet-queued notes in later chunks.
        let (inserted, max_scanned_id): (usize, Option<i64>) = if let Some(project_id) = project_id
        {
            let inserted = conn.execute(
                "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                 SELECT 'note', id, ?1, 0, NULL, NULL, NULL
                 FROM notes WHERE id > ?2 AND project_id = ?3 AND is_system = 0 ORDER BY id LIMIT ?4
                 ON CONFLICT(source_type, source_id) DO NOTHING",
                rusqlite::params![now, last_id, project_id, FULL_MODE_CHUNK_SIZE],
            )?;
            // The scan window must apply the same filters as the INSERT above
            // (including project_id); otherwise last_id advances out of step
            // with the rows actually being seeded.
            let max_scanned_id = conn.query_row(
                "SELECT MAX(id) FROM (SELECT id FROM notes WHERE id > ?1 AND project_id = ?2 AND is_system = 0 ORDER BY id LIMIT ?3)",
                rusqlite::params![last_id, project_id, FULL_MODE_CHUNK_SIZE],
                |row| row.get(0),
            )?;
            (inserted, max_scanned_id)
        } else {
            let inserted = conn.execute(
                "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                 SELECT 'note', id, ?1, 0, NULL, NULL, NULL
                 FROM notes WHERE id > ?2 AND is_system = 0 ORDER BY id LIMIT ?3
                 ON CONFLICT(source_type, source_id) DO NOTHING",
                rusqlite::params![now, last_id, FULL_MODE_CHUNK_SIZE],
            )?;
            let max_scanned_id = conn.query_row(
                "SELECT MAX(id) FROM (SELECT id FROM notes WHERE id > ?1 AND is_system = 0 ORDER BY id LIMIT ?2)",
                rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
                |row| row.get(0),
            )?;
            (inserted, max_scanned_id)
        };
        total_seeded += inserted;
        match max_scanned_id {
            // MAX() over an empty scan is NULL: no notes left to walk.
            None => break,
            // Every scanned id satisfies `id > last_id`, so this strictly
            // increases last_id and the loop always makes progress.
            Some(id) => last_id = id,
        }
    }
    info!(
        source_type = "note",
        seeded = total_seeded,
        "Seeded dirty_sources"
    );
    Ok(total_seeded)
}
pub fn print_generate_docs(result: &GenerateDocsResult) {
let mode = if result.full_mode {
"full"
@@ -186,3 +240,81 @@ pub fn print_generate_docs_json(result: &GenerateDocsResult, elapsed_ms: u64) {
};
println!("{}", serde_json::to_string(&output).unwrap());
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::core::db::{create_connection, run_migrations};

    use super::*;

    /// Build an in-memory database with one project, one issue, and one
    /// discussion — the minimum graph a note row needs to attach to.
    fn setup_db() -> Connection {
        let db = create_connection(Path::new(":memory:")).unwrap();
        run_migrations(&db).unwrap();
        for sql in [
            "INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url) VALUES (1, 100, 'group/project', 'https://gitlab.com/group/project')",
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 1, 'Test', 'opened', 1000, 2000, 3000)",
            "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
        ] {
            db.execute(sql, []).unwrap();
        }
        db
    }

    /// Insert a note on discussion 1 (project 1) authored by 'alice'.
    fn insert_note(conn: &Connection, id: i64, gitlab_id: i64, is_system: bool) {
        conn.execute(
            "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (?1, ?2, 1, 1, 'alice', 'note body', 1000, 2000, 3000, ?3)",
            rusqlite::params![id, gitlab_id, is_system as i32],
        )
        .unwrap();
    }

    /// Count how many note rows are currently queued in dirty_sources.
    fn dirty_note_count(conn: &Connection) -> i64 {
        conn.query_row(
            "SELECT COUNT(*) FROM dirty_sources WHERE source_type = 'note'",
            [],
            |row| row.get(0),
        )
        .unwrap()
    }

    #[test]
    fn test_full_seed_includes_notes() {
        let conn = setup_db();
        for (id, gitlab_id) in [(1, 101), (2, 102), (3, 103)] {
            insert_note(&conn, id, gitlab_id, false);
        }
        insert_note(&conn, 4, 104, true); // system note — should be excluded

        assert_eq!(seed_dirty_notes(&conn, None).unwrap(), 3);
        assert_eq!(dirty_note_count(&conn), 3);
    }

    #[test]
    fn test_note_document_count_stable_after_second_generate_docs_full() {
        let conn = setup_db();
        insert_note(&conn, 1, 101, false);
        insert_note(&conn, 2, 102, false);

        assert_eq!(seed_dirty_notes(&conn, None).unwrap(), 2);
        // Second run should be idempotent (ON CONFLICT DO NOTHING)
        assert_eq!(seed_dirty_notes(&conn, None).unwrap(), 0);
        assert_eq!(dirty_note_count(&conn), 2);
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -30,8 +30,10 @@ pub use ingest::{
};
pub use init::{InitInputs, InitOptions, InitResult, run_init};
pub use list::{
ListFilters, MrListFilters, open_issue_in_browser, open_mr_in_browser, print_list_issues,
print_list_issues_json, print_list_mrs, print_list_mrs_json, run_list_issues, run_list_mrs,
ListFilters, MrListFilters, NoteListFilters, open_issue_in_browser, open_mr_in_browser,
print_list_issues, print_list_issues_json, print_list_mrs, print_list_mrs_json,
print_list_notes, print_list_notes_csv, print_list_notes_json, print_list_notes_jsonl,
query_notes, run_list_issues, run_list_mrs,
};
pub use search::{
SearchCliFilters, SearchResponse, print_search_results, print_search_results_json, run_search,

View File

@@ -334,6 +334,7 @@ pub fn print_search_results(response: &SearchResponse) {
"issue" => "Issue",
"merge_request" => "MR",
"discussion" => "Discussion",
"note" => "Note",
_ => &result.source_type,
};

View File

@@ -112,6 +112,9 @@ pub enum Commands {
/// List or show merge requests
Mrs(MrsArgs),
/// List notes from discussions
Notes(NotesArgs),
/// Ingest data from GitLab
Ingest(IngestArgs),
@@ -489,6 +492,113 @@ pub struct MrsArgs {
pub no_open: bool,
}
#[derive(Parser)]
// NOTE(review): in a clap derive struct, the `///` doc comments below are
// emitted verbatim as `--help` text and the snake_case field names become the
// kebab-case long flag names — both are user-visible CLI surface, not just
// documentation, so they must stay in sync with the docs/COMMAND_FLAGS list.
// The \x1b[1m / \x1b[0m escapes render the "Examples:" heading in bold on
// ANSI-capable terminals.
#[command(after_help = "\x1b[1mExamples:\x1b[0m
  lore notes                                      # List 50 most recent notes
  lore notes --author alice --since 7d            # Notes by alice in last 7 days
  lore notes --for-issue 42 -p group/repo         # Notes on issue #42
  lore notes --path src/ --resolution unresolved  # Unresolved diff notes in src/")]
pub struct NotesArgs {
    // ---- Output options ----
    /// Maximum results
    #[arg(
        short = 'n',
        long = "limit",
        default_value = "50",
        help_heading = "Output"
    )]
    pub limit: usize,
    /// Select output fields (comma-separated, or 'minimal' preset: id,author_username,body,created_at_iso)
    // value_delimiter lets users pass `--fields a,b,c` as one argument.
    #[arg(long, help_heading = "Output", value_delimiter = ',')]
    pub fields: Option<Vec<String>>,
    /// Output format (table, json, jsonl, csv)
    #[arg(
        long,
        default_value = "table",
        value_parser = ["table", "json", "jsonl", "csv"],
        help_heading = "Output"
    )]
    pub format: String,
    // ---- Filters ----
    /// Filter by author username
    #[arg(short = 'a', long, help_heading = "Filters")]
    pub author: Option<String>,
    /// Filter by note type (DiffNote, DiscussionNote)
    // NOTE(review): free-form string, not a value_parser list — presumably to
    // allow other GitLab note types; confirm downstream matching is exact.
    #[arg(long, help_heading = "Filters")]
    pub note_type: Option<String>,
    /// Filter by body text (substring match)
    #[arg(long, help_heading = "Filters")]
    pub contains: Option<String>,
    /// Filter by internal note ID
    #[arg(long, help_heading = "Filters")]
    pub note_id: Option<i64>,
    /// Filter by GitLab note ID
    #[arg(long, help_heading = "Filters")]
    pub gitlab_note_id: Option<i64>,
    /// Filter by discussion ID
    #[arg(long, help_heading = "Filters")]
    pub discussion_id: Option<String>,
    /// Include system notes (excluded by default)
    #[arg(long, help_heading = "Filters")]
    pub include_system: bool,
    /// Filter to notes on a specific issue IID (requires --project or default_project)
    // --for-issue and --for-mr are mutually exclusive; clap rejects both at parse time.
    #[arg(long, conflicts_with = "for_mr", help_heading = "Filters")]
    pub for_issue: Option<i64>,
    /// Filter to notes on a specific MR IID (requires --project or default_project)
    #[arg(long, conflicts_with = "for_issue", help_heading = "Filters")]
    pub for_mr: Option<i64>,
    /// Filter by project path
    #[arg(short = 'p', long, help_heading = "Filters")]
    pub project: Option<String>,
    /// Filter by time (7d, 2w, 1m, or YYYY-MM-DD)
    #[arg(long, help_heading = "Filters")]
    pub since: Option<String>,
    /// Filter until date (YYYY-MM-DD, inclusive end-of-day)
    #[arg(long, help_heading = "Filters")]
    pub until: Option<String>,
    /// Filter by file path (exact match or prefix with trailing /)
    #[arg(long, help_heading = "Filters")]
    pub path: Option<String>,
    /// Filter by resolution status (any, unresolved, resolved)
    // No default here: when omitted, resolution is not filtered at all.
    #[arg(
        long,
        value_parser = ["any", "unresolved", "resolved"],
        help_heading = "Filters"
    )]
    pub resolution: Option<String>,
    // ---- Sorting ----
    /// Sort field (created, updated)
    #[arg(
        long,
        value_parser = ["created", "updated"],
        default_value = "created",
        help_heading = "Sorting"
    )]
    pub sort: String,
    /// Sort ascending (default: descending)
    #[arg(long, help_heading = "Sorting")]
    pub asc: bool,
    // ---- Actions ----
    /// Open first matching item in browser
    #[arg(long, help_heading = "Actions")]
    pub open: bool,
}
#[derive(Parser)]
pub struct IngestArgs {
/// Entity to ingest (issues, mrs). Omit to ingest everything
@@ -556,8 +666,8 @@ pub struct SearchArgs {
#[arg(long, default_value = "hybrid", value_parser = ["lexical", "hybrid", "semantic"], help_heading = "Mode")]
pub mode: String,
/// Filter by source type (issue, mr, discussion)
#[arg(long = "type", value_name = "TYPE", value_parser = ["issue", "mr", "discussion"], help_heading = "Filters")]
/// Filter by source type (issue, mr, discussion, note)
#[arg(long = "type", value_name = "TYPE", value_parser = ["issue", "mr", "discussion", "note"], help_heading = "Filters")]
pub source_type: Option<String>,
/// Filter by author username

View File

@@ -64,6 +64,10 @@ pub fn expand_fields_preset(fields: &[String], entity: &str) -> Vec<String> {
.iter()
.map(|s| (*s).to_string())
.collect(),
"notes" => ["id", "author_username", "body", "created_at_iso"]
.iter()
.map(|s| (*s).to_string())
.collect(),
_ => fields.to_vec(),
}
} else {
@@ -82,3 +86,25 @@ pub fn strip_schemas(commands: &mut serde_json::Value) {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // The 'minimal' preset for the notes entity expands to the documented
    // four-field set.
    #[test]
    fn test_expand_fields_preset_notes() {
        let requested = vec!["minimal".to_string()];
        assert_eq!(
            expand_fields_preset(&requested, "notes"),
            ["id", "author_username", "body", "created_at_iso"]
        );
    }

    // An explicit field list (no preset keyword) passes through untouched.
    #[test]
    fn test_expand_fields_preset_passthrough() {
        let requested: Vec<String> = ["id", "body"].iter().map(|s| (*s).to_string()).collect();
        assert_eq!(expand_fields_preset(&requested, "notes"), ["id", "body"]);
    }
}