feat: implement per-note search and document pipeline

- Add SourceType::Note with extract_note_document() and ParentMetadataCache
- Migration 022: composite indexes for notes queries + author_id column
- Migration 024: table rebuild adding 'note' to CHECK constraints, defense triggers
- Migration 025: backfill existing non-system notes into dirty queue
- Add lore notes CLI command with 17 filter options (author, path, resolution, etc.)
- Support table/json/jsonl/csv output formats with field selection
- Wire note dirty tracking through discussion and MR discussion ingestion
- Fix test_migration_024_preserves_existing_data off-by-one (tested wrong migration)
- Fix upsert_document_inner returning false for label/path-only changes
This commit is contained in:
teernisse
2026-02-12 12:37:11 -05:00
parent fda9cd8835
commit 83cd16c918
21 changed files with 5345 additions and 126 deletions

View File

@@ -186,6 +186,31 @@ const COMMAND_FLAGS: &[(&str, &[&str])] = &[
],
),
("drift", &["--threshold", "--project"]),
(
"notes",
&[
"--limit",
"--fields",
"--format",
"--author",
"--note-type",
"--contains",
"--note-id",
"--gitlab-note-id",
"--discussion-id",
"--include-system",
"--for-issue",
"--for-mr",
"--project",
"--since",
"--until",
"--path",
"--resolution",
"--sort",
"--asc",
"--open",
],
),
(
"init",
&[

View File

@@ -39,6 +39,7 @@ pub fn run_generate_docs(
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
result.seeded += seed_dirty_notes(&conn, project_filter)?;
}
let regen =
@@ -67,6 +68,10 @@ fn seed_dirty(
SourceType::Issue => "issues",
SourceType::MergeRequest => "merge_requests",
SourceType::Discussion => "discussions",
SourceType::Note => {
// NOTE-2E will implement seed_dirty_notes separately (needs is_system filter)
unreachable!("Note seeding handled by seed_dirty_notes, not seed_dirty")
}
};
let type_str = source_type.as_str();
let now = chrono::Utc::now().timestamp_millis();
@@ -125,6 +130,55 @@ fn seed_dirty(
Ok(total_seeded)
}
/// Seed `dirty_sources` with every non-system note, walking the `notes`
/// table in id-ordered chunks of `FULL_MODE_CHUNK_SIZE` to bound statement
/// size. Rows already queued are left untouched (`ON CONFLICT ... DO
/// NOTHING`), so re-running is idempotent.
///
/// Returns the number of rows newly inserted into `dirty_sources`.
fn seed_dirty_notes(conn: &Connection, project_filter: Option<&str>) -> Result<usize> {
    let now = chrono::Utc::now().timestamp_millis();
    // Resolve the project path to its id once, not once per chunk.
    let project_id = project_filter
        .map(|project| resolve_project(conn, project))
        .transpose()?;
    let mut total_seeded: usize = 0;
    let mut last_id: i64 = 0;
    loop {
        // Insert the next chunk, then compute the high-water mark of the rows
        // that were *scanned* (not merely inserted). Advancing on the scanned
        // max id — and terminating only when the scan comes back empty — keeps
        // the walk moving even when an entire chunk already exists in
        // dirty_sources; breaking on `inserted == 0` would end seeding
        // prematurely and skip any not-yet-queued notes in later chunks.
        let (inserted, max_scanned_id): (usize, Option<i64>) = if let Some(project_id) = project_id
        {
            let inserted = conn.execute(
                "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                 SELECT 'note', id, ?1, 0, NULL, NULL, NULL
                 FROM notes WHERE id > ?2 AND project_id = ?3 AND is_system = 0 ORDER BY id LIMIT ?4
                 ON CONFLICT(source_type, source_id) DO NOTHING",
                rusqlite::params![now, last_id, project_id, FULL_MODE_CHUNK_SIZE],
            )?;
            // The scan window must apply the same filters as the INSERT above
            // (including project_id); otherwise last_id advances out of step
            // with the rows actually being seeded.
            let max_scanned_id = conn.query_row(
                "SELECT MAX(id) FROM (SELECT id FROM notes WHERE id > ?1 AND project_id = ?2 AND is_system = 0 ORDER BY id LIMIT ?3)",
                rusqlite::params![last_id, project_id, FULL_MODE_CHUNK_SIZE],
                |row| row.get(0),
            )?;
            (inserted, max_scanned_id)
        } else {
            let inserted = conn.execute(
                "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                 SELECT 'note', id, ?1, 0, NULL, NULL, NULL
                 FROM notes WHERE id > ?2 AND is_system = 0 ORDER BY id LIMIT ?3
                 ON CONFLICT(source_type, source_id) DO NOTHING",
                rusqlite::params![now, last_id, FULL_MODE_CHUNK_SIZE],
            )?;
            let max_scanned_id = conn.query_row(
                "SELECT MAX(id) FROM (SELECT id FROM notes WHERE id > ?1 AND is_system = 0 ORDER BY id LIMIT ?2)",
                rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
                |row| row.get(0),
            )?;
            (inserted, max_scanned_id)
        };
        total_seeded += inserted;
        match max_scanned_id {
            // MAX() over an empty scan is NULL: no notes left to walk.
            None => break,
            // Every scanned id satisfies `id > last_id`, so this strictly
            // increases last_id and the loop always makes progress.
            Some(id) => last_id = id,
        }
    }
    info!(
        source_type = "note",
        seeded = total_seeded,
        "Seeded dirty_sources"
    );
    Ok(total_seeded)
}
pub fn print_generate_docs(result: &GenerateDocsResult) {
let mode = if result.full_mode {
"full"
@@ -186,3 +240,81 @@ pub fn print_generate_docs_json(result: &GenerateDocsResult, elapsed_ms: u64) {
};
println!("{}", serde_json::to_string(&output).unwrap());
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::core::db::{create_connection, run_migrations};

    use super::*;

    /// Build an in-memory database with one project, one issue, and one
    /// discussion — the minimum graph a note row needs to attach to.
    fn setup_db() -> Connection {
        let db = create_connection(Path::new(":memory:")).unwrap();
        run_migrations(&db).unwrap();
        for sql in [
            "INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url) VALUES (1, 100, 'group/project', 'https://gitlab.com/group/project')",
            "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 1, 'Test', 'opened', 1000, 2000, 3000)",
            "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
        ] {
            db.execute(sql, []).unwrap();
        }
        db
    }

    /// Insert a note on discussion 1 (project 1) authored by 'alice'.
    fn insert_note(conn: &Connection, id: i64, gitlab_id: i64, is_system: bool) {
        conn.execute(
            "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (?1, ?2, 1, 1, 'alice', 'note body', 1000, 2000, 3000, ?3)",
            rusqlite::params![id, gitlab_id, is_system as i32],
        )
        .unwrap();
    }

    /// Count how many note rows are currently queued in dirty_sources.
    fn dirty_note_count(conn: &Connection) -> i64 {
        conn.query_row(
            "SELECT COUNT(*) FROM dirty_sources WHERE source_type = 'note'",
            [],
            |row| row.get(0),
        )
        .unwrap()
    }

    #[test]
    fn test_full_seed_includes_notes() {
        let conn = setup_db();
        for (id, gitlab_id) in [(1, 101), (2, 102), (3, 103)] {
            insert_note(&conn, id, gitlab_id, false);
        }
        insert_note(&conn, 4, 104, true); // system note — should be excluded

        assert_eq!(seed_dirty_notes(&conn, None).unwrap(), 3);
        assert_eq!(dirty_note_count(&conn), 3);
    }

    #[test]
    fn test_note_document_count_stable_after_second_generate_docs_full() {
        let conn = setup_db();
        insert_note(&conn, 1, 101, false);
        insert_note(&conn, 2, 102, false);

        assert_eq!(seed_dirty_notes(&conn, None).unwrap(), 2);
        // Second run should be idempotent (ON CONFLICT DO NOTHING)
        assert_eq!(seed_dirty_notes(&conn, None).unwrap(), 0);
        assert_eq!(dirty_note_count(&conn), 2);
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -30,8 +30,10 @@ pub use ingest::{
};
pub use init::{InitInputs, InitOptions, InitResult, run_init};
pub use list::{
ListFilters, MrListFilters, open_issue_in_browser, open_mr_in_browser, print_list_issues,
print_list_issues_json, print_list_mrs, print_list_mrs_json, run_list_issues, run_list_mrs,
ListFilters, MrListFilters, NoteListFilters, open_issue_in_browser, open_mr_in_browser,
print_list_issues, print_list_issues_json, print_list_mrs, print_list_mrs_json,
print_list_notes, print_list_notes_csv, print_list_notes_json, print_list_notes_jsonl,
query_notes, run_list_issues, run_list_mrs,
};
pub use search::{
SearchCliFilters, SearchResponse, print_search_results, print_search_results_json, run_search,

View File

@@ -334,6 +334,7 @@ pub fn print_search_results(response: &SearchResponse) {
"issue" => "Issue",
"merge_request" => "MR",
"discussion" => "Discussion",
"note" => "Note",
_ => &result.source_type,
};

View File

@@ -112,6 +112,9 @@ pub enum Commands {
/// List or show merge requests
Mrs(MrsArgs),
/// List notes from discussions
Notes(NotesArgs),
/// Ingest data from GitLab
Ingest(IngestArgs),
@@ -489,6 +492,113 @@ pub struct MrsArgs {
pub no_open: bool,
}
#[derive(Parser)]
// NOTE(review): in a clap derive struct, the `///` doc comments below are
// emitted verbatim as `--help` text and the snake_case field names become the
// kebab-case long flag names — both are user-visible CLI surface, not just
// documentation, so they must stay in sync with the docs/COMMAND_FLAGS list.
// The \x1b[1m / \x1b[0m escapes render the "Examples:" heading in bold on
// ANSI-capable terminals.
#[command(after_help = "\x1b[1mExamples:\x1b[0m
  lore notes                                      # List 50 most recent notes
  lore notes --author alice --since 7d            # Notes by alice in last 7 days
  lore notes --for-issue 42 -p group/repo         # Notes on issue #42
  lore notes --path src/ --resolution unresolved  # Unresolved diff notes in src/")]
pub struct NotesArgs {
    // ---- Output options ----
    /// Maximum results
    #[arg(
        short = 'n',
        long = "limit",
        default_value = "50",
        help_heading = "Output"
    )]
    pub limit: usize,
    /// Select output fields (comma-separated, or 'minimal' preset: id,author_username,body,created_at_iso)
    // value_delimiter lets users pass `--fields a,b,c` as one argument.
    #[arg(long, help_heading = "Output", value_delimiter = ',')]
    pub fields: Option<Vec<String>>,
    /// Output format (table, json, jsonl, csv)
    #[arg(
        long,
        default_value = "table",
        value_parser = ["table", "json", "jsonl", "csv"],
        help_heading = "Output"
    )]
    pub format: String,
    // ---- Filters ----
    /// Filter by author username
    #[arg(short = 'a', long, help_heading = "Filters")]
    pub author: Option<String>,
    /// Filter by note type (DiffNote, DiscussionNote)
    // NOTE(review): free-form string, not a value_parser list — presumably to
    // allow other GitLab note types; confirm downstream matching is exact.
    #[arg(long, help_heading = "Filters")]
    pub note_type: Option<String>,
    /// Filter by body text (substring match)
    #[arg(long, help_heading = "Filters")]
    pub contains: Option<String>,
    /// Filter by internal note ID
    #[arg(long, help_heading = "Filters")]
    pub note_id: Option<i64>,
    /// Filter by GitLab note ID
    #[arg(long, help_heading = "Filters")]
    pub gitlab_note_id: Option<i64>,
    /// Filter by discussion ID
    #[arg(long, help_heading = "Filters")]
    pub discussion_id: Option<String>,
    /// Include system notes (excluded by default)
    #[arg(long, help_heading = "Filters")]
    pub include_system: bool,
    /// Filter to notes on a specific issue IID (requires --project or default_project)
    // --for-issue and --for-mr are mutually exclusive; clap rejects both at parse time.
    #[arg(long, conflicts_with = "for_mr", help_heading = "Filters")]
    pub for_issue: Option<i64>,
    /// Filter to notes on a specific MR IID (requires --project or default_project)
    #[arg(long, conflicts_with = "for_issue", help_heading = "Filters")]
    pub for_mr: Option<i64>,
    /// Filter by project path
    #[arg(short = 'p', long, help_heading = "Filters")]
    pub project: Option<String>,
    /// Filter by time (7d, 2w, 1m, or YYYY-MM-DD)
    #[arg(long, help_heading = "Filters")]
    pub since: Option<String>,
    /// Filter until date (YYYY-MM-DD, inclusive end-of-day)
    #[arg(long, help_heading = "Filters")]
    pub until: Option<String>,
    /// Filter by file path (exact match or prefix with trailing /)
    #[arg(long, help_heading = "Filters")]
    pub path: Option<String>,
    /// Filter by resolution status (any, unresolved, resolved)
    // No default here: when omitted, resolution is not filtered at all.
    #[arg(
        long,
        value_parser = ["any", "unresolved", "resolved"],
        help_heading = "Filters"
    )]
    pub resolution: Option<String>,
    // ---- Sorting ----
    /// Sort field (created, updated)
    #[arg(
        long,
        value_parser = ["created", "updated"],
        default_value = "created",
        help_heading = "Sorting"
    )]
    pub sort: String,
    /// Sort ascending (default: descending)
    #[arg(long, help_heading = "Sorting")]
    pub asc: bool,
    // ---- Actions ----
    /// Open first matching item in browser
    #[arg(long, help_heading = "Actions")]
    pub open: bool,
}
#[derive(Parser)]
pub struct IngestArgs {
/// Entity to ingest (issues, mrs). Omit to ingest everything
@@ -556,8 +666,8 @@ pub struct SearchArgs {
#[arg(long, default_value = "hybrid", value_parser = ["lexical", "hybrid", "semantic"], help_heading = "Mode")]
pub mode: String,
/// Filter by source type (issue, mr, discussion)
#[arg(long = "type", value_name = "TYPE", value_parser = ["issue", "mr", "discussion"], help_heading = "Filters")]
/// Filter by source type (issue, mr, discussion, note)
#[arg(long = "type", value_name = "TYPE", value_parser = ["issue", "mr", "discussion", "note"], help_heading = "Filters")]
pub source_type: Option<String>,
/// Filter by author username

View File

@@ -64,6 +64,10 @@ pub fn expand_fields_preset(fields: &[String], entity: &str) -> Vec<String> {
.iter()
.map(|s| (*s).to_string())
.collect(),
"notes" => ["id", "author_username", "body", "created_at_iso"]
.iter()
.map(|s| (*s).to_string())
.collect(),
_ => fields.to_vec(),
}
} else {
@@ -82,3 +86,25 @@ pub fn strip_schemas(commands: &mut serde_json::Value) {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // The 'minimal' preset for the notes entity expands to the documented
    // four-field set.
    #[test]
    fn test_expand_fields_preset_notes() {
        let requested = vec!["minimal".to_string()];
        assert_eq!(
            expand_fields_preset(&requested, "notes"),
            ["id", "author_username", "body", "created_at_iso"]
        );
    }

    // An explicit field list (no preset keyword) passes through untouched.
    #[test]
    fn test_expand_fields_preset_passthrough() {
        let requested: Vec<String> = ["id", "body"].iter().map(|s| (*s).to_string()).collect();
        assert_eq!(expand_fields_preset(&requested, "notes"), ["id", "body"]);
    }
}