feat: implement per-note search and document pipeline
- Add SourceType::Note with extract_note_document() and ParentMetadataCache
- Migration 022: composite indexes for notes queries + author_id column
- Migration 024: table rebuild adding 'note' to CHECK constraints, defense triggers
- Migration 025: backfill existing non-system notes into dirty queue
- Add lore notes CLI command with 17 filter options (author, path, resolution, etc.)
- Support table/json/jsonl/csv output formats with field selection
- Wire note dirty tracking through discussion and MR discussion ingestion
- Fix test_migration_024_preserves_existing_data off-by-one (tested wrong migration)
- Fix upsert_document_inner returning false for label/path-only changes
This commit is contained in:
@@ -4,8 +4,8 @@ use tracing::{debug, instrument, warn};
|
||||
|
||||
use crate::core::error::Result;
|
||||
use crate::documents::{
|
||||
DocumentData, SourceType, extract_discussion_document, extract_issue_document,
|
||||
extract_mr_document,
|
||||
DocumentData, ParentMetadataCache, SourceType, extract_discussion_document,
|
||||
extract_issue_document, extract_mr_document, extract_note_document_cached,
|
||||
};
|
||||
use crate::ingestion::dirty_tracker::{clear_dirty, get_dirty_sources, record_dirty_error};
|
||||
|
||||
@@ -27,6 +27,7 @@ pub fn regenerate_dirty_documents(
|
||||
let mut result = RegenerateResult::default();
|
||||
|
||||
let mut estimated_total: usize = 0;
|
||||
let mut cache = ParentMetadataCache::new();
|
||||
|
||||
loop {
|
||||
let dirty = get_dirty_sources(conn)?;
|
||||
@@ -41,7 +42,7 @@ pub fn regenerate_dirty_documents(
|
||||
estimated_total = estimated_total.max(processed_so_far + remaining);
|
||||
|
||||
for (source_type, source_id) in &dirty {
|
||||
match regenerate_one(conn, *source_type, *source_id) {
|
||||
match regenerate_one(conn, *source_type, *source_id, &mut cache) {
|
||||
Ok(changed) => {
|
||||
if changed {
|
||||
result.regenerated += 1;
|
||||
@@ -83,11 +84,17 @@ pub fn regenerate_dirty_documents(
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn regenerate_one(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<bool> {
|
||||
fn regenerate_one(
|
||||
conn: &Connection,
|
||||
source_type: SourceType,
|
||||
source_id: i64,
|
||||
cache: &mut ParentMetadataCache,
|
||||
) -> Result<bool> {
|
||||
let doc = match source_type {
|
||||
SourceType::Issue => extract_issue_document(conn, source_id)?,
|
||||
SourceType::MergeRequest => extract_mr_document(conn, source_id)?,
|
||||
SourceType::Discussion => extract_discussion_document(conn, source_id)?,
|
||||
SourceType::Note => extract_note_document_cached(conn, source_id, cache)?,
|
||||
};
|
||||
|
||||
let Some(doc) = doc else {
|
||||
@@ -122,11 +129,7 @@ fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<bool>
|
||||
)
|
||||
.optional()?;
|
||||
|
||||
let content_changed = match &existing {
|
||||
Some((_, old_content_hash, _, _)) => old_content_hash != &doc.content_hash,
|
||||
None => true,
|
||||
};
|
||||
|
||||
// Fast path: if all three hashes match, nothing changed at all.
|
||||
if let Some((_, ref old_content_hash, ref old_labels_hash, ref old_paths_hash)) = existing
|
||||
&& old_content_hash == &doc.content_hash
|
||||
&& old_labels_hash == &doc.labels_hash
|
||||
@@ -134,6 +137,7 @@ fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<bool>
|
||||
{
|
||||
return Ok(false);
|
||||
}
|
||||
// Past this point at least one hash differs, so the document will be updated.
|
||||
|
||||
let labels_json = serde_json::to_string(&doc.labels).unwrap_or_else(|_| "[]".to_string());
|
||||
|
||||
@@ -243,7 +247,8 @@ fn upsert_document_inner(conn: &Connection, doc: &DocumentData) -> Result<bool>
|
||||
}
|
||||
}
|
||||
|
||||
Ok(content_changed)
|
||||
// We passed the triple-hash fast path, so at least one hash differs.
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn delete_document(conn: &Connection, source_type: SourceType, source_id: i64) -> Result<()> {
|
||||
@@ -473,4 +478,316 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(label_count, 1);
|
||||
}
|
||||
|
||||
// Test fixture: starts from the connection returned by `setup_db()` and adds
// the `merge_requests`, `mr_labels`, `discussions` and `notes` tables in a
// single batch, then hands the connection back to the caller.
// Panics (via `unwrap`) if the batch fails.
fn setup_note_db() -> Connection {
|
||||
let conn = setup_db();
|
||||
// All four CREATE TABLE statements run as one batch.
conn.execute_batch(
|
||||
"
|
||||
CREATE TABLE merge_requests (
|
||||
id INTEGER PRIMARY KEY,
|
||||
gitlab_id INTEGER UNIQUE NOT NULL,
|
||||
project_id INTEGER NOT NULL REFERENCES projects(id),
|
||||
iid INTEGER NOT NULL,
|
||||
title TEXT,
|
||||
description TEXT,
|
||||
state TEXT,
|
||||
draft INTEGER NOT NULL DEFAULT 0,
|
||||
author_username TEXT,
|
||||
source_branch TEXT,
|
||||
target_branch TEXT,
|
||||
head_sha TEXT,
|
||||
references_short TEXT,
|
||||
references_full TEXT,
|
||||
detailed_merge_status TEXT,
|
||||
merge_user_username TEXT,
|
||||
created_at INTEGER,
|
||||
updated_at INTEGER,
|
||||
merged_at INTEGER,
|
||||
closed_at INTEGER,
|
||||
last_seen_at INTEGER NOT NULL,
|
||||
discussions_synced_for_updated_at INTEGER,
|
||||
discussions_sync_last_attempt_at INTEGER,
|
||||
discussions_sync_attempts INTEGER DEFAULT 0,
|
||||
discussions_sync_last_error TEXT,
|
||||
resource_events_synced_for_updated_at INTEGER,
|
||||
web_url TEXT,
|
||||
raw_payload_id INTEGER
|
||||
);
|
||||
CREATE TABLE mr_labels (
|
||||
merge_request_id INTEGER REFERENCES merge_requests(id),
|
||||
label_id INTEGER REFERENCES labels(id),
|
||||
PRIMARY KEY(merge_request_id, label_id)
|
||||
);
|
||||
CREATE TABLE discussions (
|
||||
id INTEGER PRIMARY KEY,
|
||||
gitlab_discussion_id TEXT NOT NULL,
|
||||
project_id INTEGER NOT NULL REFERENCES projects(id),
|
||||
issue_id INTEGER REFERENCES issues(id),
|
||||
merge_request_id INTEGER,
|
||||
noteable_type TEXT NOT NULL,
|
||||
individual_note INTEGER NOT NULL DEFAULT 0,
|
||||
first_note_at INTEGER,
|
||||
last_note_at INTEGER,
|
||||
last_seen_at INTEGER NOT NULL,
|
||||
resolvable INTEGER NOT NULL DEFAULT 0,
|
||||
resolved INTEGER NOT NULL DEFAULT 0
|
||||
);
|
||||
CREATE TABLE notes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
gitlab_id INTEGER UNIQUE NOT NULL,
|
||||
discussion_id INTEGER NOT NULL REFERENCES discussions(id),
|
||||
project_id INTEGER NOT NULL REFERENCES projects(id),
|
||||
note_type TEXT,
|
||||
is_system INTEGER NOT NULL DEFAULT 0,
|
||||
author_username TEXT,
|
||||
body TEXT,
|
||||
created_at INTEGER NOT NULL,
|
||||
updated_at INTEGER NOT NULL,
|
||||
last_seen_at INTEGER NOT NULL,
|
||||
position INTEGER,
|
||||
resolvable INTEGER NOT NULL DEFAULT 0,
|
||||
resolved INTEGER NOT NULL DEFAULT 0,
|
||||
resolved_by TEXT,
|
||||
resolved_at INTEGER,
|
||||
position_old_path TEXT,
|
||||
position_new_path TEXT,
|
||||
position_old_line INTEGER,
|
||||
position_new_line INTEGER,
|
||||
raw_payload_id INTEGER
|
||||
);
|
||||
",
|
||||
)
|
||||
.unwrap();
|
||||
conn
|
||||
}
|
||||
|
||||
// A dirty, non-system note (is_system = 0) attached to an issue discussion is
// regenerated: the run must report exactly one regenerated document, and the
// stored row must have source_type "note" with content containing the
// `[[Note]]` marker and the `author: @bob` line.
#[test]
|
||||
fn test_regenerate_note_document() {
|
||||
let conn = setup_note_db();
|
||||
// Parent chain for the note: issue 1 -> discussion 1 -> note 1.
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Test Issue', 'opened', 'alice', 1000, 2000, 3000, 'https://example.com/issues/42')",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'This is a note', 1000, 2000, 3000, 0)",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
// Queue the note, then drain the dirty queue.
mark_dirty(&conn, SourceType::Note, 1).unwrap();
|
||||
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
||||
assert_eq!(result.regenerated, 1);
|
||||
assert_eq!(result.unchanged, 0);
|
||||
assert_eq!(result.errored, 0);
|
||||
|
||||
// Only one document exists in this test, so filtering on source_id alone is enough.
let (source_type, content): (String, String) = conn
|
||||
.query_row(
|
||||
"SELECT source_type, content_text FROM documents WHERE source_id = 1",
|
||||
[],
|
||||
|r| Ok((r.get(0)?, r.get(1)?)),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(source_type, "note");
|
||||
assert!(content.contains("[[Note]]"));
|
||||
assert!(content.contains("author: @bob"));
|
||||
}
|
||||
|
||||
// A dirty note flagged is_system = 1 that already has a stored 'note'
// document: the run is expected to count as one regeneration and to leave
// zero 'note' documents in the table afterwards.
#[test]
|
||||
fn test_regenerate_note_system_note_deletes() {
|
||||
let conn = setup_note_db();
|
||||
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
// The trailing 1 in VALUES sets is_system, marking this as a system note.
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bot', 'assigned to @alice', 1000, 2000, 3000, 1)",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
// Pre-insert a document for this note (simulating a previously-generated doc)
|
||||
conn.execute(
|
||||
"INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash) VALUES ('note', 1, 1, 'old content', 'oldhash')",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
mark_dirty(&conn, SourceType::Note, 1).unwrap();
|
||||
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
||||
// The removal of the stale document is expected to be counted under `regenerated`.
assert_eq!(result.regenerated, 1);
|
||||
|
||||
let count: i64 = conn
|
||||
.query_row(
|
||||
"SELECT COUNT(*) FROM documents WHERE source_type = 'note'",
|
||||
[],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(count, 0);
|
||||
}
|
||||
|
||||
// Regenerating the same note twice without modifying it in between: the first
// pass must report regenerated = 1, the second must report unchanged = 1 and
// regenerated = 0.
#[test]
|
||||
fn test_regenerate_note_unchanged() {
|
||||
let conn = setup_note_db();
|
||||
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000, 'https://example.com/issues/42')",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'Some note', 1000, 2000, 3000, 0)",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
// First pass: no document exists yet for the note.
mark_dirty(&conn, SourceType::Note, 1).unwrap();
|
||||
let r1 = regenerate_dirty_documents(&conn, None).unwrap();
|
||||
assert_eq!(r1.regenerated, 1);
|
||||
|
||||
// Second pass: the note is re-queued but nothing about it was modified.
mark_dirty(&conn, SourceType::Note, 1).unwrap();
|
||||
let r2 = regenerate_dirty_documents(&conn, None).unwrap();
|
||||
assert_eq!(r2.unchanged, 1);
|
||||
assert_eq!(r2.regenerated, 0);
|
||||
}
|
||||
|
||||
// Ten dirty notes that share one discussion and one parent issue are
// regenerated in a single call: expects 10 regenerated, 0 errored, and 10
// stored 'note' documents.
// NOTE(review): despite the name, the assertions only check counts; cache
// hits themselves are not observed by this test.
#[test]
|
||||
fn test_note_regeneration_batch_uses_cache() {
|
||||
let conn = setup_note_db();
|
||||
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Shared Issue', 'opened', 'alice', 1000, 2000, 3000, 'https://example.com/issues/42')",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
// Notes 1..=10 all point at discussion 1; each gets gitlab_id i * 100 and is queued immediately.
for i in 1..=10 {
|
||||
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (?1, ?2, 1, 1, 'bob', ?3, 1000, 2000, 3000, 0)",
|
||||
rusqlite::params![i, i * 100, format!("Note body {}", i)],
|
||||
).unwrap();
|
||||
mark_dirty(&conn, SourceType::Note, i).unwrap();
|
||||
}
|
||||
|
||||
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
||||
assert_eq!(result.regenerated, 10);
|
||||
assert_eq!(result.errored, 0);
|
||||
|
||||
let count: i64 = conn
|
||||
.query_row(
|
||||
"SELECT COUNT(*) FROM documents WHERE source_type = 'note'",
|
||||
[],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(count, 10);
|
||||
}
|
||||
|
||||
// The cached extraction path (`extract_note_document_cached`) must produce the
// same document as the direct path (`extract_note_document`) for the same
// note: content text, all three hashes, labels, title, url and author are
// compared field by field.
#[test]
|
||||
fn test_note_regeneration_cache_consistent_with_direct_extraction() {
|
||||
let conn = setup_note_db();
|
||||
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Consistency Check', 'opened', 'alice', 1000, 2000, 3000, 'https://example.com/issues/42')",
|
||||
[],
|
||||
).unwrap();
|
||||
// Attach a label to the parent issue so the label fields compared below are non-trivial.
conn.execute(
|
||||
"INSERT INTO labels (id, project_id, name) VALUES (1, 1, 'backend')",
|
||||
[],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO issue_labels (issue_id, label_id) VALUES (1, 1)",
|
||||
[],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'Some content', 1000, 2000, 3000, 0)",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
// Direct path: no cache argument.
use crate::documents::extract_note_document;
|
||||
let direct = extract_note_document(&conn, 1).unwrap().unwrap();
|
||||
|
||||
// Cached path, starting from a freshly created cache.
let mut cache = ParentMetadataCache::new();
|
||||
let cached = extract_note_document_cached(&conn, 1, &mut cache)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(direct.content_text, cached.content_text);
|
||||
assert_eq!(direct.content_hash, cached.content_hash);
|
||||
assert_eq!(direct.labels, cached.labels);
|
||||
assert_eq!(direct.labels_hash, cached.labels_hash);
|
||||
assert_eq!(direct.paths_hash, cached.paths_hash);
|
||||
assert_eq!(direct.title, cached.title);
|
||||
assert_eq!(direct.url, cached.url);
|
||||
assert_eq!(direct.author_username, cached.author_username);
|
||||
}
|
||||
|
||||
// Two notes under different parent issues are regenerated in one batch: each
// stored document must carry its own parent's iid and title (42 / Issue Alpha
// for note 1, 99 / Issue Beta for note 2), so parent metadata from one note
// must not leak into the other.
#[test]
|
||||
fn test_note_regeneration_cache_invalidates_across_parents() {
|
||||
let conn = setup_note_db();
|
||||
// Two independent chains: issue 1 -> discussion 1 -> note 1, and issue 2 -> discussion 2 -> note 2.
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Issue Alpha', 'opened', 1000, 2000, 3000, 'https://example.com/issues/42')",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at, web_url) VALUES (2, 20, 1, 99, 'Issue Beta', 'opened', 1000, 2000, 3000, 'https://example.com/issues/99')",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (2, 'disc_2', 1, 2, 'Issue', 3000)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'Alpha note', 1000, 2000, 3000, 0)",
|
||||
[],
|
||||
).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (2, 200, 2, 1, 'alice', 'Beta note', 1000, 2000, 3000, 0)",
|
||||
[],
|
||||
).unwrap();
|
||||
|
||||
// Both notes are queued before a single regeneration call.
mark_dirty(&conn, SourceType::Note, 1).unwrap();
|
||||
mark_dirty(&conn, SourceType::Note, 2).unwrap();
|
||||
|
||||
let result = regenerate_dirty_documents(&conn, None).unwrap();
|
||||
assert_eq!(result.regenerated, 2);
|
||||
assert_eq!(result.errored, 0);
|
||||
|
||||
let alpha_content: String = conn
|
||||
.query_row(
|
||||
"SELECT content_text FROM documents WHERE source_type = 'note' AND source_id = 1",
|
||||
[],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.unwrap();
|
||||
let beta_content: String = conn
|
||||
.query_row(
|
||||
"SELECT content_text FROM documents WHERE source_type = 'note' AND source_id = 2",
|
||||
[],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Each document must reference its own parent issue.
assert!(alpha_content.contains("parent_iid: 42"));
|
||||
assert!(alpha_content.contains("parent_title: Issue Alpha"));
|
||||
assert!(beta_content.contains("parent_iid: 99"));
|
||||
assert!(beta_content.contains("parent_title: Issue Beta"));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user