use super::*; use crate::ingestion::dirty_tracker::mark_dirty; fn setup_db() -> Connection { let conn = Connection::open_in_memory().unwrap(); conn.execute_batch(" CREATE TABLE projects ( id INTEGER PRIMARY KEY, gitlab_project_id INTEGER UNIQUE NOT NULL, path_with_namespace TEXT NOT NULL, default_branch TEXT, web_url TEXT, created_at INTEGER, updated_at INTEGER, raw_payload_id INTEGER ); INSERT INTO projects (id, gitlab_project_id, path_with_namespace) VALUES (1, 100, 'group/project'); CREATE TABLE issues ( id INTEGER PRIMARY KEY, gitlab_id INTEGER UNIQUE NOT NULL, project_id INTEGER NOT NULL REFERENCES projects(id), iid INTEGER NOT NULL, title TEXT, description TEXT, state TEXT NOT NULL, author_username TEXT, created_at INTEGER NOT NULL, updated_at INTEGER NOT NULL, last_seen_at INTEGER NOT NULL, discussions_synced_for_updated_at INTEGER, resource_events_synced_for_updated_at INTEGER, web_url TEXT, raw_payload_id INTEGER ); CREATE TABLE labels ( id INTEGER PRIMARY KEY, gitlab_id INTEGER, project_id INTEGER NOT NULL REFERENCES projects(id), name TEXT NOT NULL, color TEXT, description TEXT ); CREATE TABLE issue_labels ( issue_id INTEGER NOT NULL REFERENCES issues(id), label_id INTEGER NOT NULL REFERENCES labels(id), PRIMARY KEY(issue_id, label_id) ); CREATE TABLE documents ( id INTEGER PRIMARY KEY, source_type TEXT NOT NULL, source_id INTEGER NOT NULL, project_id INTEGER NOT NULL, author_username TEXT, label_names TEXT, created_at INTEGER, updated_at INTEGER, url TEXT, title TEXT, content_text TEXT NOT NULL, content_hash TEXT NOT NULL, labels_hash TEXT NOT NULL DEFAULT '', paths_hash TEXT NOT NULL DEFAULT '', is_truncated INTEGER NOT NULL DEFAULT 0, truncated_reason TEXT, UNIQUE(source_type, source_id) ); CREATE TABLE document_labels ( document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE, label_name TEXT NOT NULL, PRIMARY KEY(document_id, label_name) ); CREATE TABLE document_paths ( document_id INTEGER NOT NULL REFERENCES documents(id) ON DELETE CASCADE, path TEXT NOT NULL, PRIMARY KEY(document_id, path) ); CREATE TABLE dirty_sources ( source_type TEXT NOT NULL, source_id INTEGER NOT NULL, queued_at INTEGER NOT NULL, attempt_count INTEGER NOT NULL DEFAULT 0, last_attempt_at INTEGER, last_error TEXT, next_attempt_at INTEGER, PRIMARY KEY(source_type, source_id) ); CREATE INDEX idx_dirty_sources_next_attempt ON dirty_sources(next_attempt_at); ").unwrap(); conn } #[test] fn test_regenerate_creates_document() { let conn = setup_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test Issue', 'Description here', 'opened', 'alice', 1000, 2000, 3000)", [], ).unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 1); assert_eq!(result.unchanged, 0); assert_eq!(result.errored, 0); let count: i64 = conn .query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0)) .unwrap(); assert_eq!(count, 1); let content: String = conn .query_row("SELECT content_text FROM documents", [], |r| r.get(0)) .unwrap(); assert!(content.contains("[[Issue]] #42: Test Issue")); } #[test] fn test_regenerate_unchanged() { let conn = setup_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, description, state, author_username, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'Desc', 'opened', 'alice', 1000, 2000, 3000)", [], ).unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); let r1 = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(r1.regenerated, 1); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); let r2 = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(r2.unchanged, 1); assert_eq!(r2.regenerated, 0); } #[test] fn test_regenerate_deleted_source() { let conn = setup_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)", [], ).unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); regenerate_dirty_documents(&conn, None).unwrap(); conn.execute("PRAGMA foreign_keys = OFF", []).unwrap(); conn.execute("DELETE FROM issues WHERE id = 1", []).unwrap(); conn.execute("PRAGMA foreign_keys = ON", []).unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 1); let count: i64 = conn .query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0)) .unwrap(); assert_eq!(count, 0); } #[test] fn test_regenerate_drains_queue() { let conn = setup_db(); for i in 1..=10 { conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (?1, ?2, 1, ?1, 'Test', 'opened', 1000, 2000, 3000)", rusqlite::params![i, i * 10], ).unwrap(); mark_dirty(&conn, SourceType::Issue, i).unwrap(); } let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 10); let dirty = get_dirty_sources(&conn).unwrap(); assert!(dirty.is_empty()); } #[test] fn test_triple_hash_fast_path() { let conn = setup_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)", [], ).unwrap(); conn.execute( "INSERT INTO labels (id, project_id, name) VALUES (1, 1, 'bug')", [], ) .unwrap(); conn.execute( "INSERT INTO issue_labels (issue_id, label_id) VALUES (1, 1)", [], ) .unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); regenerate_dirty_documents(&conn, None).unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.unchanged, 1); let label_count: i64 = conn .query_row("SELECT COUNT(*) FROM document_labels", [], |r| r.get(0)) .unwrap(); assert_eq!(label_count, 1); } fn setup_note_db() -> Connection { let conn = setup_db(); conn.execute_batch( " CREATE TABLE merge_requests ( id INTEGER PRIMARY KEY, gitlab_id INTEGER UNIQUE NOT NULL, project_id INTEGER NOT NULL REFERENCES projects(id), iid INTEGER NOT NULL, title TEXT, description TEXT, state TEXT, draft INTEGER NOT NULL DEFAULT 0, author_username TEXT, source_branch TEXT, target_branch TEXT, head_sha TEXT, references_short TEXT, references_full TEXT, detailed_merge_status TEXT, merge_user_username TEXT, created_at INTEGER, updated_at INTEGER, merged_at INTEGER, closed_at INTEGER, last_seen_at INTEGER NOT NULL, discussions_synced_for_updated_at INTEGER, discussions_sync_last_attempt_at INTEGER, discussions_sync_attempts INTEGER DEFAULT 0, discussions_sync_last_error TEXT, resource_events_synced_for_updated_at INTEGER, web_url TEXT, raw_payload_id INTEGER ); CREATE TABLE mr_labels ( merge_request_id INTEGER REFERENCES merge_requests(id), label_id INTEGER REFERENCES labels(id), PRIMARY KEY(merge_request_id, label_id) ); CREATE TABLE discussions ( id INTEGER PRIMARY KEY, gitlab_discussion_id TEXT NOT NULL, project_id INTEGER NOT NULL REFERENCES projects(id), issue_id INTEGER REFERENCES issues(id), merge_request_id INTEGER, noteable_type TEXT NOT NULL, individual_note INTEGER NOT NULL DEFAULT 0, first_note_at INTEGER, last_note_at INTEGER, last_seen_at INTEGER NOT NULL, resolvable INTEGER NOT NULL DEFAULT 0, resolved INTEGER NOT NULL DEFAULT 0 ); CREATE TABLE notes ( id INTEGER PRIMARY KEY, gitlab_id INTEGER UNIQUE NOT NULL, discussion_id INTEGER NOT NULL REFERENCES discussions(id), project_id INTEGER NOT NULL REFERENCES projects(id), note_type TEXT, is_system INTEGER NOT NULL DEFAULT 0, author_username TEXT, body TEXT, created_at INTEGER NOT NULL, updated_at INTEGER NOT NULL, last_seen_at INTEGER NOT NULL, position INTEGER, resolvable INTEGER NOT NULL DEFAULT 0, resolved INTEGER NOT NULL DEFAULT 0, resolved_by TEXT, resolved_at INTEGER, position_old_path TEXT, position_new_path TEXT, position_old_line INTEGER, position_new_line INTEGER, raw_payload_id INTEGER ); ", ) .unwrap(); conn } #[test] fn test_regenerate_note_document() { let conn = setup_note_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Test Issue', 'opened', 'alice', 1000, 2000, 3000, 'https://example.com/issues/42')", [], ).unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)", [], ).unwrap(); conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'This is a note', 1000, 2000, 3000, 0)", [], ).unwrap(); mark_dirty(&conn, SourceType::Note, 1).unwrap(); let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 1); assert_eq!(result.unchanged, 0); assert_eq!(result.errored, 0); let (source_type, content): (String, String) = conn .query_row( "SELECT source_type, content_text FROM documents WHERE source_id = 1", [], |r| Ok((r.get(0)?, r.get(1)?)), ) .unwrap(); assert_eq!(source_type, "note"); assert!(content.contains("[[Note]]")); assert!(content.contains("author: @bob")); } #[test] fn test_regenerate_note_system_note_deletes() { let conn = setup_note_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000)", [], ).unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)", [], ).unwrap(); conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bot', 'assigned to @alice', 1000, 2000, 3000, 1)", [], ).unwrap(); // Pre-insert a document for this note (simulating a previously-generated doc) conn.execute( "INSERT INTO documents (source_type, source_id, project_id, content_text, content_hash) VALUES ('note', 1, 1, 'old content', 'oldhash')", [], ).unwrap(); mark_dirty(&conn, SourceType::Note, 1).unwrap(); let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 1); let count: i64 = conn .query_row( "SELECT COUNT(*) FROM documents WHERE source_type = 'note'", [], |r| r.get(0), ) .unwrap(); assert_eq!(count, 0); } #[test] fn test_regenerate_note_unchanged() { let conn = setup_note_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Test', 'opened', 1000, 2000, 3000, 'https://example.com/issues/42')", [], ).unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)", [], ).unwrap(); conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'Some note', 1000, 2000, 3000, 0)", [], ).unwrap(); mark_dirty(&conn, SourceType::Note, 1).unwrap(); let r1 = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(r1.regenerated, 1); mark_dirty(&conn, SourceType::Note, 1).unwrap(); let r2 = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(r2.unchanged, 1); assert_eq!(r2.regenerated, 0); } #[test] fn test_note_regeneration_batch_uses_cache() { let conn = setup_note_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Shared Issue', 'opened', 'alice', 1000, 2000, 3000, 'https://example.com/issues/42')", [], ).unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)", [], ).unwrap(); for i in 1..=10 { conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (?1, ?2, 1, 1, 'bob', ?3, 1000, 2000, 3000, 0)", rusqlite::params![i, i * 100, format!("Note body {}", i)], ).unwrap(); mark_dirty(&conn, SourceType::Note, i).unwrap(); } let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 10); assert_eq!(result.errored, 0); let count: i64 = conn .query_row( "SELECT COUNT(*) FROM documents WHERE source_type = 'note'", [], |r| r.get(0), ) .unwrap(); assert_eq!(count, 10); } #[test] fn test_note_regeneration_cache_consistent_with_direct_extraction() { let conn = setup_note_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, author_username, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Consistency Check', 'opened', 'alice', 1000, 2000, 3000, 'https://example.com/issues/42')", [], ).unwrap(); conn.execute( "INSERT INTO labels (id, project_id, name) VALUES (1, 1, 'backend')", [], ) .unwrap(); conn.execute( "INSERT INTO issue_labels (issue_id, label_id) VALUES (1, 1)", [], ) .unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)", [], ).unwrap(); conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'Some content', 1000, 2000, 3000, 0)", [], ).unwrap(); use crate::documents::extract_note_document; let direct = extract_note_document(&conn, 1).unwrap().unwrap(); let mut cache = ParentMetadataCache::new(); let cached = extract_note_document_cached(&conn, 1, &mut cache) .unwrap() .unwrap(); assert_eq!(direct.content_text, cached.content_text); assert_eq!(direct.content_hash, cached.content_hash); assert_eq!(direct.labels, cached.labels); assert_eq!(direct.labels_hash, cached.labels_hash); assert_eq!(direct.paths_hash, cached.paths_hash); assert_eq!(direct.title, cached.title); assert_eq!(direct.url, cached.url); assert_eq!(direct.author_username, cached.author_username); } #[test] fn test_note_regeneration_cache_invalidates_across_parents() { let conn = setup_note_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at, web_url) VALUES (1, 10, 1, 42, 'Issue Alpha', 'opened', 1000, 2000, 3000, 'https://example.com/issues/42')", [], ).unwrap(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at, web_url) VALUES (2, 20, 1, 99, 'Issue Beta', 'opened', 1000, 2000, 3000, 'https://example.com/issues/99')", [], ).unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (1, 'disc_1', 1, 1, 'Issue', 3000)", [], ).unwrap(); conn.execute( "INSERT INTO discussions (id, gitlab_discussion_id, project_id, issue_id, noteable_type, last_seen_at) VALUES (2, 'disc_2', 1, 2, 'Issue', 3000)", [], ).unwrap(); conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (1, 100, 1, 1, 'bob', 'Alpha note', 1000, 2000, 3000, 0)", [], ).unwrap(); conn.execute( "INSERT INTO notes (id, gitlab_id, discussion_id, project_id, author_username, body, created_at, updated_at, last_seen_at, is_system) VALUES (2, 200, 2, 1, 'alice', 'Beta note', 1000, 2000, 3000, 0)", [], ).unwrap(); mark_dirty(&conn, SourceType::Note, 1).unwrap(); mark_dirty(&conn, SourceType::Note, 2).unwrap(); let result = regenerate_dirty_documents(&conn, None).unwrap(); assert_eq!(result.regenerated, 2); assert_eq!(result.errored, 0); let alpha_content: String = conn .query_row( "SELECT content_text FROM documents WHERE source_type = 'note' AND source_id = 1", [], |r| r.get(0), ) .unwrap(); let beta_content: String = conn .query_row( "SELECT content_text FROM documents WHERE source_type = 'note' AND source_id = 2", [], |r| r.get(0), ) .unwrap(); assert!(alpha_content.contains("parent_iid: 42")); assert!(alpha_content.contains("parent_title: Issue Alpha")); assert!(beta_content.contains("parent_iid: 99")); assert!(beta_content.contains("parent_title: Issue Beta")); } #[test] fn test_scoped_regen_only_processes_specified_sources() { let conn = setup_db(); // Insert two issues conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'First Issue', 'opened', 1000, 2000, 3000)", [], ).unwrap(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (2, 20, 1, 43, 'Second Issue', 'opened', 1000, 2000, 3000)", [], ).unwrap(); // Mark both dirty mark_dirty(&conn, SourceType::Issue, 1).unwrap(); mark_dirty(&conn, SourceType::Issue, 2).unwrap(); // Regenerate only issue 1 let result = regenerate_dirty_documents_for_sources(&conn, &[(SourceType::Issue, 1)]).unwrap(); assert_eq!(result.regenerated, 1); assert_eq!(result.errored, 0); // Issue 1 should be regenerated and cleared from dirty let doc_count: i64 = conn .query_row( "SELECT COUNT(*) FROM documents WHERE source_type = 'issue' AND source_id = 1", [], |r| r.get(0), ) .unwrap(); assert_eq!(doc_count, 1); // Issue 2 should still be dirty let dirty_count: i64 = conn .query_row( "SELECT COUNT(*) FROM dirty_sources WHERE source_type = 'issue' AND source_id = 2", [], |r| r.get(0), ) .unwrap(); assert_eq!(dirty_count, 1); } #[test] fn test_scoped_regen_returns_document_ids() { let conn = setup_db(); conn.execute( "INSERT INTO issues (id, gitlab_id, project_id, iid, title, state, created_at, updated_at, last_seen_at) VALUES (1, 10, 1, 42, 'Test Issue', 'opened', 1000, 2000, 3000)", [], ).unwrap(); mark_dirty(&conn, SourceType::Issue, 1).unwrap(); let result = regenerate_dirty_documents_for_sources(&conn, &[(SourceType::Issue, 1)]).unwrap(); assert_eq!(result.document_ids.len(), 1); // Verify returned ID matches the actual document let actual_id: i64 = conn .query_row( "SELECT id FROM documents WHERE source_type = 'issue' AND source_id = 1", [], |r| r.get(0), ) .unwrap(); assert_eq!(result.document_ids[0], actual_id); } #[test] fn test_scoped_regen_handles_missing_source() { let conn = setup_db(); // Don't insert any issues — source_id 999 doesn't exist // But mark it dirty so the function tries to process it mark_dirty(&conn, SourceType::Issue, 999).unwrap(); let result = regenerate_dirty_documents_for_sources(&conn, &[(SourceType::Issue, 999)]).unwrap(); // Source doesn't exist, so regenerate_one returns Ok(true) deleting the doc. // No document_id to collect since there's nothing in the documents table. assert_eq!(result.regenerated, 1); assert_eq!(result.errored, 0); assert!(result.document_ids.is_empty()); }