//! Generate searchable documents from ingested GitLab data. use console::style; use rusqlite::Connection; use serde::Serialize; use tracing::info; use crate::Config; use crate::core::db::create_connection; use crate::core::error::Result; use crate::core::paths::get_db_path; use crate::core::project::resolve_project; use crate::documents::{SourceType, regenerate_dirty_documents}; const FULL_MODE_CHUNK_SIZE: i64 = 2000; /// Result of a generate-docs run. #[derive(Debug, Default)] pub struct GenerateDocsResult { pub regenerated: usize, pub unchanged: usize, pub errored: usize, pub seeded: usize, pub full_mode: bool, } /// Run the generate-docs pipeline. /// /// Default mode: process only existing dirty_sources entries. /// Full mode: seed dirty_sources with ALL entities, then drain. pub fn run_generate_docs( config: &Config, full: bool, project_filter: Option<&str>, ) -> Result { let db_path = get_db_path(config.storage.db_path.as_deref()); let conn = create_connection(&db_path)?; let mut result = GenerateDocsResult { full_mode: full, ..Default::default() }; if full { result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?; result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?; result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?; } let regen = regenerate_dirty_documents(&conn)?; result.regenerated = regen.regenerated; result.unchanged = regen.unchanged; result.errored = regen.errored; if full { // Optimize FTS index after bulk rebuild let _ = conn.execute( "INSERT INTO documents_fts(documents_fts) VALUES('optimize')", [], ); info!("FTS index optimized after full rebuild"); } Ok(result) } /// Seed dirty_sources with all entities of the given type using keyset pagination. fn seed_dirty( conn: &Connection, source_type: SourceType, project_filter: Option<&str>, ) -> Result { let table = match source_type { SourceType::Issue => "issues", SourceType::MergeRequest => "merge_requests", SourceType::Discussion => "discussions", }; let type_str = source_type.as_str(); let now = chrono::Utc::now().timestamp_millis(); let mut total_seeded: usize = 0; let mut last_id: i64 = 0; loop { let inserted = if let Some(project) = project_filter { let project_id = resolve_project(conn, project)?; conn.execute( &format!( "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at) SELECT ?1, id, ?2, 0, NULL, NULL, NULL FROM {table} WHERE id > ?3 AND project_id = ?4 ORDER BY id LIMIT ?5 ON CONFLICT(source_type, source_id) DO NOTHING" ), rusqlite::params![type_str, now, last_id, project_id, FULL_MODE_CHUNK_SIZE], )? } else { conn.execute( &format!( "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at) SELECT ?1, id, ?2, 0, NULL, NULL, NULL FROM {table} WHERE id > ?3 ORDER BY id LIMIT ?4 ON CONFLICT(source_type, source_id) DO NOTHING" ), rusqlite::params![type_str, now, last_id, FULL_MODE_CHUNK_SIZE], )? }; if inserted == 0 { break; } // Advance keyset cursor to the max id within the chunk window let max_id: i64 = conn.query_row( &format!( "SELECT MAX(id) FROM (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)", table = table ), rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE], |row| row.get(0), )?; total_seeded += inserted; last_id = max_id; } info!( source_type = type_str, seeded = total_seeded, "Seeded dirty_sources" ); Ok(total_seeded) } /// Print human-readable output. pub fn print_generate_docs(result: &GenerateDocsResult) { let mode = if result.full_mode { "full" } else { "incremental" }; println!( "{} Document generation complete ({})", style("done").green().bold(), mode ); if result.full_mode { println!(" Seeded: {}", result.seeded); } println!(" Regenerated: {}", result.regenerated); println!(" Unchanged: {}", result.unchanged); if result.errored > 0 { println!(" Errored: {}", style(result.errored).red()); } } /// JSON output structures. #[derive(Serialize)] struct GenerateDocsJsonOutput { ok: bool, data: GenerateDocsJsonData, } #[derive(Serialize)] struct GenerateDocsJsonData { mode: String, #[serde(skip_serializing_if = "Option::is_none")] seeded: Option, regenerated: usize, unchanged: usize, errored: usize, } /// Print JSON robot-mode output. pub fn print_generate_docs_json(result: &GenerateDocsResult) { let output = GenerateDocsJsonOutput { ok: true, data: GenerateDocsJsonData { mode: if result.full_mode { "full".to_string() } else { "incremental".to_string() }, seeded: if result.full_mode { Some(result.seeded) } else { None }, regenerated: result.regenerated, unchanged: result.unchanged, errored: result.errored, }, }; println!("{}", serde_json::to_string(&output).unwrap()); }