Automated formatting and lint corrections from parallel agent work: - cargo fmt: import reordering (alphabetical), line wrapping to respect max width, trailing comma normalization, destructuring alignment, function signature reformatting, match arm formatting - clippy (pedantic): Range::contains() instead of manual comparisons, i64::from() instead of `as i64` casts, .clamp() instead of .max().min() chains, let-chain refactors (if-let with &&), #[allow(clippy::too_many_arguments)] and #[allow(clippy::field_reassign_with_default)] where warranted - Removed trailing blank lines and extra whitespace No behavioral changes. All existing tests pass unmodified. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
197 lines
5.8 KiB
Rust
197 lines
5.8 KiB
Rust
//! Generate searchable documents from ingested GitLab data.
|
|
|
|
use console::style;
|
|
use rusqlite::Connection;
|
|
use serde::Serialize;
|
|
use tracing::info;
|
|
|
|
use crate::Config;
|
|
use crate::core::db::create_connection;
|
|
use crate::core::error::Result;
|
|
use crate::core::paths::get_db_path;
|
|
use crate::core::project::resolve_project;
|
|
use crate::documents::{SourceType, regenerate_dirty_documents};
|
|
|
|
/// Maximum number of source rows handled per seeding batch in `seed_dirty`
/// (bound as the SQL `LIMIT` parameter, hence `i64`).
const FULL_MODE_CHUNK_SIZE: i64 = 2000;
|
|
|
|
/// Summary counters produced by one `run_generate_docs` invocation.
#[derive(Debug, Default)]
pub struct GenerateDocsResult {
    /// Count of documents reported as regenerated by the regeneration step.
    pub regenerated: usize,
    /// Count of documents the regeneration step reported as unchanged.
    pub unchanged: usize,
    /// Count of documents the regeneration step reported as errored.
    pub errored: usize,
    /// Rows newly queued into `dirty_sources` during full-mode seeding
    /// (stays 0 in incremental mode).
    pub seeded: usize,
    /// Whether the run was requested in full mode.
    pub full_mode: bool,
}
|
|
|
|
/// Run the generate-docs pipeline.
|
|
///
|
|
/// Default mode: process only existing dirty_sources entries.
|
|
/// Full mode: seed dirty_sources with ALL entities, then drain.
|
|
pub fn run_generate_docs(
|
|
config: &Config,
|
|
full: bool,
|
|
project_filter: Option<&str>,
|
|
) -> Result<GenerateDocsResult> {
|
|
let db_path = get_db_path(config.storage.db_path.as_deref());
|
|
let conn = create_connection(&db_path)?;
|
|
let mut result = GenerateDocsResult {
|
|
full_mode: full,
|
|
..Default::default()
|
|
};
|
|
|
|
if full {
|
|
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
|
|
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
|
|
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
|
|
}
|
|
|
|
let regen = regenerate_dirty_documents(&conn)?;
|
|
result.regenerated = regen.regenerated;
|
|
result.unchanged = regen.unchanged;
|
|
result.errored = regen.errored;
|
|
|
|
if full {
|
|
// Optimize FTS index after bulk rebuild
|
|
let _ = conn.execute(
|
|
"INSERT INTO documents_fts(documents_fts) VALUES('optimize')",
|
|
[],
|
|
);
|
|
info!("FTS index optimized after full rebuild");
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Seed dirty_sources with all entities of the given type using keyset pagination.
|
|
fn seed_dirty(
|
|
conn: &Connection,
|
|
source_type: SourceType,
|
|
project_filter: Option<&str>,
|
|
) -> Result<usize> {
|
|
let table = match source_type {
|
|
SourceType::Issue => "issues",
|
|
SourceType::MergeRequest => "merge_requests",
|
|
SourceType::Discussion => "discussions",
|
|
};
|
|
let type_str = source_type.as_str();
|
|
let now = chrono::Utc::now().timestamp_millis();
|
|
|
|
let mut total_seeded: usize = 0;
|
|
let mut last_id: i64 = 0;
|
|
|
|
loop {
|
|
let inserted = if let Some(project) = project_filter {
|
|
let project_id = resolve_project(conn, project)?;
|
|
|
|
conn.execute(
|
|
&format!(
|
|
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
|
|
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
|
|
FROM {table} WHERE id > ?3 AND project_id = ?4 ORDER BY id LIMIT ?5
|
|
ON CONFLICT(source_type, source_id) DO NOTHING"
|
|
),
|
|
rusqlite::params![type_str, now, last_id, project_id, FULL_MODE_CHUNK_SIZE],
|
|
)?
|
|
} else {
|
|
conn.execute(
|
|
&format!(
|
|
"INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
|
|
SELECT ?1, id, ?2, 0, NULL, NULL, NULL
|
|
FROM {table} WHERE id > ?3 ORDER BY id LIMIT ?4
|
|
ON CONFLICT(source_type, source_id) DO NOTHING"
|
|
),
|
|
rusqlite::params![type_str, now, last_id, FULL_MODE_CHUNK_SIZE],
|
|
)?
|
|
};
|
|
|
|
if inserted == 0 {
|
|
break;
|
|
}
|
|
|
|
// Advance keyset cursor to the max id within the chunk window
|
|
let max_id: i64 = conn.query_row(
|
|
&format!(
|
|
"SELECT MAX(id) FROM (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)",
|
|
table = table
|
|
),
|
|
rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
|
|
|row| row.get(0),
|
|
)?;
|
|
|
|
total_seeded += inserted;
|
|
last_id = max_id;
|
|
}
|
|
|
|
info!(
|
|
source_type = type_str,
|
|
seeded = total_seeded,
|
|
"Seeded dirty_sources"
|
|
);
|
|
|
|
Ok(total_seeded)
|
|
}
|
|
|
|
/// Print human-readable output.
|
|
pub fn print_generate_docs(result: &GenerateDocsResult) {
|
|
let mode = if result.full_mode {
|
|
"full"
|
|
} else {
|
|
"incremental"
|
|
};
|
|
println!(
|
|
"{} Document generation complete ({})",
|
|
style("done").green().bold(),
|
|
mode
|
|
);
|
|
|
|
if result.full_mode {
|
|
println!(" Seeded: {}", result.seeded);
|
|
}
|
|
println!(" Regenerated: {}", result.regenerated);
|
|
println!(" Unchanged: {}", result.unchanged);
|
|
if result.errored > 0 {
|
|
println!(" Errored: {}", style(result.errored).red());
|
|
}
|
|
}
|
|
|
|
/// JSON output structures.
|
|
#[derive(Serialize)]
|
|
struct GenerateDocsJsonOutput {
|
|
ok: bool,
|
|
data: GenerateDocsJsonData,
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct GenerateDocsJsonData {
|
|
mode: String,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
seeded: Option<usize>,
|
|
regenerated: usize,
|
|
unchanged: usize,
|
|
errored: usize,
|
|
}
|
|
|
|
/// Print JSON robot-mode output.
|
|
pub fn print_generate_docs_json(result: &GenerateDocsResult) {
|
|
let output = GenerateDocsJsonOutput {
|
|
ok: true,
|
|
data: GenerateDocsJsonData {
|
|
mode: if result.full_mode {
|
|
"full".to_string()
|
|
} else {
|
|
"incremental".to_string()
|
|
},
|
|
seeded: if result.full_mode {
|
|
Some(result.seeded)
|
|
} else {
|
|
None
|
|
},
|
|
regenerated: result.regenerated,
|
|
unchanged: result.unchanged,
|
|
errored: result.errored,
|
|
},
|
|
};
|
|
println!("{}", serde_json::to_string(&output).unwrap());
|
|
}
|