Files
gitlore/src/cli/commands/generate_docs.rs
Taylor Eernisse a50fc78823 style: Apply cargo fmt and clippy fixes across codebase
Automated formatting and lint corrections from parallel agent work:

- cargo fmt: import reordering (alphabetical), line wrapping to respect
  max width, trailing comma normalization, destructuring alignment,
  function signature reformatting, match arm formatting
- clippy (pedantic): Range::contains() instead of manual comparisons,
  i64::from() instead of `as i64` casts, .clamp() instead of
  .max().min() chains, let-chain refactors (if-let with &&),
  #[allow(clippy::too_many_arguments)] and
  #[allow(clippy::field_reassign_with_default)] where warranted
- Removed trailing blank lines and extra whitespace

No behavioral changes. All existing tests pass unmodified.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 13:01:59 -05:00

197 lines
5.8 KiB
Rust

//! Generate searchable documents from ingested GitLab data.
use console::style;
use rusqlite::Connection;
use serde::Serialize;
use tracing::info;
use crate::Config;
use crate::core::db::create_connection;
use crate::core::error::Result;
use crate::core::paths::get_db_path;
use crate::core::project::resolve_project;
use crate::documents::{SourceType, regenerate_dirty_documents};
/// Maximum number of source rows seeded into `dirty_sources` per chunk in full mode.
///
/// Bound as the SQL `LIMIT` parameter by `seed_dirty`, hence the `i64` type.
const FULL_MODE_CHUNK_SIZE: i64 = 2000;
/// Result of a generate-docs run.
///
/// All counters start at zero and `full_mode` at `false` through the derived
/// `Default`; `run_generate_docs` fills them in.
#[derive(Debug, Default)]
pub struct GenerateDocsResult {
/// Count copied from the `regenerated` field reported by `regenerate_dirty_documents`.
pub regenerated: usize,
/// Count copied from the `unchanged` field reported by `regenerate_dirty_documents`.
pub unchanged: usize,
/// Count copied from the `errored` field reported by `regenerate_dirty_documents`.
pub errored: usize,
/// Total rows queued into `dirty_sources` by full-mode seeding (summed over
/// issues, merge requests and discussions); stays 0 in default mode.
pub seeded: usize,
/// Whether the run was requested in full mode.
pub full_mode: bool,
}
/// Run the generate-docs pipeline.
///
/// Default mode: process only existing dirty_sources entries.
/// Full mode: seed dirty_sources with ALL entities, then drain.
pub fn run_generate_docs(
config: &Config,
full: bool,
project_filter: Option<&str>,
) -> Result<GenerateDocsResult> {
let db_path = get_db_path(config.storage.db_path.as_deref());
let conn = create_connection(&db_path)?;
let mut result = GenerateDocsResult {
full_mode: full,
..Default::default()
};
if full {
result.seeded += seed_dirty(&conn, SourceType::Issue, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::MergeRequest, project_filter)?;
result.seeded += seed_dirty(&conn, SourceType::Discussion, project_filter)?;
}
let regen = regenerate_dirty_documents(&conn)?;
result.regenerated = regen.regenerated;
result.unchanged = regen.unchanged;
result.errored = regen.errored;
if full {
// Optimize FTS index after bulk rebuild
let _ = conn.execute(
"INSERT INTO documents_fts(documents_fts) VALUES('optimize')",
[],
);
info!("FTS index optimized after full rebuild");
}
Ok(result)
}
/// Seed dirty_sources with all entities of the given type using keyset pagination.
///
/// Walks the source table (`issues`, `merge_requests` or `discussions`) in
/// ascending `id` order, at most `FULL_MODE_CHUNK_SIZE` rows per chunk, and
/// queues every row into `dirty_sources`. When `project_filter` is given, it is
/// resolved once through `resolve_project` and only rows with that
/// `project_id` are considered.
///
/// Rows already present in `dirty_sources` are left untouched
/// (`ON CONFLICT ... DO NOTHING`). The loop ends when the source table has no
/// more matching rows, not when a chunk happens to insert nothing, so
/// already-queued chunks do not cut seeding short.
///
/// Returns the sum of the row counts reported by the INSERT statements.
///
/// # Errors
///
/// Returns an error if the project cannot be resolved or if any SQL statement
/// fails.
fn seed_dirty(
    conn: &Connection,
    source_type: SourceType,
    project_filter: Option<&str>,
) -> Result<usize> {
    let table = match source_type {
        SourceType::Issue => "issues",
        SourceType::MergeRequest => "merge_requests",
        SourceType::Discussion => "discussions",
    };
    let type_str = source_type.as_str();
    let now = chrono::Utc::now().timestamp_millis();

    // The project does not change between chunks: resolve it once up front.
    let project_id = match project_filter {
        Some(project) => Some(resolve_project(conn, project)?),
        None => None,
    };

    // Build both statements once. The window query and the INSERT must apply
    // the same filter, otherwise the cursor and the inserted rows drift apart.
    let (window_sql, insert_sql) = if project_id.is_some() {
        (
            format!(
                "SELECT MAX(id) FROM \
                 (SELECT id FROM {table} WHERE id > ?1 AND project_id = ?2 ORDER BY id LIMIT ?3)"
            ),
            format!(
                "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                 SELECT ?1, id, ?2, 0, NULL, NULL, NULL
                 FROM {table} WHERE id > ?3 AND id <= ?4 AND project_id = ?5
                 ON CONFLICT(source_type, source_id) DO NOTHING"
            ),
        )
    } else {
        (
            format!(
                "SELECT MAX(id) FROM \
                 (SELECT id FROM {table} WHERE id > ?1 ORDER BY id LIMIT ?2)"
            ),
            format!(
                "INSERT INTO dirty_sources (source_type, source_id, queued_at, attempt_count, last_attempt_at, last_error, next_attempt_at)
                 SELECT ?1, id, ?2, 0, NULL, NULL, NULL
                 FROM {table} WHERE id > ?3 AND id <= ?4
                 ON CONFLICT(source_type, source_id) DO NOTHING"
            ),
        )
    };

    let mut total_seeded: usize = 0;
    let mut last_id: i64 = 0;

    loop {
        // Upper bound of the next chunk. MAX() over an empty window yields
        // NULL, which is the signal that the source table is exhausted.
        let chunk_max = match &project_id {
            Some(pid) => conn.query_row(
                &window_sql,
                rusqlite::params![last_id, pid, FULL_MODE_CHUNK_SIZE],
                |row| row.get::<_, Option<i64>>(0),
            )?,
            None => conn.query_row(
                &window_sql,
                rusqlite::params![last_id, FULL_MODE_CHUNK_SIZE],
                |row| row.get::<_, Option<i64>>(0),
            )?,
        };
        let Some(chunk_max) = chunk_max else {
            break;
        };

        // Queue exactly the rows in (last_id, chunk_max]. A count of zero only
        // means the whole chunk was already queued, so keep going.
        let inserted = match &project_id {
            Some(pid) => conn.execute(
                &insert_sql,
                rusqlite::params![type_str, now, last_id, chunk_max, pid],
            )?,
            None => conn.execute(
                &insert_sql,
                rusqlite::params![type_str, now, last_id, chunk_max],
            )?,
        };

        total_seeded += inserted;
        // chunk_max > last_id by construction, so the cursor always advances.
        last_id = chunk_max;
    }

    info!(
        source_type = type_str,
        seeded = total_seeded,
        "Seeded dirty_sources"
    );
    Ok(total_seeded)
}
/// Print a human-readable summary of a generate-docs run to stdout.
///
/// The "Seeded" line is shown only for full-mode runs, and the "Errored" line
/// only when at least one error was counted.
pub fn print_generate_docs(result: &GenerateDocsResult) {
    let mode_label = if result.full_mode { "full" } else { "incremental" };
    let badge = style("done").green().bold();
    println!("{} Document generation complete ({})", badge, mode_label);

    if result.full_mode {
        let seeded = result.seeded;
        println!(" Seeded: {}", seeded);
    }

    let regenerated = result.regenerated;
    let unchanged = result.unchanged;
    println!(" Regenerated: {}", regenerated);
    println!(" Unchanged: {}", unchanged);

    // Keep the common, error-free output short.
    if result.errored != 0 {
        let errored = style(result.errored).red();
        println!(" Errored: {}", errored);
    }
}
/// JSON output structures.
///
/// Top-level envelope serialized by `print_generate_docs_json`.
#[derive(Serialize)]
struct GenerateDocsJsonOutput {
/// Success flag; `print_generate_docs_json` always sets this to `true`.
ok: bool,
/// Summary of the run.
data: GenerateDocsJsonData,
}
/// Run summary nested under `data` in the JSON output.
#[derive(Serialize)]
struct GenerateDocsJsonData {
/// Run mode: `"full"` or `"incremental"`.
mode: String,
/// Seeded-row count; the key is omitted from the serialized output when this is `None`.
#[serde(skip_serializing_if = "Option::is_none")]
seeded: Option<usize>,
/// Mirrors `GenerateDocsResult::regenerated`.
regenerated: usize,
/// Mirrors `GenerateDocsResult::unchanged`.
unchanged: usize,
/// Mirrors `GenerateDocsResult::errored`.
errored: usize,
}
/// Print the run summary as a single line of JSON (robot mode) to stdout.
///
/// `seeded` is included only for full-mode runs; in incremental mode the key
/// is left out of the output.
pub fn print_generate_docs_json(result: &GenerateDocsResult) {
    let mode_label = if result.full_mode { "full" } else { "incremental" };

    let data = GenerateDocsJsonData {
        mode: mode_label.to_string(),
        seeded: result.full_mode.then_some(result.seeded),
        regenerated: result.regenerated,
        unchanged: result.unchanged,
        errored: result.errored,
    };
    let envelope = GenerateDocsJsonOutput { ok: true, data };

    let json = serde_json::to_string(&envelope).unwrap();
    println!("{}", json);
}