refactor: Remove redundant doc comments throughout codebase

Removes module-level doc comments (//! lines) and excessive inline doc
comments that were duplicating information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the what is clear from types)
- Implementation context (the how is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain
"why" (business decisions, non-obvious constraints) not "what" (which
the code itself shows). This change reduces noise and maintenance burden
while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-05 00:04:32 -05:00
parent 976ad92ef0
commit 65583ed5d6
57 changed files with 143 additions and 1693 deletions

View File

@@ -1,15 +1,3 @@
//! MR Discussion ingestion with atomicity guarantees.
//!
//! Critical requirements:
//! - Parse notes BEFORE any destructive DB operations
//! - Watermark advanced ONLY on full pagination success
//! - Upsert + sweep pattern for data replacement
//! - Sync health telemetry for debugging failures
//!
//! Supports two modes:
//! - Streaming: fetch and write incrementally (memory efficient)
//! - Prefetch: fetch all upfront, then write (enables parallel API calls)
use futures::StreamExt;
use rusqlite::{Connection, params};
use tracing::{debug, info, warn};
@@ -29,7 +17,6 @@ use crate::ingestion::dirty_tracker;
use super::merge_requests::MrForDiscussionSync;
/// Result of MR discussion ingestion for a single MR.
#[derive(Debug, Default)]
pub struct IngestMrDiscussionsResult {
pub discussions_fetched: usize,
@@ -40,20 +27,15 @@ pub struct IngestMrDiscussionsResult {
pub pagination_succeeded: bool,
}
/// Prefetched discussions for an MR (ready for DB write).
/// This separates the API fetch phase from the DB write phase to enable parallelism.
#[derive(Debug)]
pub struct PrefetchedMrDiscussions {
pub mr: MrForDiscussionSync,
pub discussions: Vec<PrefetchedDiscussion>,
pub fetch_error: Option<String>,
/// True if any discussions failed to transform (skip sweep if true)
pub had_transform_errors: bool,
/// Count of notes skipped due to transform errors
pub notes_skipped_count: usize,
}
/// A single prefetched discussion with transformed data.
#[derive(Debug)]
pub struct PrefetchedDiscussion {
pub raw: GitLabDiscussion,
@@ -61,8 +43,6 @@ pub struct PrefetchedDiscussion {
pub notes: Vec<NormalizedNote>,
}
/// Fetch discussions for an MR without writing to DB.
/// This can be called in parallel for multiple MRs.
pub async fn prefetch_mr_discussions(
client: &GitLabClient,
gitlab_project_id: i64,
@@ -71,7 +51,6 @@ pub async fn prefetch_mr_discussions(
) -> PrefetchedMrDiscussions {
debug!(mr_iid = mr.iid, "Prefetching discussions for MR");
// Fetch all discussions from GitLab
let raw_discussions = match client
.fetch_all_mr_discussions(gitlab_project_id, mr.iid)
.await
@@ -88,13 +67,11 @@ pub async fn prefetch_mr_discussions(
}
};
// Transform each discussion
let mut discussions = Vec::with_capacity(raw_discussions.len());
let mut had_transform_errors = false;
let mut notes_skipped_count = 0;
for raw in raw_discussions {
// Transform notes
let notes = match transform_notes_with_diff_position(&raw, local_project_id) {
Ok(n) => n,
Err(e) => {
@@ -104,14 +81,12 @@ pub async fn prefetch_mr_discussions(
error = %e,
"Note transform failed during prefetch"
);
// Track the failure - don't sweep stale data if transforms failed
had_transform_errors = true;
notes_skipped_count += raw.notes.len();
continue;
}
};
// Transform discussion
let normalized = transform_mr_discussion(&raw, local_project_id, mr.local_mr_id);
discussions.push(PrefetchedDiscussion {
@@ -130,15 +105,12 @@ pub async fn prefetch_mr_discussions(
}
}
/// Write prefetched discussions to DB.
/// This must be called serially (rusqlite Connection is not Send).
pub fn write_prefetched_mr_discussions(
conn: &Connection,
config: &Config,
local_project_id: i64,
prefetched: PrefetchedMrDiscussions,
) -> Result<IngestMrDiscussionsResult> {
// Sync succeeds only if no fetch errors AND no transform errors
let sync_succeeded = prefetched.fetch_error.is_none() && !prefetched.had_transform_errors;
let mut result = IngestMrDiscussionsResult {
@@ -149,7 +121,6 @@ pub fn write_prefetched_mr_discussions(
let mr = &prefetched.mr;
// Handle fetch errors
if let Some(error) = &prefetched.fetch_error {
warn!(mr_iid = mr.iid, error = %error, "Prefetch failed for MR");
record_sync_health_error(conn, mr.local_mr_id, error)?;
@@ -158,9 +129,7 @@ pub fn write_prefetched_mr_discussions(
let run_seen_at = now_ms();
// Write each discussion
for disc in &prefetched.discussions {
// Count DiffNotes upfront (independent of transaction)
let diffnotes_in_disc = disc
.notes
.iter()
@@ -168,10 +137,8 @@ pub fn write_prefetched_mr_discussions(
.count();
let notes_in_disc = disc.notes.len();
// Start transaction
let tx = conn.unchecked_transaction()?;
// Store raw payload
let payload_bytes = serde_json::to_vec(&disc.raw)?;
let payload_id = Some(store_payload(
&tx,
@@ -184,20 +151,16 @@ pub fn write_prefetched_mr_discussions(
},
)?);
// Upsert discussion
upsert_discussion(&tx, &disc.normalized, run_seen_at, payload_id)?;
// Get local discussion ID
let local_discussion_id: i64 = tx.query_row(
"SELECT id FROM discussions WHERE project_id = ? AND gitlab_discussion_id = ?",
params![local_project_id, &disc.normalized.gitlab_discussion_id],
|row| row.get(0),
)?;
// Mark dirty for document regeneration (inside transaction)
dirty_tracker::mark_dirty_tx(&tx, SourceType::Discussion, local_discussion_id)?;
// Upsert notes
for note in &disc.notes {
let should_store_payload = !note.is_system
|| note.position_new_path.is_some()
@@ -229,15 +192,12 @@ pub fn write_prefetched_mr_discussions(
tx.commit()?;
// Increment counters AFTER successful commit to keep metrics honest
result.discussions_fetched += 1;
result.discussions_upserted += 1;
result.notes_upserted += notes_in_disc;
result.diffnotes_count += diffnotes_in_disc;
}
// Only sweep stale data and advance watermark on full success
// If any discussions failed to transform, preserve existing data
if sync_succeeded {
sweep_stale_discussions(conn, mr.local_mr_id, run_seen_at)?;
sweep_stale_notes(conn, local_project_id, mr.local_mr_id, run_seen_at)?;
@@ -259,7 +219,6 @@ pub fn write_prefetched_mr_discussions(
Ok(result)
}
/// Ingest discussions for MRs that need sync.
pub async fn ingest_mr_discussions(
conn: &Connection,
client: &GitLabClient,
@@ -269,7 +228,7 @@ pub async fn ingest_mr_discussions(
mrs: &[MrForDiscussionSync],
) -> Result<IngestMrDiscussionsResult> {
let mut total_result = IngestMrDiscussionsResult {
pagination_succeeded: true, // Start optimistic
pagination_succeeded: true,
..Default::default()
};
@@ -289,7 +248,6 @@ pub async fn ingest_mr_discussions(
total_result.notes_upserted += result.notes_upserted;
total_result.notes_skipped_bad_timestamp += result.notes_skipped_bad_timestamp;
total_result.diffnotes_count += result.diffnotes_count;
// Pagination failed for any MR means overall failure
if !result.pagination_succeeded {
total_result.pagination_succeeded = false;
}
@@ -309,7 +267,6 @@ pub async fn ingest_mr_discussions(
Ok(total_result)
}
/// Ingest discussions for a single MR.
async fn ingest_discussions_for_mr(
conn: &Connection,
client: &GitLabClient,
@@ -329,13 +286,10 @@ async fn ingest_discussions_for_mr(
"Fetching discussions for MR"
);
// Record sync start time for sweep
let run_seen_at = now_ms();
// Stream discussions from GitLab
let mut discussions_stream = client.paginate_mr_discussions(gitlab_project_id, mr.iid);
// Track if we've received any response
let mut received_first_response = false;
while let Some(disc_result) = discussions_stream.next().await {
@@ -343,7 +297,6 @@ async fn ingest_discussions_for_mr(
received_first_response = true;
}
// Handle pagination errors - don't advance watermark
let gitlab_discussion = match disc_result {
Ok(d) => d,
Err(e) => {
@@ -357,7 +310,6 @@ async fn ingest_discussions_for_mr(
break;
}
};
// CRITICAL: Parse notes BEFORE any destructive DB operations
let notes = match transform_notes_with_diff_position(&gitlab_discussion, local_project_id) {
Ok(notes) => notes,
Err(e) => {
@@ -369,25 +321,21 @@ async fn ingest_discussions_for_mr(
);
result.notes_skipped_bad_timestamp += gitlab_discussion.notes.len();
result.pagination_succeeded = false;
continue; // Skip this discussion, preserve existing data
continue;
}
};
// Count DiffNotes upfront (independent of transaction)
let diffnotes_in_disc = notes
.iter()
.filter(|n| n.position_new_path.is_some() || n.position_old_path.is_some())
.count();
let notes_count = notes.len();
// Transform discussion
let normalized_discussion =
transform_mr_discussion(&gitlab_discussion, local_project_id, mr.local_mr_id);
// Only NOW start transaction (after parse succeeded)
let tx = conn.unchecked_transaction()?;
// Store raw payload
let payload_bytes = serde_json::to_vec(&gitlab_discussion)?;
let payload_id = Some(store_payload(
&tx,
@@ -400,10 +348,8 @@ async fn ingest_discussions_for_mr(
},
)?);
// Upsert discussion with run_seen_at
upsert_discussion(&tx, &normalized_discussion, run_seen_at, payload_id)?;
// Get local discussion ID
let local_discussion_id: i64 = tx.query_row(
"SELECT id FROM discussions WHERE project_id = ? AND gitlab_discussion_id = ?",
params![
@@ -413,12 +359,9 @@ async fn ingest_discussions_for_mr(
|row| row.get(0),
)?;
// Mark dirty for document regeneration (inside transaction)
dirty_tracker::mark_dirty_tx(&tx, SourceType::Discussion, local_discussion_id)?;
// Upsert notes (not delete-all-then-insert)
for note in &notes {
// Selective payload storage: skip system notes without position
let should_store_payload = !note.is_system
|| note.position_new_path.is_some()
|| note.position_old_path.is_some();
@@ -452,22 +395,17 @@ async fn ingest_discussions_for_mr(
tx.commit()?;
// Increment counters AFTER successful commit to keep metrics honest
result.discussions_fetched += 1;
result.discussions_upserted += 1;
result.notes_upserted += notes_count;
result.diffnotes_count += diffnotes_in_disc;
}
// Only sweep stale data and advance watermark on full success
if result.pagination_succeeded && received_first_response {
// Sweep stale discussions for this MR
sweep_stale_discussions(conn, mr.local_mr_id, run_seen_at)?;
// Sweep stale notes for this MR
sweep_stale_notes(conn, local_project_id, mr.local_mr_id, run_seen_at)?;
// Advance watermark
mark_discussions_synced(conn, mr.local_mr_id, mr.updated_at)?;
clear_sync_health_error(conn, mr.local_mr_id)?;
@@ -476,7 +414,6 @@ async fn ingest_discussions_for_mr(
"MR discussion sync complete, watermark advanced"
);
} else if result.pagination_succeeded && !received_first_response {
// Empty response (no discussions) - still safe to sweep and advance
sweep_stale_discussions(conn, mr.local_mr_id, run_seen_at)?;
sweep_stale_notes(conn, local_project_id, mr.local_mr_id, run_seen_at)?;
mark_discussions_synced(conn, mr.local_mr_id, mr.updated_at)?;
@@ -493,7 +430,6 @@ async fn ingest_discussions_for_mr(
Ok(result)
}
/// Upsert a discussion with last_seen_at for sweep.
fn upsert_discussion(
conn: &Connection,
discussion: &crate::gitlab::transformers::NormalizedDiscussion,
@@ -531,7 +467,6 @@ fn upsert_discussion(
Ok(())
}
/// Upsert a note with last_seen_at for sweep.
fn upsert_note(
conn: &Connection,
discussion_id: i64,
@@ -601,7 +536,6 @@ fn upsert_note(
Ok(())
}
/// Sweep stale discussions (not seen in this run).
fn sweep_stale_discussions(conn: &Connection, local_mr_id: i64, run_seen_at: i64) -> Result<usize> {
let deleted = conn.execute(
"DELETE FROM discussions
@@ -614,7 +548,6 @@ fn sweep_stale_discussions(conn: &Connection, local_mr_id: i64, run_seen_at: i64
Ok(deleted)
}
/// Sweep stale notes for discussions belonging to this MR.
fn sweep_stale_notes(
conn: &Connection,
local_project_id: i64,
@@ -636,7 +569,6 @@ fn sweep_stale_notes(
Ok(deleted)
}
/// Mark MR discussions as synced (advance watermark).
fn mark_discussions_synced(conn: &Connection, local_mr_id: i64, updated_at: i64) -> Result<()> {
conn.execute(
"UPDATE merge_requests SET discussions_synced_for_updated_at = ? WHERE id = ?",
@@ -645,7 +577,6 @@ fn mark_discussions_synced(conn: &Connection, local_mr_id: i64, updated_at: i64)
Ok(())
}
/// Record sync health error for debugging.
fn record_sync_health_error(conn: &Connection, local_mr_id: i64, error: &str) -> Result<()> {
conn.execute(
"UPDATE merge_requests SET
@@ -658,7 +589,6 @@ fn record_sync_health_error(conn: &Connection, local_mr_id: i64, error: &str) ->
Ok(())
}
/// Clear sync health error on success.
fn clear_sync_health_error(conn: &Connection, local_mr_id: i64) -> Result<()> {
conn.execute(
"UPDATE merge_requests SET