refactor: Remove redundant doc comments throughout codebase

Removes module-level doc comments (//! lines) and excessive inline doc
comments that duplicated information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the "what" is clear from types)
- Implementation context (the "how" is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept docs on type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain
"why" (business decisions, non-obvious constraints), not "what" (which
the code itself shows). This change reduces noise and maintenance burden
while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-05 00:04:32 -05:00
parent 976ad92ef0
commit 65583ed5d6
57 changed files with 143 additions and 1693 deletions

View File

@@ -1,12 +1,3 @@
//! Issue ingestion with cursor-based incremental sync.
//!
//! Fetches issues from GitLab and stores them locally with:
//! - Cursor-based pagination for incremental sync
//! - Raw payload storage with deduplication
//! - Label extraction and stale-link removal
//! - Milestone normalization with dedicated table
//! - Tracking of issues needing discussion sync
use std::ops::Deref;
use futures::StreamExt;
@@ -23,7 +14,6 @@ use crate::gitlab::transformers::{MilestoneRow, transform_issue};
use crate::gitlab::types::GitLabIssue;
use crate::ingestion::dirty_tracker;
/// Result of issue ingestion.
#[derive(Debug, Default)]
pub struct IngestIssuesResult {
pub fetched: usize,
@@ -32,36 +22,31 @@ pub struct IngestIssuesResult {
pub issues_needing_discussion_sync: Vec<IssueForDiscussionSync>,
}
/// Issue that needs discussion sync.
#[derive(Debug, Clone)]
pub struct IssueForDiscussionSync {
pub local_issue_id: i64,
pub iid: i64,
pub updated_at: i64, // ms epoch
pub updated_at: i64,
}
/// Cursor state for incremental sync.
#[derive(Debug, Default)]
struct SyncCursor {
/// Timestamp in milliseconds since the Unix epoch that each issue's parsed
/// `updated_at` is compared against. `None` means there is no cursor, in
/// which case the cursor filter lets every issue through.
updated_at_cursor: Option<i64>,
/// GitLab issue ID used to break ties when an issue's timestamp equals
/// `updated_at_cursor`: per the tests in this file, only a strictly higher
/// ID passes the filter.
// NOTE(review): presumably the ID last written by `update_sync_cursor` — confirm.
tie_breaker_id: Option<i64>,
}
/// Ingest issues for a project.
pub async fn ingest_issues(
conn: &Connection,
client: &GitLabClient,
config: &Config,
project_id: i64, // Local DB project ID
gitlab_project_id: i64, // GitLab project ID
project_id: i64,
gitlab_project_id: i64,
) -> Result<IngestIssuesResult> {
let mut result = IngestIssuesResult::default();
// 1. Get current cursor
let cursor = get_sync_cursor(conn, project_id)?;
debug!(?cursor, "Starting issue ingestion with cursor");
// 2. Stream issues with cursor rewind
let mut issues_stream = client.paginate_issues(
gitlab_project_id,
cursor.updated_at_cursor,
@@ -72,12 +57,10 @@ pub async fn ingest_issues(
let mut last_updated_at: Option<i64> = None;
let mut last_gitlab_id: Option<i64> = None;
// 3. Process each issue
while let Some(issue_result) = issues_stream.next().await {
let issue = issue_result?;
result.fetched += 1;
// Parse timestamp early - skip issues with invalid timestamps
let issue_updated_at = match parse_timestamp(&issue.updated_at) {
Ok(ts) => ts,
Err(e) => {
@@ -90,23 +73,19 @@ pub async fn ingest_issues(
}
};
// Apply local cursor filter (skip already-processed due to rewind overlap)
if !passes_cursor_filter_with_ts(issue.id, issue_updated_at, &cursor) {
debug!(gitlab_id = issue.id, "Skipping already-processed issue");
continue;
}
// Transform and store
let labels_created = process_single_issue(conn, config, project_id, &issue)?;
result.upserted += 1;
result.labels_created += labels_created;
// Track cursor position (use already-parsed timestamp)
last_updated_at = Some(issue_updated_at);
last_gitlab_id = Some(issue.id);
batch_count += 1;
// Incremental cursor update every 100 issues
if batch_count % 100 == 0
&& let (Some(ts), Some(id)) = (last_updated_at, last_gitlab_id)
{
@@ -115,17 +94,12 @@ pub async fn ingest_issues(
}
}
// 4. Final cursor update
if let (Some(ts), Some(id)) = (last_updated_at, last_gitlab_id) {
update_sync_cursor(conn, project_id, ts, id)?;
} else if result.fetched == 0 && cursor.updated_at_cursor.is_some() {
// No new issues returned, but we have an existing cursor.
// Update sync_attempted_at to track that we checked (useful for monitoring)
// The cursor itself stays the same since there's nothing newer to advance to.
debug!("No new issues found, cursor unchanged");
}
// 5. Find issues needing discussion sync
result.issues_needing_discussion_sync = get_issues_needing_discussion_sync(conn, project_id)?;
info!(
@@ -139,11 +113,9 @@ pub async fn ingest_issues(
Ok(result)
}
/// Check if an issue passes the cursor filter (not already processed).
/// Takes pre-parsed timestamp to avoid redundant parsing.
fn passes_cursor_filter_with_ts(gitlab_id: i64, issue_ts: i64, cursor: &SyncCursor) -> bool {
let Some(cursor_ts) = cursor.updated_at_cursor else {
return true; // No cursor = fetch all
return true;
};
if issue_ts < cursor_ts {
@@ -160,12 +132,10 @@ fn passes_cursor_filter_with_ts(gitlab_id: i64, issue_ts: i64, cursor: &SyncCurs
true
}
// Keep the original function for backward compatibility with tests
/// Check if an issue passes the cursor filter (not already processed).
#[cfg(test)]
fn passes_cursor_filter(issue: &GitLabIssue, cursor: &SyncCursor) -> Result<bool> {
let Some(cursor_ts) = cursor.updated_at_cursor else {
return Ok(true); // No cursor = fetch all
return Ok(true);
};
let issue_ts = parse_timestamp(&issue.updated_at)?;
@@ -185,8 +155,6 @@ fn passes_cursor_filter(issue: &GitLabIssue, cursor: &SyncCursor) -> Result<bool
Ok(true)
}
/// Process a single issue: store payload, upsert issue, handle labels.
/// All operations are wrapped in a transaction for atomicity.
fn process_single_issue(
conn: &Connection,
config: &Config,
@@ -195,12 +163,10 @@ fn process_single_issue(
) -> Result<usize> {
let now = now_ms();
// Transform issue first (outside transaction - no DB access)
let payload_bytes = serde_json::to_vec(issue)?;
let transformed = transform_issue(issue)?;
let issue_row = &transformed.issue;
// Wrap all DB operations in a transaction for atomicity
let tx = conn.unchecked_transaction()?;
let labels_created = process_issue_in_transaction(
&tx,
@@ -219,7 +185,6 @@ fn process_single_issue(
Ok(labels_created)
}
/// Inner function that performs all DB operations within a transaction.
#[allow(clippy::too_many_arguments)]
fn process_issue_in_transaction(
tx: &Transaction<'_>,
@@ -235,7 +200,6 @@ fn process_issue_in_transaction(
) -> Result<usize> {
let mut labels_created = 0;
// Store raw payload (deref Transaction to Connection for store_payload)
let payload_id = store_payload(
tx.deref(),
StorePayloadOptions {
@@ -247,14 +211,12 @@ fn process_issue_in_transaction(
},
)?;
// Upsert milestone if present, get local ID
let milestone_id: Option<i64> = if let Some(m) = milestone {
Some(upsert_milestone_tx(tx, project_id, m)?)
} else {
None
};
// Upsert issue (including new fields: due_date, milestone_id, milestone_title)
tx.execute(
"INSERT INTO issues (
gitlab_id, project_id, iid, title, description, state,
@@ -292,35 +254,29 @@ fn process_issue_in_transaction(
),
)?;
// Get local issue ID
let local_issue_id: i64 = tx.query_row(
"SELECT id FROM issues WHERE project_id = ? AND iid = ?",
(project_id, issue_row.iid),
|row| row.get(0),
)?;
// Mark dirty for document regeneration (inside transaction)
dirty_tracker::mark_dirty_tx(tx, SourceType::Issue, local_issue_id)?;
// Clear existing label links (stale removal)
tx.execute(
"DELETE FROM issue_labels WHERE issue_id = ?",
[local_issue_id],
)?;
// Upsert labels and create links
for label_name in label_names {
let label_id = upsert_label_tx(tx, project_id, label_name, &mut labels_created)?;
link_issue_label_tx(tx, local_issue_id, label_id)?;
}
// Clear existing assignee links (stale removal)
tx.execute(
"DELETE FROM issue_assignees WHERE issue_id = ?",
[local_issue_id],
)?;
// Insert assignees
for username in assignee_usernames {
tx.execute(
"INSERT OR IGNORE INTO issue_assignees (issue_id, username) VALUES (?, ?)",
@@ -331,8 +287,6 @@ fn process_issue_in_transaction(
Ok(labels_created)
}
/// Upsert a label within a transaction, returning its ID.
/// Uses INSERT...ON CONFLICT...RETURNING for a single round-trip.
fn upsert_label_tx(
tx: &Transaction<'_>,
project_id: i64,
@@ -347,7 +301,6 @@ fn upsert_label_tx(
|row| row.get(0),
)?;
// If the rowid matches last_insert_rowid, this was a new insert
if tx.last_insert_rowid() == id {
*created_count += 1;
}
@@ -355,7 +308,6 @@ fn upsert_label_tx(
Ok(id)
}
/// Link an issue to a label within a transaction.
fn link_issue_label_tx(tx: &Transaction<'_>, issue_id: i64, label_id: i64) -> Result<()> {
tx.execute(
"INSERT OR IGNORE INTO issue_labels (issue_id, label_id) VALUES (?, ?)",
@@ -364,8 +316,6 @@ fn link_issue_label_tx(tx: &Transaction<'_>, issue_id: i64, label_id: i64) -> Re
Ok(())
}
/// Upsert a milestone within a transaction, returning its local ID.
/// Uses RETURNING to avoid a separate SELECT round-trip.
fn upsert_milestone_tx(
tx: &Transaction<'_>,
project_id: i64,
@@ -398,7 +348,6 @@ fn upsert_milestone_tx(
Ok(local_id)
}
/// Get the current sync cursor for issues.
fn get_sync_cursor(conn: &Connection, project_id: i64) -> Result<SyncCursor> {
let row: Option<(Option<i64>, Option<i64>)> = conn
.query_row(
@@ -418,7 +367,6 @@ fn get_sync_cursor(conn: &Connection, project_id: i64) -> Result<SyncCursor> {
})
}
/// Update the sync cursor.
fn update_sync_cursor(
conn: &Connection,
project_id: i64,
@@ -436,7 +384,6 @@ fn update_sync_cursor(
Ok(())
}
/// Get issues that need discussion sync (updated_at > discussions_synced_for_updated_at).
fn get_issues_needing_discussion_sync(
conn: &Connection,
project_id: i64,
@@ -460,8 +407,6 @@ fn get_issues_needing_discussion_sync(
Ok(issues?)
}
/// Parse ISO 8601 timestamp to milliseconds.
/// Returns an error if parsing fails instead of silently returning 0.
fn parse_timestamp(ts: &str) -> Result<i64> {
chrono::DateTime::parse_from_rfc3339(ts)
.map(|dt| dt.timestamp_millis())
@@ -500,11 +445,10 @@ mod tests {
#[test]
fn cursor_filter_allows_newer_issues() {
let cursor = SyncCursor {
updated_at_cursor: Some(1705312800000), // 2024-01-15T10:00:00Z
updated_at_cursor: Some(1705312800000),
tie_breaker_id: Some(100),
};
// Issue with later timestamp passes
let issue = make_test_issue(101, "2024-01-16T10:00:00.000Z");
assert!(passes_cursor_filter(&issue, &cursor).unwrap_or(false));
}
@@ -516,7 +460,6 @@ mod tests {
tie_breaker_id: Some(100),
};
// Issue with earlier timestamp blocked
let issue = make_test_issue(99, "2024-01-14T10:00:00.000Z");
assert!(!passes_cursor_filter(&issue, &cursor).unwrap_or(true));
}
@@ -528,15 +471,12 @@ mod tests {
tie_breaker_id: Some(100),
};
// Same timestamp, higher ID passes
let issue1 = make_test_issue(101, "2024-01-15T10:00:00.000Z");
assert!(passes_cursor_filter(&issue1, &cursor).unwrap_or(false));
// Same timestamp, same ID blocked
let issue2 = make_test_issue(100, "2024-01-15T10:00:00.000Z");
assert!(!passes_cursor_filter(&issue2, &cursor).unwrap_or(true));
// Same timestamp, lower ID blocked
let issue3 = make_test_issue(99, "2024-01-15T10:00:00.000Z");
assert!(!passes_cursor_filter(&issue3, &cursor).unwrap_or(true));
}