refactor: Remove redundant doc comments throughout codebase

Removes module-level doc comments (//! lines) and excessive inline doc
comments that were duplicating information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the "what" is clear from types)
- Implementation context (the "how" is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain
"why" (business decisions, non-obvious constraints) not "what" (which
the code itself shows). This change reduces noise and maintenance burden
while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-05 00:04:32 -05:00
parent 976ad92ef0
commit 65583ed5d6
57 changed files with 143 additions and 1693 deletions

View File

@@ -1,9 +1,3 @@
//! Performance metrics types and tracing layer for sync pipeline observability.
//!
//! Provides:
//! - [`StageTiming`]: Serializable timing/counter data for pipeline stages
//! - [`MetricsLayer`]: Custom tracing subscriber layer that captures span timing
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;
@@ -14,16 +8,10 @@ use tracing::span::{Attributes, Id, Record};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::registry::LookupSpan;
/// Reports whether the referenced counter is exactly zero.
///
/// Intended as a serde `skip_serializing_if` predicate, which is why the
/// argument is taken by reference even though `usize` is `Copy`.
fn is_zero(v: &usize) -> bool {
    matches!(*v, 0)
}
/// Timing and counter data for a single pipeline stage.
///
/// Supports nested sub-stages for hierarchical timing breakdowns.
/// Fields with zero/empty values are omitted from JSON output to
/// keep robot-mode payloads compact.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StageTiming {
pub name: String,
@@ -43,11 +31,6 @@ pub struct StageTiming {
pub sub_stages: Vec<StageTiming>,
}
// ============================================================================
// MetricsLayer: custom tracing subscriber layer
// ============================================================================
/// Internal data tracked per open span.
struct SpanData {
name: String,
parent_id: Option<u64>,
@@ -57,19 +40,12 @@ struct SpanData {
retries: usize,
}
/// Completed span data with its original ID and parent ID.
///
/// `extract_timings` reads these records to rebuild the stage tree: timings
/// are keyed by `id`, and a record is either grouped under its `parent_id`
/// or, when there is none, treated as a root.
struct CompletedSpan {
/// Identifier of this span; used as the lookup key for its timing.
id: u64,
/// Identifier of the enclosing span; `None` marks a top-level (root) span.
parent_id: Option<u64>,
/// Timing and counter data recorded for this span.
timing: StageTiming,
}
/// Custom tracing layer that captures span timing and structured fields.
///
/// Collects data from `#[instrument]` spans and materializes it into
/// a `Vec<StageTiming>` tree via [`extract_timings`].
///
/// Thread-safe via `Arc<Mutex<>>` — suitable for concurrent span operations.
#[derive(Debug, Clone)]
pub struct MetricsLayer {
spans: Arc<Mutex<HashMap<u64, SpanData>>>,
@@ -90,45 +66,34 @@ impl MetricsLayer {
}
}
/// Extract timing tree for a completed run.
///
/// Returns the top-level stages with sub-stages nested.
/// Call after the root span closes.
pub fn extract_timings(&self) -> Vec<StageTiming> {
let completed = self.completed.lock().unwrap_or_else(|e| e.into_inner());
if completed.is_empty() {
return Vec::new();
}
// Build children map: parent_id -> Vec<StageTiming>
let mut children_map: HashMap<u64, Vec<StageTiming>> = HashMap::new();
let mut roots = Vec::new();
let mut id_to_timing: HashMap<u64, StageTiming> = HashMap::new();
// First pass: collect all timings by ID
for entry in completed.iter() {
id_to_timing.insert(entry.id, entry.timing.clone());
}
// Second pass: walk spans in stored order (children close before parents)
// to build the tree bottom-up
for entry in completed.iter() {
// Attach any children that were collected for this span
if let Some(timing) = id_to_timing.get_mut(&entry.id)
&& let Some(children) = children_map.remove(&entry.id)
{
timing.sub_stages = children;
}
if let Some(parent_id) = entry.parent_id {
// This is a child span — attach to parent's children
if let Some(timing) = id_to_timing.remove(&entry.id) {
children_map.entry(parent_id).or_default().push(timing);
}
if let Some(parent_id) = entry.parent_id
&& let Some(timing) = id_to_timing.remove(&entry.id)
{
children_map.entry(parent_id).or_default().push(timing);
}
}
// Remaining entries in id_to_timing are roots
for entry in completed.iter() {
if entry.parent_id.is_none()
&& let Some(mut timing) = id_to_timing.remove(&entry.id)
@@ -144,7 +109,6 @@ impl MetricsLayer {
}
}
/// Visitor that extracts field values from span attributes.
///
/// Wraps a mutable borrow of a `String` -> `serde_json::Value` map and
/// implements `tracing::field::Visit`; the borrow's lifetime `'a` ties the
/// visitor to the map it was handed.
struct FieldVisitor<'a>(&'a mut HashMap<String, serde_json::Value>);
impl tracing::field::Visit for FieldVisitor<'_> {
@@ -182,7 +146,6 @@ impl tracing::field::Visit for FieldVisitor<'_> {
}
}
/// Visitor that extracts event fields for rate-limit/retry detection.
#[derive(Default)]
struct EventVisitor {
status_code: Option<u64>,
@@ -248,7 +211,6 @@ where
}
fn on_event(&self, event: &tracing::Event<'_>, ctx: Context<'_, S>) {
// Count rate-limit and retry events on the current span
if let Some(span_ref) = ctx.event_span(event) {
let id = span_ref.id();
if let Some(data) = self
@@ -317,7 +279,6 @@ where
}
}
// Manual Debug impl since SpanData and CompletedSpan don't derive Debug
impl std::fmt::Debug for SpanData {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SpanData")
@@ -376,7 +337,6 @@ mod tests {
assert_eq!(json["rate_limit_hits"], 2);
assert_eq!(json["retries"], 5);
// Sub-stage present
let sub = &json["sub_stages"][0];
assert_eq!(sub["name"], "ingest_issues");
assert_eq!(sub["project"], "group/repo");
@@ -400,7 +360,6 @@ mod tests {
let json = serde_json::to_value(&timing).unwrap();
let obj = json.as_object().unwrap();
// Zero fields must be absent
assert!(!obj.contains_key("items_skipped"));
assert!(!obj.contains_key("errors"));
assert!(!obj.contains_key("rate_limit_hits"));
@@ -408,7 +367,6 @@ mod tests {
assert!(!obj.contains_key("sub_stages"));
assert!(!obj.contains_key("project"));
// Required fields always present
assert!(obj.contains_key("name"));
assert!(obj.contains_key("elapsed_ms"));
assert!(obj.contains_key("items_processed"));
@@ -539,13 +497,12 @@ mod tests {
tracing::subscriber::with_default(subscriber, || {
let span = tracing::info_span!("test_stage");
let _guard = span.enter();
// Simulate work
});
let timings = metrics.extract_timings();
assert_eq!(timings.len(), 1);
assert_eq!(timings[0].name, "test_stage");
assert!(timings[0].elapsed_ms < 100); // Should be near-instant
assert!(timings[0].elapsed_ms < 100);
}
#[test]