refactor: Remove redundant doc comments throughout codebase

Removes module-level doc comments (//! lines) and excessive inline doc
comments that were duplicating information already evident from:
- Function/struct names (self-documenting code)
- Type signatures (the what is clear from types)
- Implementation context (the how is clear from code)

Affected modules:
- cli/* - Removed command descriptions duplicating clap help text
- core/* - Removed module headers and obvious function docs
- documents/* - Removed extractor/regenerator/truncation docs
- embedding/* - Removed pipeline and chunking docs
- gitlab/* - Removed client and transformer docs (kept type definitions)
- ingestion/* - Removed orchestrator and ingestion docs
- search/* - Removed FTS and vector search docs

Philosophy: Code should be self-documenting. Comments should explain
"why" (business decisions, non-obvious constraints) not "what" (which
the code itself shows). This change reduces noise and maintenance burden
while keeping the codebase just as understandable.

Retains comments for:
- Non-obvious business logic
- Important safety invariants
- Complex algorithm explanations
- Public API boundaries where generated docs matter

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Taylor Eernisse
2026-02-05 00:04:32 -05:00
parent 976ad92ef0
commit 65583ed5d6
57 changed files with 143 additions and 1693 deletions

View File

@@ -9,7 +9,6 @@ use super::truncation::{
};
use crate::core::error::Result;
/// Source type for documents.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SourceType {
@@ -27,10 +26,6 @@ impl SourceType {
}
}
/// Parse from CLI input, accepting common aliases.
///
/// Accepts: "issue", "issues", "mr", "mrs", "merge_request", "merge_requests",
/// "discussion", "discussions"
pub fn parse(s: &str) -> Option<Self> {
match s.to_lowercase().as_str() {
"issue" | "issues" => Some(Self::Issue),
@@ -47,7 +42,6 @@ impl std::fmt::Display for SourceType {
}
}
/// Generated document ready for storage.
#[derive(Debug, Clone)]
pub struct DocumentData {
pub source_type: SourceType,
@@ -68,16 +62,12 @@ pub struct DocumentData {
pub truncated_reason: Option<String>,
}
/// Compute the SHA-256 digest of `content`, rendered as lowercase hex.
pub fn compute_content_hash(content: &str) -> String {
    let mut state = Sha256::new();
    state.update(content.as_bytes());
    let digest = state.finalize();
    format!("{:x}", digest)
}
/// Compute SHA-256 hash over a sorted list of strings.
/// Used for labels_hash and paths_hash to detect changes efficiently.
/// Sorts by index reference to avoid cloning, hashes incrementally to avoid join allocation.
pub fn compute_list_hash(items: &[String]) -> String {
let mut indices: Vec<usize> = (0..items.len()).collect();
indices.sort_by(|a, b| items[*a].cmp(&items[*b]));
@@ -91,10 +81,7 @@ pub fn compute_list_hash(items: &[String]) -> String {
format!("{:x}", hasher.finalize())
}
/// Extract a searchable document from an issue.
/// Returns None if the issue has been deleted from the DB.
pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option<DocumentData>> {
// Query main issue entity with project info
let row = conn.query_row(
"SELECT i.id, i.iid, i.title, i.description, i.state, i.author_username,
i.created_at, i.updated_at, i.web_url,
@@ -105,17 +92,17 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
rusqlite::params![issue_id],
|row| {
Ok((
row.get::<_, i64>(0)?, // id
row.get::<_, i64>(1)?, // iid
row.get::<_, Option<String>>(2)?, // title
row.get::<_, Option<String>>(3)?, // description
row.get::<_, String>(4)?, // state
row.get::<_, Option<String>>(5)?, // author_username
row.get::<_, i64>(6)?, // created_at
row.get::<_, i64>(7)?, // updated_at
row.get::<_, Option<String>>(8)?, // web_url
row.get::<_, String>(9)?, // path_with_namespace
row.get::<_, i64>(10)?, // project_id
row.get::<_, i64>(0)?,
row.get::<_, i64>(1)?,
row.get::<_, Option<String>>(2)?,
row.get::<_, Option<String>>(3)?,
row.get::<_, String>(4)?,
row.get::<_, Option<String>>(5)?,
row.get::<_, i64>(6)?,
row.get::<_, i64>(7)?,
row.get::<_, Option<String>>(8)?,
row.get::<_, String>(9)?,
row.get::<_, i64>(10)?,
))
},
);
@@ -138,7 +125,6 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
Err(e) => return Err(e.into()),
};
// Query labels via junction table
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id
@@ -149,10 +135,8 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
.query_map(rusqlite::params![id], |row| row.get(0))?
.collect::<std::result::Result<Vec<_>, _>>()?;
// Build labels JSON array string
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
// Format content_text per PRD template
let display_title = title.as_deref().unwrap_or("(untitled)");
let mut content = format!(
"[[Issue]] #{}: {}\nProject: {}\n",
@@ -167,16 +151,14 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
content.push_str(&format!("Author: @{}\n", author));
}
// Add description section only if description is Some
if let Some(ref desc) = description {
content.push_str("\n--- Description ---\n\n");
content.push_str(desc);
}
let labels_hash = compute_list_hash(&labels);
let paths_hash = compute_list_hash(&[]); // Issues have no paths
let paths_hash = compute_list_hash(&[]);
// Apply hard cap truncation for safety, then hash the final stored content
let hard_cap = truncate_hard_cap(&content);
let content_hash = compute_content_hash(&hard_cap.content);
@@ -200,8 +182,6 @@ pub fn extract_issue_document(conn: &Connection, issue_id: i64) -> Result<Option
}))
}
/// Extract a searchable document from a merge request.
/// Returns None if the MR has been deleted from the DB.
pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<DocumentData>> {
let row = conn.query_row(
"SELECT m.id, m.iid, m.title, m.description, m.state, m.author_username,
@@ -214,19 +194,19 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
rusqlite::params![mr_id],
|row| {
Ok((
row.get::<_, i64>(0)?, // id
row.get::<_, i64>(1)?, // iid
row.get::<_, Option<String>>(2)?, // title
row.get::<_, Option<String>>(3)?, // description
row.get::<_, Option<String>>(4)?, // state
row.get::<_, Option<String>>(5)?, // author_username
row.get::<_, Option<String>>(6)?, // source_branch
row.get::<_, Option<String>>(7)?, // target_branch
row.get::<_, Option<i64>>(8)?, // created_at (nullable in schema)
row.get::<_, Option<i64>>(9)?, // updated_at (nullable in schema)
row.get::<_, Option<String>>(10)?, // web_url
row.get::<_, String>(11)?, // path_with_namespace
row.get::<_, i64>(12)?, // project_id
row.get::<_, i64>(0)?,
row.get::<_, i64>(1)?,
row.get::<_, Option<String>>(2)?,
row.get::<_, Option<String>>(3)?,
row.get::<_, Option<String>>(4)?,
row.get::<_, Option<String>>(5)?,
row.get::<_, Option<String>>(6)?,
row.get::<_, Option<String>>(7)?,
row.get::<_, Option<i64>>(8)?,
row.get::<_, Option<i64>>(9)?,
row.get::<_, Option<String>>(10)?,
row.get::<_, String>(11)?,
row.get::<_, i64>(12)?,
))
},
);
@@ -251,7 +231,6 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
Err(e) => return Err(e.into()),
};
// Query labels via junction table
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id
@@ -278,7 +257,6 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
if let Some(ref author) = author_username {
content.push_str(&format!("Author: @{}\n", author));
}
// Source line: source_branch -> target_branch
if let (Some(src), Some(tgt)) = (&source_branch, &target_branch) {
content.push_str(&format!("Source: {} -> {}\n", src, tgt));
}
@@ -291,7 +269,6 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
let labels_hash = compute_list_hash(&labels);
let paths_hash = compute_list_hash(&[]);
// Apply hard cap truncation for safety, then hash the final stored content
let hard_cap = truncate_hard_cap(&content);
let content_hash = compute_content_hash(&hard_cap.content);
@@ -315,20 +292,16 @@ pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<Docum
}))
}
/// Format a millisecond Unix epoch timestamp as a `YYYY-MM-DD` date string.
///
/// Returns `"unknown"` when the timestamp falls outside the representable
/// `DateTime` range (i.e. `from_timestamp_millis` yields `None`).
fn format_date(ms: i64) -> String {
    match DateTime::from_timestamp_millis(ms) {
        Some(dt) => dt.format("%Y-%m-%d").to_string(),
        None => "unknown".to_string(),
    }
}
/// Extract a searchable document from a discussion thread.
/// Returns None if the discussion or its parent has been deleted.
pub fn extract_discussion_document(
conn: &Connection,
discussion_id: i64,
) -> Result<Option<DocumentData>> {
// Query discussion metadata
let disc_row = conn.query_row(
"SELECT d.id, d.noteable_type, d.issue_id, d.merge_request_id,
p.path_with_namespace, p.id AS project_id
@@ -338,12 +311,12 @@ pub fn extract_discussion_document(
rusqlite::params![discussion_id],
|row| {
Ok((
row.get::<_, i64>(0)?, // id
row.get::<_, String>(1)?, // noteable_type
row.get::<_, Option<i64>>(2)?, // issue_id
row.get::<_, Option<i64>>(3)?, // merge_request_id
row.get::<_, String>(4)?, // path_with_namespace
row.get::<_, i64>(5)?, // project_id
row.get::<_, i64>(0)?,
row.get::<_, String>(1)?,
row.get::<_, Option<i64>>(2)?,
row.get::<_, Option<i64>>(3)?,
row.get::<_, String>(4)?,
row.get::<_, i64>(5)?,
))
},
);
@@ -355,7 +328,6 @@ pub fn extract_discussion_document(
Err(e) => return Err(e.into()),
};
// Query parent entity
let (_parent_iid, parent_title, parent_web_url, parent_type_prefix, labels) =
match noteable_type.as_str() {
"Issue" => {
@@ -379,7 +351,6 @@ pub fn extract_discussion_document(
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
Err(e) => return Err(e.into()),
};
// Query parent labels
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM issue_labels il
JOIN labels l ON l.id = il.label_id
@@ -413,7 +384,6 @@ pub fn extract_discussion_document(
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
Err(e) => return Err(e.into()),
};
// Query parent labels
let mut label_stmt = conn.prepare_cached(
"SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id
@@ -429,7 +399,6 @@ pub fn extract_discussion_document(
_ => return Ok(None),
};
// Query non-system notes in thread order
let mut note_stmt = conn.prepare_cached(
"SELECT n.author_username, n.body, n.created_at, n.gitlab_id,
n.note_type, n.position_old_path, n.position_new_path
@@ -454,7 +423,6 @@ pub fn extract_discussion_document(
body: row.get(1)?,
created_at: row.get(2)?,
gitlab_id: row.get(3)?,
// index 4 is note_type (unused here)
old_path: row.get(5)?,
new_path: row.get(6)?,
})
@@ -465,7 +433,6 @@ pub fn extract_discussion_document(
return Ok(None);
}
// Extract DiffNote paths (deduplicated, sorted)
let mut path_set = BTreeSet::new();
for note in &notes {
if let Some(ref p) = note.old_path
@@ -481,16 +448,13 @@ pub fn extract_discussion_document(
}
let paths: Vec<String> = path_set.into_iter().collect();
// Construct URL: parent_web_url#note_{first_note_gitlab_id}
let first_note_gitlab_id = notes[0].gitlab_id;
let url = parent_web_url
.as_ref()
.map(|wu| format!("{}#note_{}", wu, first_note_gitlab_id));
// First non-system note author
let author_username = notes[0].author.clone();
// Build content
let display_title = parent_title.as_deref().unwrap_or("(untitled)");
let labels_json = serde_json::to_string(&labels).unwrap_or_else(|_| "[]".to_string());
let paths_json = serde_json::to_string(&paths).unwrap_or_else(|_| "[]".to_string());
@@ -507,7 +471,6 @@ pub fn extract_discussion_document(
content.push_str(&format!("Files: {}\n", paths_json));
}
// Build NoteContent list for truncation-aware thread rendering
let note_contents: Vec<NoteContent> = notes
.iter()
.map(|note| NoteContent {
@@ -517,7 +480,6 @@ pub fn extract_discussion_document(
})
.collect();
// Estimate header size to reserve budget for thread content
let header_len = content.len() + "\n--- Thread ---\n\n".len();
let thread_budget = MAX_DISCUSSION_BYTES.saturating_sub(header_len);
@@ -525,7 +487,6 @@ pub fn extract_discussion_document(
content.push_str("\n--- Thread ---\n\n");
content.push_str(&thread_result.content);
// Use first note's created_at and last note's created_at for timestamps
let created_at = notes[0].created_at;
let updated_at = notes.last().map(|n| n.created_at).unwrap_or(created_at);
@@ -545,7 +506,7 @@ pub fn extract_discussion_document(
created_at,
updated_at,
url,
title: None, // Discussions don't have their own title
title: None,
content_text: content,
content_hash,
is_truncated: thread_result.is_truncated,
@@ -580,7 +541,7 @@ mod tests {
Some(SourceType::Discussion)
);
assert_eq!(SourceType::parse("invalid"), None);
assert_eq!(SourceType::parse("ISSUE"), Some(SourceType::Issue)); // case insensitive
assert_eq!(SourceType::parse("ISSUE"), Some(SourceType::Issue));
}
#[test]
@@ -603,8 +564,7 @@ mod tests {
let hash2 = compute_content_hash("hello");
assert_eq!(hash1, hash2);
assert!(!hash1.is_empty());
// SHA-256 of "hello" is known
assert_eq!(hash1.len(), 64); // 256 bits = 64 hex chars
assert_eq!(hash1.len(), 64);
}
#[test]
@@ -631,12 +591,10 @@ mod tests {
fn test_list_hash_empty() {
let hash = compute_list_hash(&[]);
assert_eq!(hash.len(), 64);
// Empty list hashes consistently
let hash2 = compute_list_hash(&[]);
assert_eq!(hash, hash2);
}
// Helper to create an in-memory DB with the required tables for extraction tests
fn setup_test_db() -> Connection {
let conn = Connection::open_in_memory().unwrap();
conn.execute_batch(
@@ -685,7 +643,6 @@ mod tests {
)
.unwrap();
// Insert a test project
conn.execute(
"INSERT INTO projects (id, gitlab_project_id, path_with_namespace, web_url) VALUES (1, 100, 'group/project-one', 'https://gitlab.example.com/group/project-one')",
[],
@@ -871,12 +828,9 @@ mod tests {
insert_issue(&conn, 1, 10, Some("Test"), Some(""), "opened", None, None);
let doc = extract_issue_document(&conn, 1).unwrap().unwrap();
// Empty string description still includes the section header
assert!(doc.content_text.contains("--- Description ---\n\n"));
}
// --- MR extraction tests ---
fn setup_mr_test_db() -> Connection {
let conn = setup_test_db();
conn.execute_batch(
@@ -1067,10 +1021,8 @@ mod tests {
assert!(!doc.content_text.contains("Source:"));
}
// --- Discussion extraction tests ---
fn setup_discussion_test_db() -> Connection {
let conn = setup_mr_test_db(); // includes projects, issues schema, labels, mr tables
let conn = setup_mr_test_db();
conn.execute_batch(
"
CREATE TABLE discussions (
@@ -1166,7 +1118,6 @@ mod tests {
link_issue_label(&conn, 1, 1);
link_issue_label(&conn, 1, 2);
insert_discussion(&conn, 1, "Issue", Some(1), None);
// 1710460800000 = 2024-03-15T00:00:00Z
insert_note(
&conn,
1,
@@ -1213,7 +1164,7 @@ mod tests {
.contains("@janedoe (2024-03-15):\nAgreed. What about refresh token strategy?")
);
assert_eq!(doc.author_username, Some("johndoe".to_string()));
assert!(doc.title.is_none()); // Discussions don't have their own title
assert!(doc.title.is_none());
}
#[test]
@@ -1226,7 +1177,6 @@ mod tests {
#[test]
fn test_discussion_parent_deleted() {
let conn = setup_discussion_test_db();
// Insert issue, create discussion, then delete the issue
insert_issue(
&conn,
99,
@@ -1250,8 +1200,6 @@ mod tests {
None,
None,
);
// Delete the parent issue — FK cascade won't delete discussion in test since
// we used REFERENCES without ON DELETE CASCADE in test schema, so just delete from issues
conn.execute("PRAGMA foreign_keys = OFF", []).unwrap();
conn.execute("DELETE FROM issues WHERE id = 99", [])
.unwrap();
@@ -1358,7 +1306,6 @@ mod tests {
);
let doc = extract_discussion_document(&conn, 1).unwrap().unwrap();
// Paths should be deduplicated and sorted
assert_eq!(doc.paths, vec!["src/new.rs", "src/old.rs"]);
assert!(
doc.content_text
@@ -1498,7 +1445,6 @@ mod tests {
None,
);
// All notes are system notes -> no content -> returns None
let result = extract_discussion_document(&conn, 1).unwrap();
assert!(result.is_none());
}