Files
gitlore/src/documents/extractor/mrs.rs
teernisse fe7d210988 feat(embedding): strip GitLab boilerplate from titles before embedding
GitLab auto-generates MR titles like "Draft: Resolve \"Issue Title\""
when creating MRs from issues. This 4-token boilerplate prefix dominated
the embedding vectors, causing unrelated MRs with the same title structure
to appear as highly similar in "lore related" results (0.667 similarity
vs 0.674 for the actual parent issue — a difference of only 0.007).

Add normalize_title_for_embedding() which deterministically strips:
- "Draft: " prefix (case-insensitive)
- "WIP: " prefix (case-insensitive)
- "Resolve \"...\"" wrapper (extracts inner title)
- Combinations: "Draft: Resolve \"...\""

The normalization is applied in all four document extractors (issues, MRs,
discussions, notes) to the content_text field only. DocumentData.title
preserves the original title for human-readable display in CLI output.

Since content_text changes, content_hash will differ from stored values,
triggering automatic re-embedding on the next "lore embed" run.

Uses str::get() for all byte-offset slicing to prevent panics on titles
containing emoji or other multi-byte UTF-8 characters.

15 new tests covering: all boilerplate patterns, case insensitivity,
edge cases (empty inner text, no-op for normal titles), UTF-8 safety,
and end-to-end document extraction with boilerplate titles.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 17:07:23 -04:00

121 lines
4.1 KiB
Rust

/// Extract an embeddable document for a single merge request.
///
/// Returns `Ok(None)` when no MR with the given id exists. The
/// human-readable title is stored verbatim in `DocumentData.title`,
/// while the text destined for embedding (`content_text`) runs the
/// title through `normalize_title_for_embedding` to strip GitLab
/// boilerplate such as `Draft: Resolve "..."` wrappers.
pub fn extract_mr_document(conn: &Connection, mr_id: i64) -> Result<Option<DocumentData>> {
    // Fetch the MR row joined with its owning project in one query.
    let fetched = conn.query_row(
        "SELECT m.id, m.iid, m.title, m.description, m.state, m.author_username,
m.source_branch, m.target_branch,
m.created_at, m.updated_at, m.web_url,
p.path_with_namespace, p.id AS project_id
FROM merge_requests m
JOIN projects p ON p.id = m.project_id
WHERE m.id = ?1",
        rusqlite::params![mr_id],
        |row| {
            Ok((
                row.get::<_, i64>(0)?,
                row.get::<_, i64>(1)?,
                row.get::<_, Option<String>>(2)?,
                row.get::<_, Option<String>>(3)?,
                row.get::<_, Option<String>>(4)?,
                row.get::<_, Option<String>>(5)?,
                row.get::<_, Option<String>>(6)?,
                row.get::<_, Option<String>>(7)?,
                row.get::<_, Option<i64>>(8)?,
                row.get::<_, Option<i64>>(9)?,
                row.get::<_, Option<String>>(10)?,
                row.get::<_, String>(11)?,
                row.get::<_, i64>(12)?,
            ))
        },
    );
    // An absent row is a normal outcome, not an error.
    let (
        mr_pk,
        mr_iid,
        raw_title,
        description,
        state,
        author_username,
        src_branch,
        tgt_branch,
        created_ts,
        updated_ts,
        url,
        project_path,
        project_pk,
    ) = match fetched {
        Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
        other => other?,
    };
    // Collect label names (sorted by name for deterministic hashing below).
    let mut label_stmt = conn.prepare_cached(
        "SELECT l.name FROM mr_labels ml
JOIN labels l ON l.id = ml.label_id
WHERE ml.merge_request_id = ?1
ORDER BY l.name",
    )?;
    let labels = label_stmt
        .query_map(rusqlite::params![mr_pk], |row| row.get::<_, String>(0))?
        .collect::<std::result::Result<Vec<String>, _>>()?;
    let labels_as_json =
        serde_json::to_string(&labels).unwrap_or_else(|_| String::from("[]"));
    let shown_title = raw_title.as_deref().unwrap_or("(untitled)");
    // Boilerplate-stripped title goes only into the embedded text.
    let normalized_title = normalize_title_for_embedding(shown_title);
    let shown_state = state.as_deref().unwrap_or("unknown");
    // Assemble the document body line by line; writes into a String
    // cannot fail, so the write results are deliberately discarded.
    let mut body = String::new();
    let _ = writeln!(body, "[[MergeRequest]] !{}: {}", mr_iid, normalized_title);
    let _ = writeln!(body, "Project: {}", project_path);
    if let Some(u) = &url {
        let _ = writeln!(body, "URL: {}", u);
    }
    let _ = writeln!(body, "Labels: {}", labels_as_json);
    let _ = writeln!(body, "State: {}", shown_state);
    if let Some(a) = &author_username {
        let _ = writeln!(body, "Author: @{}", a);
    }
    if let (Some(src), Some(tgt)) = (&src_branch, &tgt_branch) {
        let _ = writeln!(body, "Source: {} -> {}", src, tgt);
    }
    if let Some(desc) = &description {
        body.push_str("\n--- Description ---\n\n");
        // Pre-truncate to avoid unbounded memory allocation for huge descriptions
        let pre_trunc = pre_truncate_description(desc, MAX_DOCUMENT_BYTES_HARD);
        if pre_trunc.was_truncated {
            warn!(
                iid = mr_iid,
                original_bytes = pre_trunc.original_bytes,
                "MR description truncated (oversized)"
            );
        }
        body.push_str(&pre_trunc.content);
    }
    // Hashes feed change detection: a differing content_hash triggers
    // re-embedding on the next embed run.
    let capped = truncate_hard_cap(&body);
    let content_hash = compute_content_hash(&capped.content);
    let labels_hash = compute_list_hash(&labels);
    let paths_hash = compute_list_hash(&[]);
    Ok(Some(DocumentData {
        source_type: SourceType::MergeRequest,
        source_id: mr_pk,
        project_id: project_pk,
        author_username,
        labels,
        paths: Vec::new(),
        labels_hash,
        paths_hash,
        created_at: created_ts.unwrap_or(0),
        updated_at: updated_ts.unwrap_or(0),
        url,
        title: Some(shown_title.to_string()),
        content_text: capped.content,
        content_hash,
        is_truncated: capped.is_truncated,
        truncated_reason: capped.reason.map(|r| r.as_str().to_string()),
    }))
}