Extract shared JSONL parsing helpers for parser parity

Introduce three shared helpers in session-parser.ts that both the full
parser and the lightweight metadata extractor can use:

- forEachJsonlLine(content, onLine): Iterates JSONL lines with consistent
  malformed-line handling. Skips invalid JSON lines identically to how
  parseSessionContent handles them. Returns parse error count for diagnostics.

- countMessagesForLine(parsed): Returns the number of messages a single
  JSONL line expands into, using the same classification rules as the
  full parser. User arrays expand tool_result and text blocks; assistant
  arrays expand thinking, text, and tool_use.

- classifyLine(parsed): Classifies a parsed line into one of 8 types
  (user, assistant, progress, file-history-snapshot, summary, system,
  queue-operation, unknown).

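parseSessionContent() now iterates via forEachJsonlLine() and the internal
extractMessages() function dispatches on classifyLine(), enabling the upcoming
metadata extraction service to reuse the same logic. Parsed output is unchanged
except for one detail: the line index passed to extractMessages() (used for
`generated-<n>` fallback UUIDs) is now the physical line number, so it differs
from before when blank lines precede a record. countMessagesForLine() mirrors
the full parser's expansion rules, and the parity tests listed below guard
against list counts drifting from detail-view counts in future parser changes.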

Test coverage includes:
- Malformed line handling parity with full parser
- Parse error counting for truncated/corrupted files
- countMessagesForLine output matches extractMessages().length
- Edge cases: empty files, progress events, array content expansion

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
teernisse
2026-02-28 00:50:53 -05:00
parent b69dffc398
commit c20652924d
2 changed files with 357 additions and 20 deletions

View File

@@ -28,7 +28,7 @@ interface ContentBlock {
content?: string | ContentBlock[]; content?: string | ContentBlock[];
} }
interface RawLine { export interface RawLine {
type?: string; type?: string;
uuid?: string; uuid?: string;
timestamp?: string; timestamp?: string;
@@ -43,6 +43,94 @@ interface RawLine {
subtype?: string; subtype?: string;
} }
/**
 * Category assigned to a parsed JSONL line by `classifyLine`.
 *
 * Every member except "unknown" corresponds to a literal `type` value (or,
 * for "user" / "assistant", a `message.role` value) that `classifyLine`
 * checks for. "unknown" is the fallback when none of those checks match.
 */
export type LineClassification =
| "user"
| "assistant"
| "progress"
| "file-history-snapshot"
| "summary"
| "system"
| "queue-operation"
| "unknown";
/**
 * Walks a JSONL document line by line and invokes `onLine` for every line
 * that parses to a JSON object.
 *
 * Skipping rules:
 * - Blank or whitespace-only lines are ignored and are NOT counted as errors.
 * - Lines that fail `JSON.parse` are skipped and counted in `parseErrors`.
 * - Lines that parse to something other than a plain object (`null`, a
 *   number, a string, a boolean, or an array) are also skipped and counted:
 *   they cannot be a `RawLine`, and handing `null` to a consumer would crash
 *   on its first property access.
 *
 * @param content Raw JSONL text. Lines are split on "\n"; surrounding
 *   whitespace (including a trailing "\r") is trimmed before parsing.
 * @param onLine Callback receiving the parsed object and the zero-based index
 *   of the physical line it came from. Blank and rejected lines still advance
 *   the index, so indices may have gaps.
 * @returns The number of non-blank lines that were skipped.
 */
export function forEachJsonlLine(
  content: string,
  onLine: (parsed: RawLine, lineIndex: number) => void
): { parseErrors: number } {
  let parseErrors = 0;

  for (const [lineIndex, rawLine] of content.split("\n").entries()) {
    const trimmed = rawLine.trim();
    if (!trimmed) continue;

    let value: unknown;
    try {
      value = JSON.parse(trimmed);
    } catch {
      parseErrors++;
      continue;
    }

    // JSON.parse accepts any JSON value; only objects are usable records.
    if (typeof value !== "object" || value === null || Array.isArray(value)) {
      parseErrors++;
      continue;
    }

    onLine(value as RawLine, lineIndex);
  }

  return { parseErrors };
}
/**
 * Maps a parsed JSONL line to its `LineClassification`.
 *
 * The explicit structural `type` values are checked first. After that a line
 * is "user" when either `type` or `message.role` says so, then "assistant"
 * by the same rule; the user check runs first, so it wins when the two
 * fields disagree. Anything else is "unknown".
 *
 * @param parsed A parsed JSONL record.
 * @returns The classification for the record.
 */
export function classifyLine(parsed: RawLine): LineClassification {
  const { type } = parsed;

  // Structural record kinds are identified by `type` alone.
  switch (type) {
    case "progress":
    case "file-history-snapshot":
    case "summary":
    case "system":
    case "queue-operation":
      return type;
  }

  // Conversation records: fall back to message.role when type does not match.
  const role = parsed.message?.role;
  if (type === "user" || role === "user") {
    return "user";
  }
  if (type === "assistant" || role === "assistant") {
    return "assistant";
  }
  return "unknown";
}
/**
 * Reports how many messages a single parsed JSONL line contributes.
 *
 * - "progress", "file-history-snapshot" and "summary" lines count as 1.
 * - "system", "queue-operation" and "unknown" lines count as 0.
 * - "user" / "assistant" lines depend on `message.content`: a string counts
 *   as 1, an array counts one per block whose `type` is recognised for that
 *   role, and anything else (including missing content) counts as 0.
 *
 * @param parsed A parsed JSONL record.
 * @returns The number of messages for this record.
 */
export function countMessagesForLine(parsed: RawLine): number {
  const kind = classifyLine(parsed);

  // Record kinds that always produce exactly one message.
  if (kind === "progress" || kind === "file-history-snapshot" || kind === "summary") {
    return 1;
  }
  // Everything that is not a conversation turn produces none.
  if (kind !== "user" && kind !== "assistant") {
    return 0;
  }

  const payload = parsed.message?.content;
  if (typeof payload === "string") {
    return 1;
  }
  if (!Array.isArray(payload)) {
    return 0;
  }

  // Block types that are counted for each role.
  const countedTypes =
    kind === "user" ? ["tool_result", "text"] : ["thinking", "text", "tool_use"];

  return payload.reduce(
    (total: number, block: ContentBlock) =>
      total + (block.type !== undefined && countedTypes.includes(block.type) ? 1 : 0),
    0
  );
}
export async function parseSession( export async function parseSession(
filePath: string filePath: string
): Promise<ParsedMessage[]> { ): Promise<ParsedMessage[]> {
@@ -58,31 +146,23 @@ export async function parseSession(
export function parseSessionContent(content: string): ParsedMessage[] { export function parseSessionContent(content: string): ParsedMessage[] {
const messages: ParsedMessage[] = []; const messages: ParsedMessage[] = [];
const lines = content.split("\n").filter((l) => l.trim());
for (let i = 0; i < lines.length; i++) { forEachJsonlLine(content, (parsed, lineIndex) => {
let parsed: RawLine; const extracted = extractMessages(parsed, lineIndex);
try {
parsed = JSON.parse(lines[i]);
} catch {
continue; // Skip malformed lines
}
const extracted = extractMessages(parsed, i);
messages.push(...extracted); messages.push(...extracted);
} });
return messages; return messages;
} }
function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] { function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] {
const messages: ParsedMessage[] = []; const messages: ParsedMessage[] = [];
const type = raw.type; const classification = classifyLine(raw);
const uuid = raw.uuid || `generated-${rawIndex}`; const uuid = raw.uuid || `generated-${rawIndex}`;
const timestamp = raw.timestamp; const timestamp = raw.timestamp;
// Progress/hook messages - content is in `data`, not `content` // Progress/hook messages - content is in `data`, not `content`
if (type === "progress") { if (classification === "progress") {
const data = raw.data; const data = raw.data;
const progressText = data const progressText = data
? formatProgressData(data) ? formatProgressData(data)
@@ -102,7 +182,7 @@ function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] {
} }
// File history snapshot // File history snapshot
if (type === "file-history-snapshot") { if (classification === "file-history-snapshot") {
messages.push({ messages.push({
uuid, uuid,
category: "file_snapshot", category: "file_snapshot",
@@ -114,7 +194,7 @@ function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] {
} }
// Summary message - text is in `summary` field, not `content` // Summary message - text is in `summary` field, not `content`
if (type === "summary") { if (classification === "summary") {
messages.push({ messages.push({
uuid, uuid,
category: "summary", category: "summary",
@@ -126,7 +206,7 @@ function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] {
} }
// System metadata (turn_duration etc.) - skip, not user-facing // System metadata (turn_duration etc.) - skip, not user-facing
if (type === "system" || type === "queue-operation") { if (classification === "system" || classification === "queue-operation") {
return messages; return messages;
} }
@@ -134,7 +214,7 @@ function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] {
const role = raw.message?.role; const role = raw.message?.role;
const content = raw.message?.content; const content = raw.message?.content;
if ((type === "user" || role === "user") && content !== undefined) { if (classification === "user" && content !== undefined) {
if (typeof content === "string") { if (typeof content === "string") {
const category = detectSystemReminder(content) const category = detectSystemReminder(content)
? "system_message" ? "system_message"
@@ -183,7 +263,7 @@ function extractMessages(raw: RawLine, rawIndex: number): ParsedMessage[] {
return messages; return messages;
} }
if ((type === "assistant" || role === "assistant") && content !== undefined) { if (classification === "assistant" && content !== undefined) {
if (typeof content === "string") { if (typeof content === "string") {
messages.push({ messages.push({
uuid, uuid,

View File

@@ -1,5 +1,11 @@
import { describe, it, expect } from "vitest"; import { describe, it, expect } from "vitest";
import { parseSessionContent } from "../../src/server/services/session-parser.js"; import {
parseSessionContent,
forEachJsonlLine,
classifyLine,
countMessagesForLine,
} from "../../src/server/services/session-parser.js";
import type { RawLine } from "../../src/server/services/session-parser.js";
import fs from "fs/promises"; import fs from "fs/promises";
import path from "path"; import path from "path";
@@ -319,4 +325,255 @@ describe("session-parser", () => {
const msgs = parseSessionContent(line); const msgs = parseSessionContent(line);
expect(msgs[0].progressSubtype).toBe("hook"); expect(msgs[0].progressSubtype).toBe("hook");
}); });
describe("forEachJsonlLine", () => {
  // Runs forEachJsonlLine once and captures everything the callback received.
  const collect = (content: string) => {
    const parsedLines: RawLine[] = [];
    const indices: number[] = [];
    const { parseErrors } = forEachJsonlLine(content, (parsed, lineIndex) => {
      parsedLines.push(parsed);
      indices.push(lineIndex);
    });
    return { parsedLines, indices, parseErrors };
  };

  it("skips malformed JSON lines and reports parseErrors count", () => {
    const validLine = JSON.stringify({
      type: "user",
      message: { role: "user", content: "Hello" },
    });
    const { parsedLines, parseErrors } = collect(
      ["not valid json", validLine, "{broken}"].join("\n")
    );

    expect(parsedLines).toHaveLength(1);
    expect(parseErrors).toBe(2);
  });

  it("skips empty and whitespace-only lines without incrementing parseErrors", () => {
    const summaryLine = JSON.stringify({ type: "summary", summary: "test" });
    const { parsedLines, parseErrors } = collect(
      ["", " ", summaryLine, "\t", ""].join("\n")
    );

    expect(parsedLines).toHaveLength(1);
    expect(parseErrors).toBe(0);
  });

  it("returns parseErrors 0 for empty content", () => {
    const { parsedLines, parseErrors } = collect("");

    expect(parsedLines).toHaveLength(0);
    expect(parseErrors).toBe(0);
  });

  it("processes content without trailing newline", () => {
    const { parsedLines } = collect(
      JSON.stringify({ type: "summary", summary: "no trailing newline" })
    );

    expect(parsedLines).toHaveLength(1);
    expect(parsedLines[0].summary).toBe("no trailing newline");
  });

  it("passes correct lineIndex to callback", () => {
    // The blank middle line is skipped but still occupies index 1.
    const { indices } = collect(
      [
        JSON.stringify({ type: "user", message: { role: "user", content: "first" } }),
        "",
        JSON.stringify({ type: "summary", summary: "third" }),
      ].join("\n")
    );

    expect(indices).toEqual([0, 2]);
  });
});
describe("classifyLine", () => {
  // Asserts classifyLine(input) === expected for every [input, expected] pair.
  const expectClassifications = (cases: Array<[RawLine, string]>) => {
    for (const [line, expected] of cases) {
      expect(classifyLine(line)).toBe(expected);
    }
  };

  it("returns correct classification for each type", () => {
    expectClassifications([
      [{ type: "progress" }, "progress"],
      [{ type: "file-history-snapshot" }, "file-history-snapshot"],
      [{ type: "summary" }, "summary"],
      [{ type: "system" }, "system"],
      [{ type: "queue-operation" }, "queue-operation"],
      [{ type: "user", message: { role: "user" } }, "user"],
      [{ type: "assistant", message: { role: "assistant" } }, "assistant"],
      [{}, "unknown"],
    ]);
  });

  it("classifies by message.role when type is missing", () => {
    expectClassifications([
      [{ message: { role: "user" } }, "user"],
      [{ message: { role: "assistant" } }, "assistant"],
    ]);
  });

  it("returns unknown for missing type and no role", () => {
    expectClassifications([
      [{ message: {} }, "unknown"],
      [{ uuid: "orphan" }, "unknown"],
    ]);
  });
});
describe("countMessagesForLine", () => {
  it("returns 1 for user string message", () => {
    const line: RawLine = {
      type: "user",
      message: { role: "user", content: "Hello" },
    };
    expect(countMessagesForLine(line)).toBe(1);
  });

  it("matches extractMessages length for user array with tool_result and text", () => {
    const line: RawLine = {
      type: "user",
      message: {
        role: "user",
        content: [
          { type: "tool_result", tool_use_id: "t1", content: "result" },
          { type: "text", text: "description" },
        ],
      },
      uuid: "u-arr",
    };
    // Parity check: the lightweight count must equal what the full parser emits.
    const msgs = parseSessionContent(JSON.stringify(line));
    expect(countMessagesForLine(line)).toBe(msgs.length);
    expect(countMessagesForLine(line)).toBe(2);
  });

  it("matches extractMessages length for assistant array with thinking/text/tool_use", () => {
    const line: RawLine = {
      type: "assistant",
      message: {
        role: "assistant",
        content: [
          { type: "thinking", thinking: "hmm" },
          { type: "text", text: "response" },
          { type: "tool_use", name: "Read", input: { file_path: "/x" } },
        ],
      },
      uuid: "a-arr",
    };
    const msgs = parseSessionContent(JSON.stringify(line));
    expect(countMessagesForLine(line)).toBe(msgs.length);
    expect(countMessagesForLine(line)).toBe(3);
  });

  it("returns 1 for progress/file-history-snapshot/summary", () => {
    expect(countMessagesForLine({ type: "progress", data: { type: "hook" } })).toBe(1);
    expect(countMessagesForLine({ type: "file-history-snapshot", snapshot: {} })).toBe(1);
    expect(countMessagesForLine({ type: "summary", summary: "test" })).toBe(1);
  });

  it("returns 0 for system/queue-operation", () => {
    expect(countMessagesForLine({ type: "system", subtype: "turn_duration" })).toBe(0);
    expect(countMessagesForLine({ type: "queue-operation" })).toBe(0);
  });

  it("returns 0 for unknown type", () => {
    expect(countMessagesForLine({})).toBe(0);
    expect(countMessagesForLine({ type: "something-new" })).toBe(0);
  });

  it("returns 0 for user message with empty content array", () => {
    const line: RawLine = {
      type: "user",
      message: { role: "user", content: [] },
    };
    expect(countMessagesForLine(line)).toBe(0);
  });

  it("returns 0 for user message with undefined content", () => {
    const line: RawLine = {
      type: "user",
      message: { role: "user" },
    };
    expect(countMessagesForLine(line)).toBe(0);
  });

  it("only counts known block types in assistant arrays", () => {
    const line: RawLine = {
      type: "assistant",
      message: {
        role: "assistant",
        content: [
          { type: "thinking", thinking: "hmm" },
          { type: "unknown_block" },
          { type: "text", text: "hi" },
        ],
      },
    };
    expect(countMessagesForLine(line)).toBe(2);
  });

  it("returns 1 for assistant string content", () => {
    const line: RawLine = {
      type: "assistant",
      message: { role: "assistant", content: "direct string" },
    };
    expect(countMessagesForLine(line)).toBe(1);
  });

  it("counts user text with system-reminder as 1 (reclassified but still counted)", () => {
    const line: RawLine = {
      type: "user",
      message: { role: "user", content: "<system-reminder>Some reminder</system-reminder>" },
      uuid: "u-sr-parity",
    };
    const msgs = parseSessionContent(JSON.stringify(line));
    expect(countMessagesForLine(line)).toBe(msgs.length);
    expect(countMessagesForLine(line)).toBe(1);
  });

  it("handles truncated JSON (crash mid-write)", () => {
    // Second line is cut off mid-string, as if the writer crashed.
    const content = [
      JSON.stringify({ type: "user", message: { role: "user", content: "ok" }, uuid: "u-ok" }),
      '{"type":"assistant","message":{"role":"assistant","content":[{"type":"text","text":"trun',
    ].join("\n");

    const lines: RawLine[] = [];
    let countSum = 0;
    const result = forEachJsonlLine(content, (parsed) => {
      lines.push(parsed);
      countSum += countMessagesForLine(parsed);
    });

    expect(lines).toHaveLength(1);
    expect(result.parseErrors).toBe(1);
    // The truncated line must contribute nothing to the count, and the count
    // must still agree with the full parser on the same corrupted input.
    expect(countSum).toBe(1);
    expect(countSum).toBe(parseSessionContent(content).length);
  });
});
describe("parser parity: fixture integration", () => {
  // Each fixture gets its own test case; the check is identical for all of them.
  for (const fixtureName of ["sample-session.jsonl", "edge-cases.jsonl"]) {
    it(`countMessagesForLine sum matches parseSessionContent on ${fixtureName}`, async () => {
      const content = await fs.readFile(
        path.join(__dirname, "../fixtures/" + fixtureName),
        "utf-8"
      );

      // Total from the lightweight per-line counter.
      let lightweightTotal = 0;
      forEachJsonlLine(content, (parsed) => {
        lightweightTotal += countMessagesForLine(parsed);
      });

      // Must equal the number of messages produced by the full parser.
      expect(lightweightTotal).toBe(parseSessionContent(content).length);
    });
  }
});
}); });