From 887cc9c4147395d05d0ed46441cef3a4a8c0697a Mon Sep 17 00:00:00 2001 From: aktasbatuhan Date: Mon, 8 Jun 2026 15:42:14 +0100 Subject: [PATCH 1/5] feat(viewer): general kai run viewer (findings + trace) Lift the rollout viewer out of the benchmark harness into a core kai.viewer package so any pipeline run can be inspected, not just benchmark rollouts. - trace.py: the causal trace loader, moved verbatim from evaluation/trace_viewer.py - findings.py: load a run's exploits.json into a Finding view-model, reusing kai.cvss to derive severity and expand the CVSS vector - html.py: a single self-contained, data-first (Tufte) page with two tabs -- Findings (severity dot + score + 0-10 bar, CVSS breakdown, PoC, +/- patch diff) and Trace (the restyled causal spine); XSS-safe via textContent - __main__.py: python -m kai.viewer [-o OUT] [--open] - evaluation/trace_viewer.py: thin shim re-exporting from kai.viewer so cli view / index keep working Reads exploits.json + rollouts/*.jsonl straight off disk; no live state backend needed. --- evaluation/trace_viewer.py | 665 +------------------------------------ src/kai/viewer/__init__.py | 22 ++ src/kai/viewer/__main__.py | 52 +++ src/kai/viewer/findings.py | 191 +++++++++++ src/kai/viewer/html.py | 373 +++++++++++++++++++++ src/kai/viewer/trace.py | 353 ++++++++++++++++++++ tests/test_viewer.py | 118 +++++++ 7 files changed, 1118 insertions(+), 656 deletions(-) create mode 100644 src/kai/viewer/__init__.py create mode 100644 src/kai/viewer/__main__.py create mode 100644 src/kai/viewer/findings.py create mode 100644 src/kai/viewer/html.py create mode 100644 src/kai/viewer/trace.py create mode 100644 tests/test_viewer.py diff --git a/evaluation/trace_viewer.py b/evaluation/trace_viewer.py index d12f189f..569c6947 100644 --- a/evaluation/trace_viewer.py +++ b/evaluation/trace_viewer.py @@ -1,662 +1,15 @@ -"""Self-contained HTML viewer for RLM rollout traces. +"""Compatibility shim — the rollout viewer now lives in :mod:`kai.viewer`. -Reads a rollout directory (the per-agent ``.jsonl`` files an RLM run -writes via :mod:`kai.state.hooks`, plus the optional ``score.json`` / -``run.json`` siblings) and renders a single offline HTML file. - -The view follows **causality, not wall-clock**. The root agent (``exploit``) -is an orchestrator: it reasons, then runs Python, and that Python calls -``spawn_analyzer(...)`` / ``spawn_researcher(...)`` / ``spawn_verifier(...)`` -etc. to delegate a subtask. The sub-agent runs to completion *inside* that -code call and its ``final_answer`` comes back as the call's return value -- -which is why a naive timestamp sort is misleading: the parent iteration is -stamped when it *finishes*, i.e. after the child it spawned has already run, -so the child appears to precede its own cause. - -So we read the root top-to-bottom by iteration number -- reason -> run code --> observe output -- and attach each spawned sub-agent's full sub-transcript -under the exact ``spawn_*()`` call that caused it (matched per agent in call -order), with the value it returned surfaced at the call site. You can expand -a delegation to see *how* the sub-agent reached its answer. - -No external dependencies, no server, no spans -- just the rollouts on disk. -Pulled smoke dirs are flat (``*.jsonl`` next to ``score.json``); a fresh run -nests them under ``state//rollouts/``. Both work: we glob for -``*.jsonl`` and skip any file whose lines aren't valid JSON (empty files, or -``cat: ... No such file`` stubs from a partial ``railway ssh`` pull). +The viewer was lifted out of the benchmark harness into the core package so +``kai view`` can render any pipeline run (findings + agent trace), not just +benchmark rollouts. This module is kept so ``evaluation`` keeps importing +``load_rollout_dir`` / ``render_html`` / ``write_html`` from here; new code +should import from :mod:`kai.viewer` directly. """ from __future__ import annotations -import json -import re -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any - -ROOT_AGENT = "exploit" -SPAWN_RE = re.compile(r"\bspawn_([a-z][a-z_]*)\s*\(") - -# Per-agent tints, assigned in first-appearance order. -PALETTE = [ - "#7fdbca", - "#c792ea", - "#f78c6c", - "#82aaff", - "#ffcb6b", - "#f07178", - "#addb67", - "#89ddff", -] - - -@dataclass -class Iteration: - """One reason -> act -> observe step of an agent.""" - - n: int - timestamp: str - reasoning: str - blocks: list[dict[str, str]] = field(default_factory=list) - - -@dataclass -class AgentTrace: - """A single (sub-)agent's rollout: its metadata + iterations + result.""" - - name: str - depth: int - model: str - backend: str - iterations: list[Iteration] - result: str | None - first_ts: str - color: str = "" - - def legend_dict(self) -> dict[str, Any]: - return { - "name": self.name, - "depth": self.depth, - "model": self.model, - "iters": len(self.iterations), - "color": self.color, - } - - -@dataclass -class RunTrace: - """A whole run: the causal root spine plus run-level header fields.""" - - title: str - benchmark: str - task_id: str - success: bool | None - failure_reason: str | None - poc_source: str | None - models: list[str] - agents: list[AgentTrace] - root_name: str - root_result: str | None - root_steps: list[dict[str, Any]] - unlinked: list[dict[str, Any]] - - def as_dict(self) -> dict[str, Any]: - return { - "title": self.title, - "benchmark": self.benchmark, - "task_id": self.task_id, - "success": self.success, - "failure_reason": self.failure_reason, - "poc_source": self.poc_source, - "models": self.models, - "legend": [a.legend_dict() for a in self.agents], - "root_name": self.root_name, - "root_result": self.root_result, - "root_steps": self.root_steps, - "unlinked": self.unlinked, - } - - -def _load_jsonl(path: Path) -> list[dict[str, Any]]: - """Parse a ``.jsonl`` file, skipping any line that isn't valid JSON. - - Pulled rollout dirs can contain empty files or a ``cat: ... No such - file`` stub where an agent never ran; those simply yield no records. - """ - - records: list[dict[str, Any]] = [] - try: - text = path.read_text(encoding="utf-8", errors="replace") - except OSError: - return records - for line in text.splitlines(): - line = line.strip() - if not line: - continue - try: - obj = json.loads(line) - except json.JSONDecodeError: - continue - if isinstance(obj, dict): - records.append(obj) - return records - - -def _agent_from_records( - fallback_name: str, records: list[dict[str, Any]] -) -> AgentTrace | None: - """Fold a file's records into one :class:`AgentTrace` (or ``None``).""" - - meta = next((r for r in records if r.get("type") == "metadata"), {}) - iters = [ - Iteration( - n=int(r.get("iteration", 0)), - timestamp=str(r.get("timestamp", "")), - reasoning=str(r.get("response", "")), - blocks=[b for b in (r.get("code_blocks") or []) if isinstance(b, dict)], - ) - for r in records - if r.get("type") == "iteration" - ] - if not iters and not meta: - return None - result_rec = next((r for r in records if r.get("type") == "result"), None) - result = str(result_rec.get("final_answer", "")) if result_rec is not None else None - first_ts = str(meta.get("timestamp", "")) or (iters[0].timestamp if iters else "") - return AgentTrace( - name=str(meta.get("agent") or fallback_name), - depth=int(meta.get("depth", 0)), - model=str(meta.get("model", "")), - backend=str(meta.get("backend", "")), - iterations=iters, - result=result, - first_ts=first_ts, - ) - - -def _read_json(path: Path) -> dict[str, Any]: - if not path.exists(): - return {} - try: - obj = json.loads(path.read_text(encoding="utf-8", errors="replace")) - except (OSError, json.JSONDecodeError): - return {} - return obj if isinstance(obj, dict) else {} - - -def _spawn_sessions(records: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Split a sub-agent's records into one entry per spawn, time-ordered. - - The root re-invokes a sub-agent many times; each invocation is a distinct - ``spawn_id`` whose iteration counter restarts at 1. One session == one - delegation the root can match a ``spawn_*()`` call to. - """ - - order: list[str] = [] - sess: dict[str, dict[str, Any]] = {} - for r in records: - sid = str(r.get("spawn_id", "")) - kind = r.get("type") - if kind == "iteration": - if sid not in sess: - sess[sid] = { - "first_ts": str(r.get("timestamp", "")), - "returned": None, - "iters": [], - } - order.append(sid) - sess[sid]["iters"].append( - { - "iter": int(r.get("iteration", 0)), - "ts": str(r.get("timestamp", "")), - "reasoning": str(r.get("response", "")), - "blocks": [ - b for b in (r.get("code_blocks") or []) if isinstance(b, dict) - ], - } - ) - elif kind == "result" and sid in sess: - sess[sid]["returned"] = str(r.get("final_answer", "")) - out = [sess[s] for s in order] - out.sort(key=lambda s: s["first_ts"]) - return out - - -def _child(name: str, color: str, session: dict[str, Any] | None) -> dict[str, Any]: - if session is None: - return {"agent": name, "color": color, "missing": True, "iters": []} - return { - "agent": name, - "color": color, - "returned": session.get("returned"), - "iters": session["iters"], - } - - -def _build_root_spine( - root: AgentTrace, - sessions_by_agent: dict[str, list[dict[str, Any]]], - color_of: dict[str, str], -) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Walk the root's iterations and hang each spawned sub-agent under the - ``spawn_*()`` call that produced it (FIFO per agent name). - - Returns ``(root_steps, unlinked_children)``. ``unlinked`` holds sub-agent - sessions we couldn't tie to a call (count mismatch) so nothing is lost. - """ - - cursor = {name: 0 for name in sessions_by_agent if name != root.name} - steps: list[dict[str, Any]] = [] - for it in root.iterations: - code = "\n".join(b.get("code", "") for b in it.blocks) - children: list[dict[str, Any]] = [] - for name in SPAWN_RE.findall(code): - sessions = sessions_by_agent.get(name) - session = None - if sessions is not None and cursor.get(name, 0) < len(sessions): - session = sessions[cursor[name]] - cursor[name] += 1 - children.append(_child(name, color_of.get(name, "#8a99ad"), session)) - steps.append( - { - "iter": it.n, - "ts": it.timestamp, - "reasoning": it.reasoning, - "blocks": it.blocks, - "delegated": [c["agent"] for c in children], - "children": children, - } - ) - - unlinked: list[dict[str, Any]] = [] - for name, sessions in sessions_by_agent.items(): - if name == root.name: - continue - for session in sessions[cursor.get(name, 0) :]: - unlinked.append(_child(name, color_of.get(name, "#8a99ad"), session)) - return steps, unlinked - +from kai.viewer.html import render_html, write_html +from kai.viewer.trace import load_rollout_dir -def load_rollout_dir(path: Path) -> RunTrace: - """Build a :class:`RunTrace` (root spine + causal nesting) from a dir.""" - - path = Path(path) - if not path.is_dir(): - raise NotADirectoryError(f"{path} is not a directory") - - agents: list[AgentTrace] = [] - records_by_agent: dict[str, list[dict[str, Any]]] = {} - for jf in sorted(path.rglob("*.jsonl")): - if jf.name == "status_updates.jsonl": - continue - records = _load_jsonl(jf) - agent = _agent_from_records(jf.stem, records) - if agent is not None and agent.iterations: - agents.append(agent) - records_by_agent[agent.name] = records - - agents.sort(key=lambda a: (a.depth, a.first_ts, a.name)) - color_of = {a.name: PALETTE[i % len(PALETTE)] for i, a in enumerate(agents)} - for a in agents: - a.color = color_of[a.name] - - root = _pick_root(agents) - sessions_by_agent = { - name: _spawn_sessions(records) for name, records in records_by_agent.items() - } - if root is not None: - root_steps, unlinked = _build_root_spine(root, sessions_by_agent, color_of) - else: - root_steps, unlinked = [], [] - - score = _read_json(path / "score.json") - details = score.get("details") or {} - task_ref = score.get("task_ref") or {} - run = _read_json(path / "run.json") - - benchmark = str(task_ref.get("benchmark") or _guess_benchmark(path.name)) - task_id = str(task_ref.get("task_id") or details.get("task_id") or path.name) - models = sorted({a.model for a in agents if a.model}) - if not models and run.get("root_model"): - models = [str(run["root_model"])] - - return RunTrace( - title=path.name, - benchmark=benchmark, - task_id=task_id, - success=score.get("success"), - failure_reason=score.get("failure_reason"), - poc_source=details.get("poc_source"), - models=models, - agents=agents, - root_name=root.name if root else "", - root_result=root.result if root else None, - root_steps=root_steps, - unlinked=unlinked, - ) - - -def _pick_root(agents: list[AgentTrace]) -> AgentTrace | None: - """The depth-0 orchestrator (prefer the conventional ``exploit``).""" - - if not agents: - return None - named = next((a for a in agents if a.name == ROOT_AGENT and a.depth == 0), None) - if named is not None: - return named - return min(agents, key=lambda a: (a.depth, a.first_ts)) - - -def _guess_benchmark(dir_name: str) -> str: - for known in ("cybergym", "bountybench", "evmbench", "noop"): - if dir_name.startswith(known): - return known - return "rollout" - - -def render_html(run: RunTrace) -> str: - """Render a self-contained HTML page (inline data + CSS + JS).""" - - # ``; escape it in the blob. - data_json = json.dumps(run.as_dict()).replace(" Path: - """Load ``rollout_dir`` and write ``trace.html`` (or ``out``).""" - - run = load_rollout_dir(rollout_dir) - target = out or (Path(rollout_dir) / "trace.html") - target.write_text(render_html(run), encoding="utf-8") - return target - - -_HTML_TEMPLATE = r""" - - - - -RLM rollout trace - - - -
-

Trace:

-
-
-
- -
-
- - - - -""" +__all__ = ["load_rollout_dir", "render_html", "write_html"] diff --git a/src/kai/viewer/__init__.py b/src/kai/viewer/__init__.py new file mode 100644 index 00000000..73288f4e --- /dev/null +++ b/src/kai/viewer/__init__.py @@ -0,0 +1,22 @@ +"""Self-contained HTML viewer for kai runs (findings + agent trace). + +Reads a run directory written by the pipeline -- ``exploits.json`` for the +security findings and ``rollouts/*.jsonl`` (or flat ``*.jsonl``) for the +agent trace -- and renders a single offline HTML file. No server, no +external requests, no live state backend required. +""" + +from __future__ import annotations + +from kai.viewer.findings import Finding, load_findings +from kai.viewer.html import render_html, write_html +from kai.viewer.trace import RunTrace, load_rollout_dir + +__all__ = [ + "Finding", + "RunTrace", + "load_findings", + "load_rollout_dir", + "render_html", + "write_html", +] diff --git a/src/kai/viewer/__main__.py b/src/kai/viewer/__main__.py new file mode 100644 index 00000000..8affaecf --- /dev/null +++ b/src/kai/viewer/__main__.py @@ -0,0 +1,52 @@ +"""CLI entry point: ``python -m kai.viewer [-o OUT] [--open]``. + +Renders a run directory into a single self-contained HTML file. This is the +implementation the ``kai view`` subcommand wraps; it also works standalone. +""" + +from __future__ import annotations + +import argparse +import sys +import webbrowser +from pathlib import Path + +from kai.viewer.html import write_html + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="python -m kai.viewer", + description="Render a kai run (findings + agent trace) to a single HTML file.", + ) + parser.add_argument( + "run_dir", + help="run directory (a state// dir with exploits.json and/or rollouts/)", + ) + parser.add_argument( + "-o", + "--output", + help="output HTML path (default: /trace.html)", + ) + parser.add_argument( + "--open", + action="store_true", + help="open the rendered file in a browser", + ) + args = parser.parse_args(argv) + + run_dir = Path(args.run_dir) + if not run_dir.is_dir(): + print(f"error: {run_dir} is not a directory", file=sys.stderr) + return 2 + + out = Path(args.output) if args.output else None + target = write_html(run_dir, out) + print(target) + if args.open: + webbrowser.open(target.resolve().as_uri()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/kai/viewer/findings.py b/src/kai/viewer/findings.py new file mode 100644 index 00000000..c0c86f5e --- /dev/null +++ b/src/kai/viewer/findings.py @@ -0,0 +1,191 @@ +"""Load security findings from a run's ``exploits.json``. + +A normal pipeline run persists its findings as a JSON array of +:class:`kai.state.models.ExploitRecord` dicts at +``//exploits.json``. This module folds those into the +flat :class:`Finding` view-model the HTML renderer draws, deriving display +helpers (a one-line title, a severity bucket, a human-readable CVSS vector) +without needing a live state backend. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from kai import cvss + +# Human-readable expansions for CVSS 3.1 vector codes, by metric. +_CVSS_LABELS: dict[str, dict[str, str]] = { + "AV": {"N": "Network", "A": "Adjacent", "L": "Local", "P": "Physical"}, + "AC": {"L": "Low", "H": "High"}, + "PR": {"N": "None", "L": "Low", "H": "High"}, + "UI": {"N": "None", "R": "Required"}, + "S": {"U": "Unchanged", "C": "Changed"}, + "C": {"H": "High", "L": "Low", "N": "None"}, + "I": {"H": "High", "L": "Low", "N": "None"}, + "A": {"H": "High", "L": "Low", "N": "None"}, +} +_CVSS_ORDER = ("AV", "AC", "PR", "UI", "S", "C", "I", "A") + +# Status / category ordering: confirmed, runtime-exploitable findings first. +_SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "none": 0} + + +@dataclass +class Finding: + """One vulnerability finding, flattened for display.""" + + exploit_id: str + title: str + hypothesis: str + exploit_sketch: str + file: str + function: str + category: str + status: str + confirmed: bool | None + severity: str + cvss_score: float | None + cvss_vector: str + cvss_rows: list[dict[str, str]] = field(default_factory=list) + poc_code: str = "" + patch: str = "" + attacker_role: str = "" + prerequisite: str = "" + adversarial_viability: str = "" + profit_model: str = "" + critic_summary: str = "" + + def as_dict(self) -> dict[str, Any]: + return { + "exploit_id": self.exploit_id, + "title": self.title, + "hypothesis": self.hypothesis, + "exploit_sketch": self.exploit_sketch, + "file": self.file, + "function": self.function, + "category": self.category, + "status": self.status, + "confirmed": self.confirmed, + "severity": self.severity, + "cvss_score": self.cvss_score, + "cvss_vector": self.cvss_vector, + "cvss_rows": self.cvss_rows, + "poc_code": self.poc_code, + "patch": self.patch, + "attacker_role": self.attacker_role, + "prerequisite": self.prerequisite, + "adversarial_viability": self.adversarial_viability, + "profit_model": self.profit_model, + "critic_summary": self.critic_summary, + } + + +def _title_of(record: dict[str, Any]) -> str: + """A one-line headline: the first sentence of the hypothesis, else a + `` in `` fallback.""" + + hypothesis = str(record.get("hypothesis") or "").strip() + if hypothesis: + first = hypothesis.replace("\n", " ").split(". ")[0].strip().rstrip(".") + return first[:120] + ("…" if len(first) > 120 else "") + category = str(record.get("category") or "finding").replace("_", " ") + fn = str(record.get("function") or "").strip() + return f"{category} in {fn}" if fn else category + + +def _cvss_rows(vector: str, justification: dict[str, str] | None) -> list[dict[str, str]]: + """Expand a CVSS vector into ordered ``{metric, value, why}`` rows.""" + + if not vector: + return [] + try: + metrics = cvss.parse_vector(vector) + except Exception: + return [] + justification = justification or {} + rows: list[dict[str, str]] = [] + for code in _CVSS_ORDER: + if code not in metrics: + continue + value = metrics[code] + rows.append( + { + "metric": code, + "value": _CVSS_LABELS.get(code, {}).get(value, value), + "why": str(justification.get(code, "")), + } + ) + return rows + + +def _severity_of(record: dict[str, Any]) -> str: + """The record's severity, lowercased; derived from the CVSS score when + the field is absent.""" + + severity = str(record.get("severity") or "").strip().lower() + if severity in _SEVERITY_RANK: + return severity + score = record.get("cvss_score") + if isinstance(score, (int, float)): + return cvss.score_to_severity(float(score)).lower() + return "none" + + +def _finding_from_record(record: dict[str, Any]) -> Finding: + return Finding( + exploit_id=str(record.get("exploit_id") or ""), + title=_title_of(record), + hypothesis=str(record.get("hypothesis") or ""), + exploit_sketch=str(record.get("exploit_sketch") or ""), + file=str(record.get("file") or ""), + function=str(record.get("function") or ""), + category=str(record.get("category") or ""), + status=str(record.get("status") or ""), + confirmed=record.get("confirmed"), + severity=_severity_of(record), + cvss_score=record.get("cvss_score"), + cvss_vector=str(record.get("cvss_vector") or ""), + cvss_rows=_cvss_rows( + str(record.get("cvss_vector") or ""), record.get("cvss_justification") + ), + poc_code=str(record.get("poc_code") or ""), + patch=str(record.get("patch") or ""), + attacker_role=str(record.get("attacker_role") or ""), + prerequisite=str(record.get("prerequisite") or record.get("required_privileges") or ""), + adversarial_viability=str(record.get("adversarial_viability") or ""), + profit_model=str(record.get("profit_model") or ""), + critic_summary=str(record.get("critic_summary") or ""), + ) + + +def _sort_key(f: Finding) -> tuple[int, float]: + """Confirmed findings first, then by descending CVSS score.""" + + confirmed = 1 if f.confirmed else 0 + score = f.cvss_score if isinstance(f.cvss_score, (int, float)) else -1.0 + return (confirmed, score) + + +def load_findings(run_dir: Path) -> list[Finding]: + """Read ``/exploits.json`` into sorted :class:`Finding` objects. + + Returns an empty list when the file is absent or unparseable (e.g. a + benchmark rollout dir, which carries ``score.json`` but no findings). + """ + + path = Path(run_dir) / "exploits.json" + if not path.exists(): + return [] + try: + data = json.loads(path.read_text(encoding="utf-8", errors="replace")) + except (OSError, json.JSONDecodeError): + return [] + if not isinstance(data, list): + return [] + findings = [_finding_from_record(r) for r in data if isinstance(r, dict)] + findings.sort(key=_sort_key, reverse=True) + return findings diff --git a/src/kai/viewer/html.py b/src/kai/viewer/html.py new file mode 100644 index 00000000..d8cc98a4 --- /dev/null +++ b/src/kai/viewer/html.py @@ -0,0 +1,373 @@ +"""Render a run into a single self-contained HTML page. + +One file, inline data + CSS + JS, no server and no external requests. The +page has two tabs -- **Findings** (the security report: severity, CVSS, PoC, +patch) and **Trace** (the causal agent spine) -- styled data-first (Tufte): +paper background, one accent colour, severity as a quiet dot + exact score + +a thin 0-10 bar, the patch as a +/- diff. Every dynamic value is written via +``textContent``/DOM nodes, never ``innerHTML``, so unsanitised rollout text +cannot inject markup. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from kai.viewer.findings import Finding, load_findings +from kai.viewer.trace import RunTrace, load_rollout_dir + + +def render_html(run: RunTrace, findings: list[Finding] | None = None) -> str: + """Render the full page from a loaded trace + findings list. + + ``findings`` defaults to empty (e.g. a benchmark rollout dir has a trace + but no ``exploits.json``); the Findings tab then shows an empty state. + """ + + findings = findings or [] + data = { + "title": run.title, + "benchmark": run.benchmark, + "task_id": run.task_id, + "models": run.models, + "run": run.as_dict(), + "findings": [f.as_dict() for f in findings], + } + # ``; escape it in the blob. + blob = json.dumps(data).replace(" Path: + """Load ``run_dir`` (trace + findings) and write a single HTML file. + + Defaults to ``/trace.html`` so existing callers that link to + that name keep working. + """ + + run = load_rollout_dir(run_dir) + findings = load_findings(run_dir) + target = out or (Path(run_dir) / "trace.html") + target.write_text(render_html(run, findings), encoding="utf-8") + return target + + +_TEMPLATE = r""" + + + + +kai — run view + + + +
+

kai

+
+
+
+ +
+ +
+
+
+ +
CVSSFindingCategoryLocation
+
+
+
+ +
+
+
+ + + + + +""" diff --git a/src/kai/viewer/trace.py b/src/kai/viewer/trace.py new file mode 100644 index 00000000..4e676b76 --- /dev/null +++ b/src/kai/viewer/trace.py @@ -0,0 +1,353 @@ +"""Load RLM rollout traces from a run directory. + +Reads the per-agent ``.jsonl`` files an RLM run writes via +:mod:`kai.state.hooks` (plus the optional ``score.json`` / ``run.json`` +siblings) and folds them into a :class:`RunTrace` the HTML renderer can draw. + +The view follows **causality, not wall-clock**. The root agent (``exploit``) +is an orchestrator: it reasons, then runs Python, and that Python calls +``spawn_analyzer(...)`` / ``spawn_researcher(...)`` / ``spawn_verifier(...)`` +etc. to delegate a subtask. The sub-agent runs to completion *inside* that +code call and its ``final_answer`` comes back as the call's return value -- +which is why a naive timestamp sort is misleading: the parent iteration is +stamped when it *finishes*, i.e. after the child it spawned has already run, +so the child appears to precede its own cause. + +So we read the root top-to-bottom by iteration number -- reason -> run code +-> observe output -- and attach each spawned sub-agent's full sub-transcript +under the exact ``spawn_*()`` call that caused it (matched per agent in call +order), with the value it returned surfaced at the call site. + +No external dependencies, no server, no spans -- just the rollouts on disk. +Pulled smoke dirs are flat (``*.jsonl`` next to ``score.json``); a fresh run +nests them under ``state//rollouts/``. Both work: we glob for +``*.jsonl`` and skip any file whose lines aren't valid JSON (empty files, or +``cat: ... No such file`` stubs from a partial ``railway ssh`` pull). +""" + +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +ROOT_AGENT = "exploit" +SPAWN_RE = re.compile(r"\bspawn_([a-z][a-z_]*)\s*\(") + +# Per-agent tints, assigned in first-appearance order. +PALETTE = [ + "#7fdbca", + "#c792ea", + "#f78c6c", + "#82aaff", + "#ffcb6b", + "#f07178", + "#addb67", + "#89ddff", +] + + +@dataclass +class Iteration: + """One reason -> act -> observe step of an agent.""" + + n: int + timestamp: str + reasoning: str + blocks: list[dict[str, str]] = field(default_factory=list) + + +@dataclass +class AgentTrace: + """A single (sub-)agent's rollout: its metadata + iterations + result.""" + + name: str + depth: int + model: str + backend: str + iterations: list[Iteration] + result: str | None + first_ts: str + color: str = "" + + def legend_dict(self) -> dict[str, Any]: + return { + "name": self.name, + "depth": self.depth, + "model": self.model, + "iters": len(self.iterations), + "color": self.color, + } + + +@dataclass +class RunTrace: + """A whole run: the causal root spine plus run-level header fields.""" + + title: str + benchmark: str + task_id: str + success: bool | None + failure_reason: str | None + poc_source: str | None + models: list[str] + agents: list[AgentTrace] + root_name: str + root_result: str | None + root_steps: list[dict[str, Any]] + unlinked: list[dict[str, Any]] + + def as_dict(self) -> dict[str, Any]: + return { + "title": self.title, + "benchmark": self.benchmark, + "task_id": self.task_id, + "success": self.success, + "failure_reason": self.failure_reason, + "poc_source": self.poc_source, + "models": self.models, + "legend": [a.legend_dict() for a in self.agents], + "root_name": self.root_name, + "root_result": self.root_result, + "root_steps": self.root_steps, + "unlinked": self.unlinked, + } + + +def _load_jsonl(path: Path) -> list[dict[str, Any]]: + """Parse a ``.jsonl`` file, skipping any line that isn't valid JSON. + + Pulled rollout dirs can contain empty files or a ``cat: ... No such + file`` stub where an agent never ran; those simply yield no records. + """ + + records: list[dict[str, Any]] = [] + try: + text = path.read_text(encoding="utf-8", errors="replace") + except OSError: + return records + for line in text.splitlines(): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(obj, dict): + records.append(obj) + return records + + +def _agent_from_records( + fallback_name: str, records: list[dict[str, Any]] +) -> AgentTrace | None: + """Fold a file's records into one :class:`AgentTrace` (or ``None``).""" + + meta = next((r for r in records if r.get("type") == "metadata"), {}) + iters = [ + Iteration( + n=int(r.get("iteration", 0)), + timestamp=str(r.get("timestamp", "")), + reasoning=str(r.get("response", "")), + blocks=[b for b in (r.get("code_blocks") or []) if isinstance(b, dict)], + ) + for r in records + if r.get("type") == "iteration" + ] + if not iters and not meta: + return None + result_rec = next((r for r in records if r.get("type") == "result"), None) + result = str(result_rec.get("final_answer", "")) if result_rec is not None else None + first_ts = str(meta.get("timestamp", "")) or (iters[0].timestamp if iters else "") + return AgentTrace( + name=str(meta.get("agent") or fallback_name), + depth=int(meta.get("depth", 0)), + model=str(meta.get("model", "")), + backend=str(meta.get("backend", "")), + iterations=iters, + result=result, + first_ts=first_ts, + ) + + +def _read_json(path: Path) -> dict[str, Any]: + if not path.exists(): + return {} + try: + obj = json.loads(path.read_text(encoding="utf-8", errors="replace")) + except (OSError, json.JSONDecodeError): + return {} + return obj if isinstance(obj, dict) else {} + + +def _spawn_sessions(records: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Split a sub-agent's records into one entry per spawn, time-ordered. + + The root re-invokes a sub-agent many times; each invocation is a distinct + ``spawn_id`` whose iteration counter restarts at 1. One session == one + delegation the root can match a ``spawn_*()`` call to. + """ + + order: list[str] = [] + sess: dict[str, dict[str, Any]] = {} + for r in records: + sid = str(r.get("spawn_id", "")) + kind = r.get("type") + if kind == "iteration": + if sid not in sess: + sess[sid] = { + "first_ts": str(r.get("timestamp", "")), + "returned": None, + "iters": [], + } + order.append(sid) + sess[sid]["iters"].append( + { + "iter": int(r.get("iteration", 0)), + "ts": str(r.get("timestamp", "")), + "reasoning": str(r.get("response", "")), + "blocks": [ + b for b in (r.get("code_blocks") or []) if isinstance(b, dict) + ], + } + ) + elif kind == "result" and sid in sess: + sess[sid]["returned"] = str(r.get("final_answer", "")) + out = [sess[s] for s in order] + out.sort(key=lambda s: s["first_ts"]) + return out + + +def _child(name: str, color: str, session: dict[str, Any] | None) -> dict[str, Any]: + if session is None: + return {"agent": name, "color": color, "missing": True, "iters": []} + return { + "agent": name, + "color": color, + "returned": session.get("returned"), + "iters": session["iters"], + } + + +def _build_root_spine( + root: AgentTrace, + sessions_by_agent: dict[str, list[dict[str, Any]]], + color_of: dict[str, str], +) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Walk the root's iterations and hang each spawned sub-agent under the + ``spawn_*()`` call that produced it (FIFO per agent name). + + Returns ``(root_steps, unlinked_children)``. ``unlinked`` holds sub-agent + sessions we couldn't tie to a call (count mismatch) so nothing is lost. + """ + + cursor = {name: 0 for name in sessions_by_agent if name != root.name} + steps: list[dict[str, Any]] = [] + for it in root.iterations: + code = "\n".join(b.get("code", "") for b in it.blocks) + children: list[dict[str, Any]] = [] + for name in SPAWN_RE.findall(code): + sessions = sessions_by_agent.get(name) + session = None + if sessions is not None and cursor.get(name, 0) < len(sessions): + session = sessions[cursor[name]] + cursor[name] += 1 + children.append(_child(name, color_of.get(name, "#8a99ad"), session)) + steps.append( + { + "iter": it.n, + "ts": it.timestamp, + "reasoning": it.reasoning, + "blocks": it.blocks, + "delegated": [c["agent"] for c in children], + "children": children, + } + ) + + unlinked: list[dict[str, Any]] = [] + for name, sessions in sessions_by_agent.items(): + if name == root.name: + continue + for session in sessions[cursor.get(name, 0) :]: + unlinked.append(_child(name, color_of.get(name, "#8a99ad"), session)) + return steps, unlinked + + +def load_rollout_dir(path: Path) -> RunTrace: + """Build a :class:`RunTrace` (root spine + causal nesting) from a dir.""" + + path = Path(path) + if not path.is_dir(): + raise NotADirectoryError(f"{path} is not a directory") + + agents: list[AgentTrace] = [] + records_by_agent: dict[str, list[dict[str, Any]]] = {} + for jf in sorted(path.rglob("*.jsonl")): + if jf.name == "status_updates.jsonl": + continue + records = _load_jsonl(jf) + agent = _agent_from_records(jf.stem, records) + if agent is not None and agent.iterations: + agents.append(agent) + records_by_agent[agent.name] = records + + agents.sort(key=lambda a: (a.depth, a.first_ts, a.name)) + color_of = {a.name: PALETTE[i % len(PALETTE)] for i, a in enumerate(agents)} + for a in agents: + a.color = color_of[a.name] + + root = _pick_root(agents) + sessions_by_agent = { + name: _spawn_sessions(records) for name, records in records_by_agent.items() + } + if root is not None: + root_steps, unlinked = _build_root_spine(root, sessions_by_agent, color_of) + else: + root_steps, unlinked = [], [] + + score = _read_json(path / "score.json") + details = score.get("details") or {} + task_ref = score.get("task_ref") or {} + run = _read_json(path / "run.json") + + benchmark = str(task_ref.get("benchmark") or _guess_benchmark(path.name)) + task_id = str(task_ref.get("task_id") or details.get("task_id") or path.name) + models = sorted({a.model for a in agents if a.model}) + if not models and run.get("root_model"): + models = [str(run["root_model"])] + + return RunTrace( + title=path.name, + benchmark=benchmark, + task_id=task_id, + success=score.get("success"), + failure_reason=score.get("failure_reason"), + poc_source=details.get("poc_source"), + models=models, + agents=agents, + root_name=root.name if root else "", + root_result=root.result if root else None, + root_steps=root_steps, + unlinked=unlinked, + ) + + +def _pick_root(agents: list[AgentTrace]) -> AgentTrace | None: + """The depth-0 orchestrator (prefer the conventional ``exploit``).""" + + if not agents: + return None + named = next((a for a in agents if a.name == ROOT_AGENT and a.depth == 0), None) + if named is not None: + return named + return min(agents, key=lambda a: (a.depth, a.first_ts)) + + +def _guess_benchmark(dir_name: str) -> str: + for known in ("cybergym", "bountybench", "evmbench", "noop"): + if dir_name.startswith(known): + return known + return "rollout" diff --git a/tests/test_viewer.py b/tests/test_viewer.py new file mode 100644 index 00000000..9e8cd896 --- /dev/null +++ b/tests/test_viewer.py @@ -0,0 +1,118 @@ +"""Tests for the kai run viewer (findings loader + HTML render).""" + +from __future__ import annotations + +import json +from pathlib import Path + +from kai.viewer import load_findings, render_html, write_html +from kai.viewer.trace import RunTrace, load_rollout_dir + +_EXPLOITS = [ + { + "exploit_id": "e2", + "status": "rejected", + "confirmed": False, + "hypothesis": "Fee truncation rounds small trades to zero.", + "file": "contracts/Fees.sol", + "function": "calcFee", + "category": "theoretical_bounds", + "cvss_score": 4.3, + }, + { + "exploit_id": "e1", + "status": "verified", + "confirmed": True, + "hypothesis": ( + "Reentrancy in withdraw drains the vault. The external call " + "precedes the balance update and there is no guard." + ), + "file": "contracts/Vault.sol", + "function": "withdraw", + "category": "active_exploit", + "severity": "critical", + "cvss_score": 9.1, + "cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H", + "cvss_justification": {"AV": "remote attacker", "AC": "no special conditions"}, + "poc_code": "contract Attacker { function pwn() external {} }", + "patch": "- msg.sender.call{value: amount}(\"\");\n+ balances[msg.sender] -= amount;", + "attacker_role": "anyone", + "prerequisite": "a non-zero deposit", + }, +] + + +def _write_run(dir_path: Path) -> None: + (dir_path / "exploits.json").write_text(json.dumps(_EXPLOITS), encoding="utf-8") + rollouts = dir_path / "rollouts" + rollouts.mkdir() + exploit = [ + {"type": "metadata", "agent": "exploit", "depth": 0, "spawn_id": "r1", + "timestamp": "2026-06-03T00:00:00+00:00", "model": "anthropic/claude-opus-4.8"}, + {"type": "iteration", "agent": "exploit", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:01:00+00:00", + "response": "Analyzing the vault.", "code_blocks": []}, + {"type": "result", "agent": "exploit", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:02:00+00:00", "final_answer": "done"}, + ] + (rollouts / "exploit.jsonl").write_text( + "\n".join(json.dumps(r) for r in exploit), encoding="utf-8" + ) + + +def test_load_findings_sorts_and_derives(tmp_path: Path) -> None: + _write_run(tmp_path) + findings = load_findings(tmp_path) + + # Confirmed critical sorts ahead of the unconfirmed lower-severity finding. + assert [f.exploit_id for f in findings] == ["e1", "e2"] + e1, e2 = findings + assert e1.severity == "critical" + assert e1.title.startswith("Reentrancy in withdraw") + # Severity is derived from the CVSS score when the field is absent. + assert e2.severity == "medium" + # The CVSS vector is expanded into ordered, human-readable rows. + assert [r["metric"] for r in e1.cvss_rows] == ["AV", "AC", "PR", "UI", "S", "C", "I", "A"] + assert e1.cvss_rows[0] == {"metric": "AV", "value": "Network", "why": "remote attacker"} + + +def test_load_findings_missing_file_is_empty(tmp_path: Path) -> None: + assert load_findings(tmp_path) == [] + + +def test_render_is_self_contained_and_has_findings(tmp_path: Path) -> None: + _write_run(tmp_path) + html = render_html(load_rollout_dir(tmp_path), load_findings(tmp_path)) + + assert html.startswith("") + # Fully offline: no external resources. + assert "http://" not in html and "https://" not in html + for needle in ( + "Reentrancy in withdraw", + "contracts/Vault.sol", + "active_exploit", + "critical", + "Attacker", # poc_code + "balances[msg.sender]", # patch diff body + ): + assert needle in html + + +def test_render_without_findings_still_renders(tmp_path: Path) -> None: + # A benchmark-style dir: a trace but no exploits.json. + (tmp_path / "rollouts").mkdir() + run = RunTrace( + title="t", benchmark="rollout", task_id="t", success=None, + failure_reason=None, poc_source=None, models=[], agents=[], + root_name="", root_result=None, root_steps=[], unlinked=[], + ) + html = render_html(run) + assert html.startswith("") + assert "No findings recorded" in html or "view-findings" in html + + +def test_write_html_creates_file(tmp_path: Path) -> None: + _write_run(tmp_path) + out = write_html(tmp_path) + assert out == tmp_path / "trace.html" + assert "Reentrancy in withdraw" in out.read_text(encoding="utf-8") From ca33b4a4a80a5d4e8ead3fc4d6d7ef9f1a43d332 Mon Sep 17 00:00:00 2001 From: aktasbatuhan Date: Mon, 8 Jun 2026 16:15:28 +0100 Subject: [PATCH 2/5] refactor(viewer): extract shared Tufte design system + tighten finding titles - Move the design tokens + shared component CSS (table, severity dot/score/ bar, CVSS/detail blocks, code/diff/output panes) into kai/viewer/style.py. The interactive viewer now composes style.base_css() + its own layout, so the upcoming 'kai report --format html' document can reuse the exact same palette and primitives -- one design system, no drift. - Cap derived finding titles at ~64 chars on a word boundary so a long first hypothesis sentence stays a scannable headline instead of wrapping across table cells and section titles (improves both viewer and report). --- src/kai/viewer/findings.py | 6 +- src/kai/viewer/html.py | 157 ++++++++++++++----------------------- src/kai/viewer/style.py | 84 ++++++++++++++++++++ 3 files changed, 147 insertions(+), 100 deletions(-) create mode 100644 src/kai/viewer/style.py diff --git a/src/kai/viewer/findings.py b/src/kai/viewer/findings.py index c0c86f5e..02ee5dd3 100644 --- a/src/kai/viewer/findings.py +++ b/src/kai/viewer/findings.py @@ -91,7 +91,11 @@ def _title_of(record: dict[str, Any]) -> str: hypothesis = str(record.get("hypothesis") or "").strip() if hypothesis: first = hypothesis.replace("\n", " ").split(". ")[0].strip().rstrip(".") - return first[:120] + ("…" if len(first) > 120 else "") + # Cut at a word boundary so a long first sentence stays a scannable + # headline rather than wrapping across table cells / section titles. + if len(first) > 64: + first = first[:64].rsplit(" ", 1)[0] + "…" + return first category = str(record.get("category") or "finding").replace("_", " ") fn = str(record.get("function") or "").strip() return f"{category} in {fn}" if fn else category diff --git a/src/kai/viewer/html.py b/src/kai/viewer/html.py index d8cc98a4..11fe40f3 100644 --- a/src/kai/viewer/html.py +++ b/src/kai/viewer/html.py @@ -7,6 +7,10 @@ a thin 0-10 bar, the patch as a +/- diff. Every dynamic value is written via ``textContent``/DOM nodes, never ``innerHTML``, so unsanitised rollout text cannot inject markup. + +The palette and shared primitives come from :mod:`kai.viewer.style`, so this +interactive viewer and the static ``kai report --format html`` document share +one design system. """ from __future__ import annotations @@ -14,69 +18,14 @@ import json from pathlib import Path +from kai.viewer import style from kai.viewer.findings import Finding, load_findings from kai.viewer.trace import RunTrace, load_rollout_dir - -def render_html(run: RunTrace, findings: list[Finding] | None = None) -> str: - """Render the full page from a loaded trace + findings list. - - ``findings`` defaults to empty (e.g. a benchmark rollout dir has a trace - but no ``exploits.json``); the Findings tab then shows an empty state. - """ - - findings = findings or [] - data = { - "title": run.title, - "benchmark": run.benchmark, - "task_id": run.task_id, - "models": run.models, - "run": run.as_dict(), - "findings": [f.as_dict() for f in findings], - } - # ``; escape it in the blob. - blob = json.dumps(data).replace(" Path: - """Load ``run_dir`` (trace + findings) and write a single HTML file. - - Defaults to ``/trace.html`` so existing callers that link to - that name keep working. - """ - - run = load_rollout_dir(run_dir) - findings = load_findings(run_dir) - target = out or (Path(run_dir) / "trace.html") - target.write_text(render_html(run, findings), encoding="utf-8") - return target - - -_TEMPLATE = r""" - - - - -kai — run view - diff --git a/src/kai/viewer/style.py b/src/kai/viewer/style.py new file mode 100644 index 00000000..e1e0ca5a --- /dev/null +++ b/src/kai/viewer/style.py @@ -0,0 +1,84 @@ +"""Shared design system for kai's HTML surfaces. + +One palette, one set of primitives, used by both the interactive viewer +(:mod:`kai.viewer.html`) and the static report document (``kai report +--format html``). Each surface concatenates ``TOKENS + COMPONENTS`` with its +own layout CSS, so the look (colours, severity treatment, code/diff blocks) +can never drift between them. +""" + +from __future__ import annotations + +# Design tokens: the palette + the single accent. Dark theme overrides the +# same variables, so every component below is theme-aware for free. +TOKENS = """\ + :root { + --paper:#fafaf7; --panel:#fff; --ink:#1a1a1a; --rule:#e3dfd6; --rule-2:#d8d4cc; + --muted:#8a857c; --muted-2:#6b665d; --accent:#b3261e; --add:#2f6f43; --del:#9a2a22; + --gray-bar:#c8c2b5; --code-bg:#f4f1ea; + } + [data-theme="dark"] { + --paper:#14171b; --panel:#1b1f25; --ink:#e7e3da; --rule:#2a3038; --rule-2:#343b44; + --muted:#9aa3ad; --muted-2:#7f8893; --accent:#e5675d; --add:#7ec99a; --del:#e79a92; + --gray-bar:#3a424c; --code-bg:#11151b; + } +""" + +# Shared component primitives: base type, the findings table, the severity +# encoding (dot + score + 0-10 bar), the key/value + CVSS detail blocks, and +# code / diff / output panes. +COMPONENTS = """\ + * { box-sizing: border-box; } + html, body { margin: 0; } + body { background: var(--paper); color: var(--ink); + font: 14px/1.55 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; } + .serif { font-family: Charter, "Iowan Old Style", Georgia, serif; } + code, pre, .mono { font-family: ui-monospace, "SF Mono", Menlo, Consolas, monospace; } + + table { border-collapse: collapse; width: 100%; } + thead th { text-align: left; font-size: 10px; letter-spacing: .07em; text-transform: uppercase; + color: var(--muted-2); font-weight: 600; padding: 10px 14px 8px; border-bottom: 1px solid var(--rule-2); } + thead th.num { text-align: right; } + tbody tr { border-bottom: 1px solid var(--rule); } + td { padding: 11px 14px; vertical-align: top; } + td.cvss { white-space: nowrap; } + + .dot { display:inline-block; width:8px; height:8px; border-radius:50%; vertical-align: middle; margin-right: 7px; background: var(--gray-bar); } + .sev-critical .dot, .sev-high .dot { background: var(--accent); } + .sev-medium .dot { background: var(--muted-2); } + .score { font-family: ui-monospace, monospace; font-weight: 600; font-size: 13px; } + .bar { display:block; height: 3px; width: 64px; background: var(--gray-bar); margin-top: 6px; border-radius: 2px; } + .bar > i { display:block; height: 100%; background: var(--muted-2); border-radius: 2px; } + .sev-critical .bar > i, .sev-high .bar > i { background: var(--accent); } + .ftitle { font-weight: 600; } + .cat { font-size: 11px; color: var(--muted-2); } + .loc { font-size: 12px; color: var(--muted); } + .unconf { opacity: .62; } + + .kv { display: grid; grid-template-columns: 130px 1fr; gap: 5px 14px; font-size: 13px; margin: 0; } + .kv dt { color: var(--muted-2); } + .kv dd { margin: 0; } + .sec-label { font-size: 11px; letter-spacing: .07em; text-transform: uppercase; color: var(--muted-2); + margin: 18px 0 8px; border-top: 1px solid var(--rule); padding-top: 12px; } + .prose { white-space: pre-wrap; margin: 0; } + .cvss-grid { display: grid; grid-template-columns: max-content max-content 1fr; gap: 5px 14px; + font-size: 12.5px; align-items: baseline; } + .cvss-grid .m { color: var(--muted-2); font-family: ui-monospace, monospace; } + .cvss-grid .v { font-weight: 500; } + .cvss-grid .why { color: var(--muted); font-size: 12px; } + .vector { font-size: 12px; color: var(--muted); margin: 0 0 10px; } + + pre.code, pre.diff, pre.output { margin: 0 0 4px; padding: 11px 13px; border: 1px solid var(--rule-2); + border-radius: 6px; background: var(--code-bg); overflow: auto; font-size: 12.5px; line-height: 1.5; } + pre.code, pre.diff { white-space: pre; } + pre.output { white-space: pre-wrap; color: var(--muted-2); max-height: 320px; } + pre.diff .add { color: var(--add); } + pre.diff .del { color: var(--del); } + .empty { color: var(--muted); padding: 40px 22px; } +""" + + +def base_css() -> str: + """The shared stylesheet: tokens + component primitives.""" + + return TOKENS + COMPONENTS From 9e6fbdd50e2c09ac1ccaaedf55a3e8dbdbd227b4 Mon Sep 17 00:00:00 2001 From: aktasbatuhan Date: Mon, 8 Jun 2026 18:44:02 +0100 Subject: [PATCH 3/5] fix(viewer): hide deduplicated findings from report/view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A finished run's exploits.json keeps merged-away duplicate hypotheses as 'deduplicated' shells (no severity/PoC). They were surfacing as empty 'none'-severity rows and inflating the finding count (a real run showed '10 findings · 9 none'). load_findings now drops them, leaving the confirmed and rejected findings that actually matter. --- src/kai/viewer/findings.py | 11 ++++++++++- tests/test_viewer.py | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/kai/viewer/findings.py b/src/kai/viewer/findings.py index 02ee5dd3..c80ef1b0 100644 --- a/src/kai/viewer/findings.py +++ b/src/kai/viewer/findings.py @@ -33,6 +33,11 @@ # Status / category ordering: confirmed, runtime-exploitable findings first. _SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "none": 0} +# Internal bookkeeping records that aren't user-facing findings: when the +# pipeline merges duplicate hypotheses it keeps the merged-away ones as +# ``deduplicated`` shells (no severity/PoC). The report and viewer hide them. +_HIDDEN_STATUSES = {"deduplicated"} + @dataclass class Finding: @@ -190,6 +195,10 @@ def load_findings(run_dir: Path) -> list[Finding]: return [] if not isinstance(data, list): return [] - findings = [_finding_from_record(r) for r in data if isinstance(r, dict)] + findings = [ + _finding_from_record(r) + for r in data + if isinstance(r, dict) and r.get("status") not in _HIDDEN_STATUSES + ] findings.sort(key=_sort_key, reverse=True) return findings diff --git a/tests/test_viewer.py b/tests/test_viewer.py index 9e8cd896..c3f92b87 100644 --- a/tests/test_viewer.py +++ b/tests/test_viewer.py @@ -60,6 +60,21 @@ def _write_run(dir_path: Path) -> None: ) +def test_load_findings_drops_deduplicated(tmp_path: Path) -> None: + records = [ + {"exploit_id": "keep", "status": "verified_and_fixed", "confirmed": True, + "hypothesis": "real bug", "file": "A.sol", "function": "f", + "category": "active_exploit", "severity": "high", "cvss_score": 8.0}, + {"exploit_id": "dup", "status": "deduplicated", "confirmed": None, + "hypothesis": "merged duplicate", "file": "A.sol", "function": "f", + "category": "active_exploit"}, + ] + (tmp_path / "exploits.json").write_text(json.dumps(records), encoding="utf-8") + findings = load_findings(tmp_path) + # The deduplicated bookkeeping shell is hidden; the real finding remains. + assert [f.exploit_id for f in findings] == ["keep"] + + def test_load_findings_sorts_and_derives(tmp_path: Path) -> None: _write_run(tmp_path) findings = load_findings(tmp_path) From 267121c2dc593571d116729a8bd428e8ab6e5c25 Mon Sep 17 00:00:00 2001 From: aktasbatuhan Date: Tue, 9 Jun 2026 14:36:21 +0100 Subject: [PATCH 4/5] refactor(viewer): move the reusable run viewer into ra/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: the trace viewer and design system are framework-level — any agent built on ra produces rollouts and can render them — so they belong in ra, not kai. - ra/viewer/: trace.py + style.py (moved) + html.py, a self-contained, panel-composed viewer with a built-in causal Trace panel and a reusable render_page(data, panels) composer. 'python -m ra.viewer ' renders any ra run's trace standalone. - kai/viewer/: keeps the security domain layer — findings.py (CVSS/exploits) and a Findings panel — and composes it onto ra's Trace panel via ra.viewer.render_page. Findings stay in kai because they're domain concepts; moving them to ra would invert the dependency (ra must not import kai). evaluation shim and tests updated to import the trace loader from ra.viewer. No behaviour change to 'kai view' (still Findings + Trace, same design). --- evaluation/trace_viewer.py | 3 +- src/kai/viewer/__init__.py | 3 +- src/kai/viewer/html.py | 299 +++++++------------------------- src/ra/viewer/__init__.py | 29 ++++ src/ra/viewer/__main__.py | 47 +++++ src/ra/viewer/html.py | 288 ++++++++++++++++++++++++++++++ src/{kai => ra}/viewer/style.py | 0 src/{kai => ra}/viewer/trace.py | 0 tests/test_ra_viewer.py | 59 +++++++ tests/test_viewer.py | 3 +- 10 files changed, 493 insertions(+), 238 deletions(-) create mode 100644 src/ra/viewer/__init__.py create mode 100644 src/ra/viewer/__main__.py create mode 100644 src/ra/viewer/html.py rename src/{kai => ra}/viewer/style.py (100%) rename src/{kai => ra}/viewer/trace.py (100%) create mode 100644 tests/test_ra_viewer.py diff --git a/evaluation/trace_viewer.py b/evaluation/trace_viewer.py index 569c6947..c509023d 100644 --- a/evaluation/trace_viewer.py +++ b/evaluation/trace_viewer.py @@ -9,7 +9,8 @@ from __future__ import annotations +from ra.viewer.trace import load_rollout_dir + from kai.viewer.html import render_html, write_html -from kai.viewer.trace import load_rollout_dir __all__ = ["load_rollout_dir", "render_html", "write_html"] diff --git a/src/kai/viewer/__init__.py b/src/kai/viewer/__init__.py index 73288f4e..508a1756 100644 --- a/src/kai/viewer/__init__.py +++ b/src/kai/viewer/__init__.py @@ -8,9 +8,10 @@ from __future__ import annotations +from ra.viewer.trace import RunTrace, load_rollout_dir + from kai.viewer.findings import Finding, load_findings from kai.viewer.html import render_html, write_html -from kai.viewer.trace import RunTrace, load_rollout_dir __all__ = [ "Finding", diff --git a/src/kai/viewer/html.py b/src/kai/viewer/html.py index 11fe40f3..9fabb195 100644 --- a/src/kai/viewer/html.py +++ b/src/kai/viewer/html.py @@ -1,141 +1,38 @@ -"""Render a run into a single self-contained HTML page. +"""Render a kai run as a single self-contained HTML page. -One file, inline data + CSS + JS, no server and no external requests. The -page has two tabs -- **Findings** (the security report: severity, CVSS, PoC, -patch) and **Trace** (the causal agent spine) -- styled data-first (Tufte): -paper background, one accent colour, severity as a quiet dot + exact score + -a thin 0-10 bar, the patch as a +/- diff. Every dynamic value is written via -``textContent``/DOM nodes, never ``innerHTML``, so unsanitised rollout text -cannot inject markup. +Composes kai's security **Findings** panel (severity, CVSS, PoC, patch) onto +the reusable viewer in :mod:`ra.viewer` — which supplies the tabbed shell, the +shared design system, and the built-in **Trace** panel. Findings stay here +because they're domain concepts (CVSS, exploits); the trace viewer and styling +live in ``ra`` so any ra agent can reuse them. -The palette and shared primitives come from :mod:`kai.viewer.style`, so this -interactive viewer and the static ``kai report --format html`` document share -one design system. +Every dynamic value is written via ``textContent`` / DOM nodes, so unsanitised +rollout text cannot inject markup. """ from __future__ import annotations -import json from pathlib import Path -from kai.viewer import style -from kai.viewer.findings import Finding, load_findings -from kai.viewer.trace import RunTrace, load_rollout_dir +from ra.viewer.html import Panel, render_page, trace_panel +from ra.viewer.trace import RunTrace, load_rollout_dir -# Viewer-only layout: the chrome (header/tabs/toggle), the master-detail -# split, interactive table rows, and the trace spine. Shared primitives -# (tokens, table, severity, code blocks) live in kai.viewer.style. -_VIEWER_LAYOUT = """\ - header { display: flex; align-items: baseline; gap: 18px; flex-wrap: wrap; - padding: 12px 22px; border-bottom: 1px solid var(--rule-2); - position: sticky; top: 0; background: var(--paper); z-index: 5; } - header h1 { margin: 0; font-size: 16px; font-weight: 600; } - header h1 .sub { color: var(--muted); font-weight: 400; } - .facts { display: flex; gap: 16px; font-size: 12px; color: var(--muted-2); } - .facts b { color: var(--ink); font-weight: 600; } - .facts .crit b { color: var(--accent); } - .spacer { flex: 1 1 auto; } - .tabs { display: flex; gap: 2px; } - .tab { border: 0; background: none; color: var(--muted-2); cursor: pointer; - font: inherit; font-size: 13px; padding: 4px 10px; border-bottom: 2px solid transparent; } - .tab.active { color: var(--ink); border-bottom-color: var(--accent); } - .toggle { border: 1px solid var(--rule-2); background: none; color: var(--muted-2); - border-radius: 5px; cursor: pointer; font-size: 12px; padding: 3px 8px; } +from kai.viewer.findings import Finding, load_findings - .view { display: none; } - .view.active { display: block; } +# Findings-panel layout: the master-detail split + interactive table rows + +# the detail pane. Shared tokens/primitives come from ra.viewer.style. +_FINDINGS_CSS = """\ .split { display: grid; grid-template-columns: minmax(360px, 1fr) minmax(420px, 1.3fr); } @media (max-width: 880px) { .split { grid-template-columns: 1fr; } } - tbody tr { cursor: pointer; } tbody tr:hover { background: color-mix(in srgb, var(--accent) 5%, transparent); } tbody tr.sel { background: color-mix(in srgb, var(--accent) 9%, transparent); } - .detail { border-left: 1px solid var(--rule-2); padding: 18px 22px; min-width: 0; } .detail h2 { margin: 0 0 4px; font-size: 18px; font-weight: 600; line-height: 1.3; } .detail .where { font-size: 12.5px; color: var(--muted); margin-bottom: 16px; } - - .trace { padding: 14px 22px; max-width: 920px; } - .legend { display: flex; flex-wrap: wrap; gap: 12px; font-size: 12px; color: var(--muted-2); - margin-bottom: 16px; padding-bottom: 12px; border-bottom: 1px solid var(--rule); } - .legend .a { display: inline-flex; align-items: center; gap: 6px; } - .legend .sw { width: 9px; height: 9px; border-radius: 2px; display: inline-block; } - .step { border-left: 2px solid var(--rule-2); padding: 0 0 2px 14px; margin: 0 0 16px; } - .step .h { display: flex; gap: 10px; align-items: baseline; font-size: 12px; color: var(--muted-2); margin-bottom: 5px; } - .step .h .who { color: var(--ink); font-weight: 600; } - .step .h .deleg { color: var(--accent); } - .step .h .ts { margin-left: auto; } - .reason { white-space: pre-wrap; margin: 0 0 8px; } - details.spawn { border-left: 2px dashed var(--rule-2); padding-left: 12px; margin: 4px 0 10px; } - details.spawn > summary { cursor: pointer; font-size: 12.5px; color: var(--muted-2); padding: 3px 0; } - details.spawn > summary .who { color: var(--ink); font-weight: 600; } - details.spawn[open] > summary .ret { display: none; } - .ret-box { border: 1px solid var(--rule-2); background: var(--code-bg); border-radius: 6px; - padding: 8px 10px; margin: 4px 0 9px; white-space: pre-wrap; font-size: 12.5px; max-height: 220px; overflow: auto; } - .childit { padding-left: 10px; border-left: 1px solid var(--rule); margin-bottom: 10px; } - .childhead { font-size: 11px; color: var(--muted-2); margin: 9px 0 4px; } - .missing { color: var(--del); font-size: 12px; padding: 4px 0; } - .sec { font-size: 12px; color: var(--muted-2); margin: 26px 0 10px; border-top: 1px solid var(--rule); padding-top: 12px; } - .result { border: 1px solid var(--rule-2); border-radius: 8px; padding: 11px; background: var(--panel); white-space: pre-wrap; margin: 2px 0 0; } """ -_VIEWER_CSS = style.base_css() + _VIEWER_LAYOUT - - -def render_html(run: RunTrace, findings: list[Finding] | None = None) -> str: - """Render the full page from a loaded trace + findings list. - - ``findings`` defaults to empty (e.g. a benchmark rollout dir has a trace - but no ``exploits.json``); the Findings tab then shows an empty state. - """ - - findings = findings or [] - data = { - "title": run.title, - "benchmark": run.benchmark, - "task_id": run.task_id, - "models": run.models, - "run": run.as_dict(), - "findings": [f.as_dict() for f in findings], - } - # ``; escape it in the blob. - blob = json.dumps(data).replace(" Path: - """Load ``run_dir`` (trace + findings) and write a single HTML file. - - Defaults to ``/trace.html`` so existing callers that link to - that name keep working. - """ - - run = load_rollout_dir(run_dir) - findings = load_findings(run_dir) - target = out or (Path(run_dir) / "trace.html") - target.write_text(render_html(run, findings), encoding="utf-8") - return target - - -_TEMPLATE = r""" - - - - -kai — run view - - - -
-

kai

-
-
-
- -
- +_FINDINGS_SECTION = """\
@@ -143,28 +40,11 @@ def write_html(run_dir: Path, out: Path | None = None) -> Path:
-
- -
-
-
+""" - - - - -""" +def render_html(run: RunTrace, findings: list[Finding] | None = None) -> str: + """Render the full kai page (Findings + Trace) from a trace + findings list. + + ``findings`` defaults to empty (e.g. a benchmark rollout dir has a trace but + no ``exploits.json``); the Findings tab then shows an empty state and the + Trace tab opens first. + """ + + findings = findings or [] + data = { + "title": run.title, + "benchmark": run.benchmark, + "task_id": run.task_id, + "models": run.models, + "run": run.as_dict(), + "findings": [f.as_dict() for f in findings], + } + default_view = "findings" if findings else "trace" + return render_page( + data, [_findings_panel(), trace_panel()], brand="kai", default_view=default_view + ) + + +def write_html(run_dir: Path, out: Path | None = None) -> Path: + """Load ``run_dir`` (trace + findings) and write a single HTML file. + + Defaults to ``/trace.html`` so existing callers that link to that + name keep working. + """ + + run = load_rollout_dir(run_dir) + findings = load_findings(run_dir) + target = out or (Path(run_dir) / "trace.html") + target.write_text(render_html(run, findings), encoding="utf-8") + return target diff --git a/src/ra/viewer/__init__.py b/src/ra/viewer/__init__.py new file mode 100644 index 00000000..a38cc784 --- /dev/null +++ b/src/ra/viewer/__init__.py @@ -0,0 +1,29 @@ +"""Reusable HTML viewer for ``ra`` agent runs. + +Renders a run directory's ``*.jsonl`` rollouts into a single offline HTML page +— a tabbed shell with a built-in causal **Trace** panel and a shared design +system (:mod:`ra.viewer.style`). Domain layers compose extra panels on top via +:func:`ra.viewer.html.render_page`; kai-security, for example, adds a security +**Findings** panel. +""" + +from __future__ import annotations + +from ra.viewer.html import ( + Panel, + render_page, + render_trace_html, + trace_panel, + write_trace_html, +) +from ra.viewer.trace import RunTrace, load_rollout_dir + +__all__ = [ + "Panel", + "RunTrace", + "load_rollout_dir", + "render_page", + "render_trace_html", + "trace_panel", + "write_trace_html", +] diff --git a/src/ra/viewer/__main__.py b/src/ra/viewer/__main__.py new file mode 100644 index 00000000..09c72f28 --- /dev/null +++ b/src/ra/viewer/__main__.py @@ -0,0 +1,47 @@ +"""CLI entry point: ``python -m ra.viewer [-o OUT] [--open]``. + +Renders any ``ra`` run's agent trace into a single self-contained HTML file. +Domain tools (e.g. ``kai view``) wrap a richer page on top of this. +""" + +from __future__ import annotations + +import argparse +import sys +import webbrowser +from pathlib import Path + +from ra.viewer.html import write_trace_html + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="python -m ra.viewer", + description="Render an ra run's agent trace to a single HTML file.", + ) + parser.add_argument( + "run_dir", + help="run directory (a dir with *.jsonl rollouts, or state//rollouts/)", + ) + parser.add_argument( + "-o", "--output", help="output HTML path (default: /trace.html)" + ) + parser.add_argument( + "--open", action="store_true", help="open the rendered file in a browser" + ) + args = parser.parse_args(argv) + + run_dir = Path(args.run_dir) + if not run_dir.is_dir(): + print(f"error: {run_dir} is not a directory", file=sys.stderr) + return 2 + + target = write_trace_html(run_dir, Path(args.output) if args.output else None) + print(target) + if args.open: + webbrowser.open(target.resolve().as_uri()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/ra/viewer/html.py b/src/ra/viewer/html.py new file mode 100644 index 00000000..3c3a6f3e --- /dev/null +++ b/src/ra/viewer/html.py @@ -0,0 +1,288 @@ +"""Reusable, self-contained HTML viewer for any ``ra`` agent run. + +This is framework-level: any agent built on ``ra`` writes per-agent +``*.jsonl`` rollouts, and this module renders them into a single offline HTML +page (no server, no external requests). The page is built from **panels** — a +tabbed shell plus one or more views — so a domain layer can add its own panel +(e.g. kai adds a security **Findings** panel) on top of the built-in +**Trace** panel. + +Every dynamic value is written via ``textContent`` / DOM nodes, never +``innerHTML``, so unsanitised rollout text cannot inject markup. The palette +and shared primitives come from :mod:`ra.viewer.style`. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from ra.viewer import style +from ra.viewer.trace import RunTrace, load_rollout_dir + + +@dataclass(frozen=True) +class Panel: + """One tab in the viewer. + + ``js`` defines a render function over the embedded ``DATA``/``RUN`` globals + (and the shared ``el()`` helper); ``render_call`` invokes it at init. + """ + + id: str + label: str + section: str # the
block + css: str + js: str + render_call: str + + +# Shell chrome only (header / tabs / theme toggle / view switching). Panel- and +# domain-specific styling lives on each Panel; shared tokens + primitives come +# from style.base_css(). +_SHELL_CSS = """\ + header { display: flex; align-items: baseline; gap: 18px; flex-wrap: wrap; + padding: 12px 22px; border-bottom: 1px solid var(--rule-2); + position: sticky; top: 0; background: var(--paper); z-index: 5; } + header h1 { margin: 0; font-size: 16px; font-weight: 600; } + header h1 .sub { color: var(--muted); font-weight: 400; } + .facts { display: flex; gap: 16px; font-size: 12px; color: var(--muted-2); } + .facts b { color: var(--ink); font-weight: 600; } + .facts .crit b { color: var(--accent); } + .spacer { flex: 1 1 auto; } + .tabs { display: flex; gap: 2px; } + .tab { border: 0; background: none; color: var(--muted-2); cursor: pointer; + font: inherit; font-size: 13px; padding: 4px 10px; border-bottom: 2px solid transparent; } + .tab.active { color: var(--ink); border-bottom-color: var(--accent); } + .toggle { border: 1px solid var(--rule-2); background: none; color: var(--muted-2); + border-radius: 5px; cursor: pointer; font-size: 12px; padding: 3px 8px; } + .view { display: none; } + .view.active { display: block; } +""" + +# --------------------------------------------------------------------------- +# Built-in Trace panel: the causal agent spine. +# --------------------------------------------------------------------------- +_TRACE_CSS = """\ + .trace { padding: 14px 22px; max-width: 920px; } + .legend { display: flex; flex-wrap: wrap; gap: 12px; font-size: 12px; color: var(--muted-2); + margin-bottom: 16px; padding-bottom: 12px; border-bottom: 1px solid var(--rule); } + .legend .a { display: inline-flex; align-items: center; gap: 6px; } + .legend .sw { width: 9px; height: 9px; border-radius: 2px; display: inline-block; } + .step { border-left: 2px solid var(--rule-2); padding: 0 0 2px 14px; margin: 0 0 16px; } + .step .h { display: flex; gap: 10px; align-items: baseline; font-size: 12px; color: var(--muted-2); margin-bottom: 5px; } + .step .h .who { color: var(--ink); font-weight: 600; } + .step .h .deleg { color: var(--accent); } + .step .h .ts { margin-left: auto; } + .reason { white-space: pre-wrap; margin: 0 0 8px; } + details.spawn { border-left: 2px dashed var(--rule-2); padding-left: 12px; margin: 4px 0 10px; } + details.spawn > summary { cursor: pointer; font-size: 12.5px; color: var(--muted-2); padding: 3px 0; } + details.spawn > summary .who { color: var(--ink); font-weight: 600; } + details.spawn[open] > summary .ret { display: none; } + .ret-box { border: 1px solid var(--rule-2); background: var(--code-bg); border-radius: 6px; + padding: 8px 10px; margin: 4px 0 9px; white-space: pre-wrap; font-size: 12.5px; max-height: 220px; overflow: auto; } + .childit { padding-left: 10px; border-left: 1px solid var(--rule); margin-bottom: 10px; } + .childhead { font-size: 11px; color: var(--muted-2); margin: 9px 0 4px; } + .missing { color: var(--del); font-size: 12px; padding: 4px 0; } + .sec { font-size: 12px; color: var(--muted-2); margin: 26px 0 10px; border-top: 1px solid var(--rule); padding-top: 12px; } + .result { border: 1px solid var(--rule-2); border-radius: 8px; padding: 11px; background: var(--panel); white-space: pre-wrap; margin: 2px 0 0; } +""" + +_TRACE_SECTION = """\ +
+
+
""" + +_TRACE_JS = r""" +function head(s) { return (s || "").replace(/\s+/g, " ").trim().slice(0, 130); } +function proseNode(text) { + const parts = String(text || "").split("```"); + const prose = parts.filter((_, i) => i % 2 === 0).join("\n") + .replace(/
/g, "").replace(/\n{3,}/g, "\n\n").trim(); + return prose ? el("div", "reason", prose) : null; +} +function childNode(child) { + const det = el("details", "spawn"); + det.dataset.agent = child.agent; + if (child.color) det.style.borderLeftColor = child.color; + const sum = el("summary"); + const who = el("span", "who", "⤷ spawned " + child.agent); + if (child.color) who.style.color = child.color; + sum.append(who); + if (child.missing) { + sum.append(el("span", "ret", " — no rollout captured")); + det.append(sum, el("div", "missing", "(sub-agent file absent or empty)")); + return det; + } + sum.append(el("span", "ret", child.returned ? " — returned: " + head(child.returned) : " — (no return value recorded)")); + det.append(sum); + if (child.returned) det.append(el("div", "ret-box", child.returned)); + (child.iters || []).forEach(it => { + const wrap = el("div", "childit"); + wrap.append(el("div", "childhead", child.agent + " · iter " + it.iter)); + const p = proseNode(it.reasoning); if (p) wrap.append(p); + (it.blocks || []).forEach(b => { + if (b.code && b.code.trim()) wrap.append(el("pre", "code", b.code)); + if (b.output && b.output.trim()) wrap.append(el("pre", "output", b.output)); + }); + det.append(wrap); + }); + return det; +} +function stepNode(step) { + const wrap = el("div", "step"); + const h = el("div", "h"); + h.append(el("span", "who", RUN.root_name), el("span", null, "#" + step.iter)); + if (step.delegated && step.delegated.length) h.append(el("span", "deleg", "⤷ " + step.delegated.join(", "))); + h.append(el("span", "ts", (step.ts || "").replace("T", " ").slice(0, 19))); + wrap.append(h); + const p = proseNode(step.reasoning); if (p) wrap.append(p); + (step.blocks || []).forEach(b => { if (b.code && b.code.trim()) wrap.append(el("pre", "code", b.code)); }); + (step.children || []).forEach(c => wrap.append(childNode(c))); + (step.blocks || []).forEach(b => { if (b.output && b.output.trim()) wrap.append(el("pre", "output", b.output)); }); + return wrap; +} +function renderTrace() { + const t = document.getElementById("trace"); + if (!RUN.root_steps || !RUN.root_steps.length) { + t.replaceChildren(el("div", "empty", "No agent rollouts found for this run.")); + return; + } + const legend = el("div", "legend"); + (RUN.legend || []).forEach(a => { + const span = el("span", "a"); + const sw = el("span", "sw"); sw.style.background = a.color; span.append(sw); + span.append(el("span", null, a.name + " · d" + a.depth + " · " + a.iters + " it")); + legend.append(span); + }); + const nodes = [legend]; + RUN.root_steps.forEach(s => nodes.push(stepNode(s))); + if (RUN.root_result) { nodes.push(el("div", "sec", RUN.root_name + " — final answer")); nodes.push(el("div", "result", RUN.root_result)); } + (RUN.unlinked || []).forEach(c => nodes.push(childNode(c))); + t.replaceChildren(...nodes); +} +""" + + +def trace_panel() -> Panel: + """The built-in causal-trace panel, reusable by any ``ra`` agent.""" + + return Panel("trace", "Trace", _TRACE_SECTION, _TRACE_CSS, _TRACE_JS, "renderTrace();") + + +_SHELL = r""" + + + + +__TITLE__ + + + +
+

__BRAND__

+
+
+
+ +
+ +__SECTIONS__ + + + + + +""" + + +def render_page( + data: dict, + panels: list[Panel], + *, + brand: str = "ra", + default_view: str | None = None, +) -> str: + """Assemble a self-contained page from ``data`` + an ordered list of panels. + + ``data`` is embedded as JSON (the panels' JS reads it via the ``DATA`` / + ``RUN`` globals). ``default_view`` is the panel id shown first; it defaults + to the first panel. + """ + + blob = json.dumps(data).replace(" + css = style.base_css() + _SHELL_CSS + "".join(p.css for p in panels) + sections = "\n".join(p.section for p in panels) + panel_js = "\n".join(p.js for p in panels) + render_calls = "\n ".join(p.render_call for p in panels) + panels_meta = json.dumps([{"id": p.id, "label": p.label} for p in panels]) + default = json.dumps(default_view or (panels[0].id if panels else "")) + return ( + _SHELL.replace("__TITLE__", f"{brand} — run view") + .replace("__BRAND__", brand) + .replace("__STYLE__", css) + .replace("__SECTIONS__", sections) + .replace("__PANELS_META__", panels_meta) + .replace("__PANEL_JS__", panel_js) + .replace("__RENDER_CALLS__", render_calls) + .replace("__DEFAULT_VIEW__", default) + .replace("__DATA__", blob) + ) + + +def render_trace_html(run: RunTrace, *, brand: str = "ra") -> str: + """Render a run's causal agent trace as a standalone single-page viewer.""" + + data = { + "title": run.title, + "task_id": run.task_id, + "models": run.models, + "run": run.as_dict(), + } + return render_page(data, [trace_panel()], brand=brand, default_view="trace") + + +def write_trace_html(run_dir: Path, out: Path | None = None) -> Path: + """Load ``run_dir`` and write the standalone trace viewer to ``out``.""" + + run = load_rollout_dir(run_dir) + target = out or (Path(run_dir) / "trace.html") + target.write_text(render_trace_html(run), encoding="utf-8") + return target diff --git a/src/kai/viewer/style.py b/src/ra/viewer/style.py similarity index 100% rename from src/kai/viewer/style.py rename to src/ra/viewer/style.py diff --git a/src/kai/viewer/trace.py b/src/ra/viewer/trace.py similarity index 100% rename from src/kai/viewer/trace.py rename to src/ra/viewer/trace.py diff --git a/tests/test_ra_viewer.py b/tests/test_ra_viewer.py new file mode 100644 index 00000000..90e7bfb1 --- /dev/null +++ b/tests/test_ra_viewer.py @@ -0,0 +1,59 @@ +"""Tests for the reusable ra.viewer (framework-level trace viewer + composer). + +These exercise the viewer with NO kai/findings involvement, proving any ra +agent can render its run trace. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from ra.viewer import Panel, load_rollout_dir, render_page, render_trace_html +from ra.viewer.trace import RunTrace + + +def _write_rollout(dir_path: Path) -> None: + rollouts = dir_path / "rollouts" + rollouts.mkdir() + rows = [ + {"type": "metadata", "agent": "root", "depth": 0, "spawn_id": "r1", + "timestamp": "2026-06-03T00:00:00+00:00", "model": "some/model"}, + {"type": "iteration", "agent": "root", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:01:00+00:00", "response": "thinking", "code_blocks": []}, + {"type": "result", "agent": "root", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:02:00+00:00", "final_answer": "done"}, + ] + (rollouts / "root.jsonl").write_text( + "\n".join(json.dumps(r) for r in rows), encoding="utf-8" + ) + + +def test_render_trace_html_is_self_contained(tmp_path: Path) -> None: + _write_rollout(tmp_path) + html = render_trace_html(load_rollout_dir(tmp_path)) + + assert html.startswith("") + assert "http://" not in html and "https://" not in html + # Trace tab present; no kai Findings tab when used standalone. + assert 'id="view-trace"' in html + assert 'id="view-findings"' not in html + assert "renderTrace();" in html + + +def test_render_page_composes_arbitrary_panels() -> None: + run = RunTrace( + title="t", benchmark=None, task_id="t", success=None, failure_reason=None, + poc_source=None, models=["m"], agents=[], root_name="root", + root_result=None, root_steps=[], unlinked=[], + ) + custom = Panel( + id="notes", label="Notes", + section='

', + css=".notes{}", js="function renderNotes(){document.getElementById('n').textContent='hi';}", + render_call="renderNotes();", + ) + html = render_page({"title": "t", "run": run.as_dict()}, [custom], default_view="notes") + assert 'id="view-notes"' in html + assert "renderNotes();" in html + assert '"id": "notes"' in html or '"id":"notes"' in html diff --git a/tests/test_viewer.py b/tests/test_viewer.py index c3f92b87..0bae92fd 100644 --- a/tests/test_viewer.py +++ b/tests/test_viewer.py @@ -5,8 +5,9 @@ import json from pathlib import Path +from ra.viewer.trace import RunTrace, load_rollout_dir + from kai.viewer import load_findings, render_html, write_html -from kai.viewer.trace import RunTrace, load_rollout_dir _EXPLOITS = [ { From 3826e9e0b553333e035f0eb100d27a3b16143365 Mon Sep 17 00:00:00 2001 From: aktasbatuhan Date: Tue, 9 Jun 2026 15:06:33 +0100 Subject: [PATCH 5/5] fix(viewer): rank by severity before score; create parent dirs on write Review (Copilot, #98): - _sort_key now orders by (confirmed, severity, score) so a high/critical finding with a label but no usable CVSS score isn't sunk below a low finding that has a numeric score. - write_html / write_trace_html mkdir the output's parent, so '-o some/new/dir/trace.html' works instead of raising FileNotFoundError. --- src/kai/viewer/findings.py | 12 +++++++++--- src/kai/viewer/html.py | 1 + src/ra/viewer/html.py | 1 + tests/test_viewer.py | 13 +++++++++++++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/kai/viewer/findings.py b/src/kai/viewer/findings.py index c80ef1b0..2ae1bbd8 100644 --- a/src/kai/viewer/findings.py +++ b/src/kai/viewer/findings.py @@ -171,12 +171,18 @@ def _finding_from_record(record: dict[str, Any]) -> Finding: ) -def _sort_key(f: Finding) -> tuple[int, float]: - """Confirmed findings first, then by descending CVSS score.""" +def _sort_key(f: Finding) -> tuple[int, int, float]: + """Confirmed first, then by severity, then CVSS score as the tie-breaker. + + Severity is the secondary key so a high/critical finding that carries a + label but no usable CVSS score (the fixer can emit one without a vector) + still outranks a low finding that happens to have a numeric score. + """ confirmed = 1 if f.confirmed else 0 + severity = _SEVERITY_RANK.get(f.severity, 0) score = f.cvss_score if isinstance(f.cvss_score, (int, float)) else -1.0 - return (confirmed, score) + return (confirmed, severity, score) def load_findings(run_dir: Path) -> list[Finding]: diff --git a/src/kai/viewer/html.py b/src/kai/viewer/html.py index 9fabb195..64a8db0d 100644 --- a/src/kai/viewer/html.py +++ b/src/kai/viewer/html.py @@ -157,5 +157,6 @@ def write_html(run_dir: Path, out: Path | None = None) -> Path: run = load_rollout_dir(run_dir) findings = load_findings(run_dir) target = out or (Path(run_dir) / "trace.html") + target.parent.mkdir(parents=True, exist_ok=True) target.write_text(render_html(run, findings), encoding="utf-8") return target diff --git a/src/ra/viewer/html.py b/src/ra/viewer/html.py index 3c3a6f3e..ef29e898 100644 --- a/src/ra/viewer/html.py +++ b/src/ra/viewer/html.py @@ -284,5 +284,6 @@ def write_trace_html(run_dir: Path, out: Path | None = None) -> Path: run = load_rollout_dir(run_dir) target = out or (Path(run_dir) / "trace.html") + target.parent.mkdir(parents=True, exist_ok=True) target.write_text(render_trace_html(run), encoding="utf-8") return target diff --git a/tests/test_viewer.py b/tests/test_viewer.py index 0bae92fd..b0986ddc 100644 --- a/tests/test_viewer.py +++ b/tests/test_viewer.py @@ -76,6 +76,19 @@ def test_load_findings_drops_deduplicated(tmp_path: Path) -> None: assert [f.exploit_id for f in findings] == ["keep"] +def test_load_findings_sorts_severity_over_missing_score(tmp_path: Path) -> None: + # A critical finding with a severity label but no CVSS score must still + # outrank a low finding that happens to carry a numeric score. + records = [ + {"exploit_id": "low_scored", "status": "verified", "confirmed": True, + "hypothesis": "low but scored", "severity": "low", "cvss_score": 3.1}, + {"exploit_id": "crit_unscored", "status": "verified", "confirmed": True, + "hypothesis": "critical, no vector", "severity": "critical"}, + ] + (tmp_path / "exploits.json").write_text(json.dumps(records), encoding="utf-8") + assert [f.exploit_id for f in load_findings(tmp_path)] == ["crit_unscored", "low_scored"] + + def test_load_findings_sorts_and_derives(tmp_path: Path) -> None: _write_run(tmp_path) findings = load_findings(tmp_path)