diff --git a/evaluation/trace_viewer.py b/evaluation/trace_viewer.py index d12f189f..c509023d 100644 --- a/evaluation/trace_viewer.py +++ b/evaluation/trace_viewer.py @@ -1,662 +1,16 @@ -"""Self-contained HTML viewer for RLM rollout traces. +"""Compatibility shim — the rollout viewer now lives in :mod:`kai.viewer`. -Reads a rollout directory (the per-agent ``.jsonl`` files an RLM run -writes via :mod:`kai.state.hooks`, plus the optional ``score.json`` / -``run.json`` siblings) and renders a single offline HTML file. - -The view follows **causality, not wall-clock**. The root agent (``exploit``) -is an orchestrator: it reasons, then runs Python, and that Python calls -``spawn_analyzer(...)`` / ``spawn_researcher(...)`` / ``spawn_verifier(...)`` -etc. to delegate a subtask. The sub-agent runs to completion *inside* that -code call and its ``final_answer`` comes back as the call's return value -- -which is why a naive timestamp sort is misleading: the parent iteration is -stamped when it *finishes*, i.e. after the child it spawned has already run, -so the child appears to precede its own cause. - -So we read the root top-to-bottom by iteration number -- reason -> run code --> observe output -- and attach each spawned sub-agent's full sub-transcript -under the exact ``spawn_*()`` call that caused it (matched per agent in call -order), with the value it returned surfaced at the call site. You can expand -a delegation to see *how* the sub-agent reached its answer. - -No external dependencies, no server, no spans -- just the rollouts on disk. -Pulled smoke dirs are flat (``*.jsonl`` next to ``score.json``); a fresh run -nests them under ``state//rollouts/``. Both work: we glob for -``*.jsonl`` and skip any file whose lines aren't valid JSON (empty files, or -``cat: ... No such file`` stubs from a partial ``railway ssh`` pull). +The viewer was lifted out of the benchmark harness into the core package so +``kai view`` can render any pipeline run (findings + agent trace), not just +benchmark rollouts. This module is kept so ``evaluation`` keeps importing +``load_rollout_dir`` / ``render_html`` / ``write_html`` from here; new code +should import from :mod:`kai.viewer` directly. """ from __future__ import annotations -import json -import re -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any - -ROOT_AGENT = "exploit" -SPAWN_RE = re.compile(r"\bspawn_([a-z][a-z_]*)\s*\(") - -# Per-agent tints, assigned in first-appearance order. -PALETTE = [ - "#7fdbca", - "#c792ea", - "#f78c6c", - "#82aaff", - "#ffcb6b", - "#f07178", - "#addb67", - "#89ddff", -] - - -@dataclass -class Iteration: - """One reason -> act -> observe step of an agent.""" - - n: int - timestamp: str - reasoning: str - blocks: list[dict[str, str]] = field(default_factory=list) - - -@dataclass -class AgentTrace: - """A single (sub-)agent's rollout: its metadata + iterations + result.""" - - name: str - depth: int - model: str - backend: str - iterations: list[Iteration] - result: str | None - first_ts: str - color: str = "" - - def legend_dict(self) -> dict[str, Any]: - return { - "name": self.name, - "depth": self.depth, - "model": self.model, - "iters": len(self.iterations), - "color": self.color, - } - - -@dataclass -class RunTrace: - """A whole run: the causal root spine plus run-level header fields.""" - - title: str - benchmark: str - task_id: str - success: bool | None - failure_reason: str | None - poc_source: str | None - models: list[str] - agents: list[AgentTrace] - root_name: str - root_result: str | None - root_steps: list[dict[str, Any]] - unlinked: list[dict[str, Any]] - - def as_dict(self) -> dict[str, Any]: - return { - "title": self.title, - "benchmark": self.benchmark, - "task_id": self.task_id, - "success": self.success, - "failure_reason": self.failure_reason, - "poc_source": self.poc_source, - "models": self.models, - "legend": [a.legend_dict() for a in self.agents], - "root_name": self.root_name, - "root_result": self.root_result, - "root_steps": self.root_steps, - "unlinked": self.unlinked, - } - - -def _load_jsonl(path: Path) -> list[dict[str, Any]]: - """Parse a ``.jsonl`` file, skipping any line that isn't valid JSON. - - Pulled rollout dirs can contain empty files or a ``cat: ... No such - file`` stub where an agent never ran; those simply yield no records. - """ - - records: list[dict[str, Any]] = [] - try: - text = path.read_text(encoding="utf-8", errors="replace") - except OSError: - return records - for line in text.splitlines(): - line = line.strip() - if not line: - continue - try: - obj = json.loads(line) - except json.JSONDecodeError: - continue - if isinstance(obj, dict): - records.append(obj) - return records - - -def _agent_from_records( - fallback_name: str, records: list[dict[str, Any]] -) -> AgentTrace | None: - """Fold a file's records into one :class:`AgentTrace` (or ``None``).""" - - meta = next((r for r in records if r.get("type") == "metadata"), {}) - iters = [ - Iteration( - n=int(r.get("iteration", 0)), - timestamp=str(r.get("timestamp", "")), - reasoning=str(r.get("response", "")), - blocks=[b for b in (r.get("code_blocks") or []) if isinstance(b, dict)], - ) - for r in records - if r.get("type") == "iteration" - ] - if not iters and not meta: - return None - result_rec = next((r for r in records if r.get("type") == "result"), None) - result = str(result_rec.get("final_answer", "")) if result_rec is not None else None - first_ts = str(meta.get("timestamp", "")) or (iters[0].timestamp if iters else "") - return AgentTrace( - name=str(meta.get("agent") or fallback_name), - depth=int(meta.get("depth", 0)), - model=str(meta.get("model", "")), - backend=str(meta.get("backend", "")), - iterations=iters, - result=result, - first_ts=first_ts, - ) - - -def _read_json(path: Path) -> dict[str, Any]: - if not path.exists(): - return {} - try: - obj = json.loads(path.read_text(encoding="utf-8", errors="replace")) - except (OSError, json.JSONDecodeError): - return {} - return obj if isinstance(obj, dict) else {} - - -def _spawn_sessions(records: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Split a sub-agent's records into one entry per spawn, time-ordered. - - The root re-invokes a sub-agent many times; each invocation is a distinct - ``spawn_id`` whose iteration counter restarts at 1. One session == one - delegation the root can match a ``spawn_*()`` call to. - """ - - order: list[str] = [] - sess: dict[str, dict[str, Any]] = {} - for r in records: - sid = str(r.get("spawn_id", "")) - kind = r.get("type") - if kind == "iteration": - if sid not in sess: - sess[sid] = { - "first_ts": str(r.get("timestamp", "")), - "returned": None, - "iters": [], - } - order.append(sid) - sess[sid]["iters"].append( - { - "iter": int(r.get("iteration", 0)), - "ts": str(r.get("timestamp", "")), - "reasoning": str(r.get("response", "")), - "blocks": [ - b for b in (r.get("code_blocks") or []) if isinstance(b, dict) - ], - } - ) - elif kind == "result" and sid in sess: - sess[sid]["returned"] = str(r.get("final_answer", "")) - out = [sess[s] for s in order] - out.sort(key=lambda s: s["first_ts"]) - return out - - -def _child(name: str, color: str, session: dict[str, Any] | None) -> dict[str, Any]: - if session is None: - return {"agent": name, "color": color, "missing": True, "iters": []} - return { - "agent": name, - "color": color, - "returned": session.get("returned"), - "iters": session["iters"], - } - - -def _build_root_spine( - root: AgentTrace, - sessions_by_agent: dict[str, list[dict[str, Any]]], - color_of: dict[str, str], -) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Walk the root's iterations and hang each spawned sub-agent under the - ``spawn_*()`` call that produced it (FIFO per agent name). - - Returns ``(root_steps, unlinked_children)``. ``unlinked`` holds sub-agent - sessions we couldn't tie to a call (count mismatch) so nothing is lost. - """ - - cursor = {name: 0 for name in sessions_by_agent if name != root.name} - steps: list[dict[str, Any]] = [] - for it in root.iterations: - code = "\n".join(b.get("code", "") for b in it.blocks) - children: list[dict[str, Any]] = [] - for name in SPAWN_RE.findall(code): - sessions = sessions_by_agent.get(name) - session = None - if sessions is not None and cursor.get(name, 0) < len(sessions): - session = sessions[cursor[name]] - cursor[name] += 1 - children.append(_child(name, color_of.get(name, "#8a99ad"), session)) - steps.append( - { - "iter": it.n, - "ts": it.timestamp, - "reasoning": it.reasoning, - "blocks": it.blocks, - "delegated": [c["agent"] for c in children], - "children": children, - } - ) - - unlinked: list[dict[str, Any]] = [] - for name, sessions in sessions_by_agent.items(): - if name == root.name: - continue - for session in sessions[cursor.get(name, 0) :]: - unlinked.append(_child(name, color_of.get(name, "#8a99ad"), session)) - return steps, unlinked +from ra.viewer.trace import load_rollout_dir +from kai.viewer.html import render_html, write_html -def load_rollout_dir(path: Path) -> RunTrace: - """Build a :class:`RunTrace` (root spine + causal nesting) from a dir.""" - - path = Path(path) - if not path.is_dir(): - raise NotADirectoryError(f"{path} is not a directory") - - agents: list[AgentTrace] = [] - records_by_agent: dict[str, list[dict[str, Any]]] = {} - for jf in sorted(path.rglob("*.jsonl")): - if jf.name == "status_updates.jsonl": - continue - records = _load_jsonl(jf) - agent = _agent_from_records(jf.stem, records) - if agent is not None and agent.iterations: - agents.append(agent) - records_by_agent[agent.name] = records - - agents.sort(key=lambda a: (a.depth, a.first_ts, a.name)) - color_of = {a.name: PALETTE[i % len(PALETTE)] for i, a in enumerate(agents)} - for a in agents: - a.color = color_of[a.name] - - root = _pick_root(agents) - sessions_by_agent = { - name: _spawn_sessions(records) for name, records in records_by_agent.items() - } - if root is not None: - root_steps, unlinked = _build_root_spine(root, sessions_by_agent, color_of) - else: - root_steps, unlinked = [], [] - - score = _read_json(path / "score.json") - details = score.get("details") or {} - task_ref = score.get("task_ref") or {} - run = _read_json(path / "run.json") - - benchmark = str(task_ref.get("benchmark") or _guess_benchmark(path.name)) - task_id = str(task_ref.get("task_id") or details.get("task_id") or path.name) - models = sorted({a.model for a in agents if a.model}) - if not models and run.get("root_model"): - models = [str(run["root_model"])] - - return RunTrace( - title=path.name, - benchmark=benchmark, - task_id=task_id, - success=score.get("success"), - failure_reason=score.get("failure_reason"), - poc_source=details.get("poc_source"), - models=models, - agents=agents, - root_name=root.name if root else "", - root_result=root.result if root else None, - root_steps=root_steps, - unlinked=unlinked, - ) - - -def _pick_root(agents: list[AgentTrace]) -> AgentTrace | None: - """The depth-0 orchestrator (prefer the conventional ``exploit``).""" - - if not agents: - return None - named = next((a for a in agents if a.name == ROOT_AGENT and a.depth == 0), None) - if named is not None: - return named - return min(agents, key=lambda a: (a.depth, a.first_ts)) - - -def _guess_benchmark(dir_name: str) -> str: - for known in ("cybergym", "bountybench", "evmbench", "noop"): - if dir_name.startswith(known): - return known - return "rollout" - - -def render_html(run: RunTrace) -> str: - """Render a self-contained HTML page (inline data + CSS + JS).""" - - # ``; escape it in the blob. - data_json = json.dumps(run.as_dict()).replace(" Path: - """Load ``rollout_dir`` and write ``trace.html`` (or ``out``).""" - - run = load_rollout_dir(rollout_dir) - target = out or (Path(rollout_dir) / "trace.html") - target.write_text(render_html(run), encoding="utf-8") - return target - - -_HTML_TEMPLATE = r""" - - - - -RLM rollout trace - - - -
-

Trace:

-
-
-
- -
-
- - - - -""" +__all__ = ["load_rollout_dir", "render_html", "write_html"] diff --git a/src/kai/viewer/__init__.py b/src/kai/viewer/__init__.py new file mode 100644 index 00000000..508a1756 --- /dev/null +++ b/src/kai/viewer/__init__.py @@ -0,0 +1,23 @@ +"""Self-contained HTML viewer for kai runs (findings + agent trace). + +Reads a run directory written by the pipeline -- ``exploits.json`` for the +security findings and ``rollouts/*.jsonl`` (or flat ``*.jsonl``) for the +agent trace -- and renders a single offline HTML file. No server, no +external requests, no live state backend required. +""" + +from __future__ import annotations + +from ra.viewer.trace import RunTrace, load_rollout_dir + +from kai.viewer.findings import Finding, load_findings +from kai.viewer.html import render_html, write_html + +__all__ = [ + "Finding", + "RunTrace", + "load_findings", + "load_rollout_dir", + "render_html", + "write_html", +] diff --git a/src/kai/viewer/__main__.py b/src/kai/viewer/__main__.py new file mode 100644 index 00000000..8affaecf --- /dev/null +++ b/src/kai/viewer/__main__.py @@ -0,0 +1,52 @@ +"""CLI entry point: ``python -m kai.viewer [-o OUT] [--open]``. + +Renders a run directory into a single self-contained HTML file. This is the +implementation the ``kai view`` subcommand wraps; it also works standalone. +""" + +from __future__ import annotations + +import argparse +import sys +import webbrowser +from pathlib import Path + +from kai.viewer.html import write_html + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="python -m kai.viewer", + description="Render a kai run (findings + agent trace) to a single HTML file.", + ) + parser.add_argument( + "run_dir", + help="run directory (a state// dir with exploits.json and/or rollouts/)", + ) + parser.add_argument( + "-o", + "--output", + help="output HTML path (default: /trace.html)", + ) + parser.add_argument( + "--open", + action="store_true", + help="open the rendered file in a browser", + ) + args = parser.parse_args(argv) + + run_dir = Path(args.run_dir) + if not run_dir.is_dir(): + print(f"error: {run_dir} is not a directory", file=sys.stderr) + return 2 + + out = Path(args.output) if args.output else None + target = write_html(run_dir, out) + print(target) + if args.open: + webbrowser.open(target.resolve().as_uri()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/kai/viewer/findings.py b/src/kai/viewer/findings.py new file mode 100644 index 00000000..2ae1bbd8 --- /dev/null +++ b/src/kai/viewer/findings.py @@ -0,0 +1,210 @@ +"""Load security findings from a run's ``exploits.json``. + +A normal pipeline run persists its findings as a JSON array of +:class:`kai.state.models.ExploitRecord` dicts at +``//exploits.json``. This module folds those into the +flat :class:`Finding` view-model the HTML renderer draws, deriving display +helpers (a one-line title, a severity bucket, a human-readable CVSS vector) +without needing a live state backend. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from kai import cvss + +# Human-readable expansions for CVSS 3.1 vector codes, by metric. +_CVSS_LABELS: dict[str, dict[str, str]] = { + "AV": {"N": "Network", "A": "Adjacent", "L": "Local", "P": "Physical"}, + "AC": {"L": "Low", "H": "High"}, + "PR": {"N": "None", "L": "Low", "H": "High"}, + "UI": {"N": "None", "R": "Required"}, + "S": {"U": "Unchanged", "C": "Changed"}, + "C": {"H": "High", "L": "Low", "N": "None"}, + "I": {"H": "High", "L": "Low", "N": "None"}, + "A": {"H": "High", "L": "Low", "N": "None"}, +} +_CVSS_ORDER = ("AV", "AC", "PR", "UI", "S", "C", "I", "A") + +# Status / category ordering: confirmed, runtime-exploitable findings first. +_SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "none": 0} + +# Internal bookkeeping records that aren't user-facing findings: when the +# pipeline merges duplicate hypotheses it keeps the merged-away ones as +# ``deduplicated`` shells (no severity/PoC). The report and viewer hide them. +_HIDDEN_STATUSES = {"deduplicated"} + + +@dataclass +class Finding: + """One vulnerability finding, flattened for display.""" + + exploit_id: str + title: str + hypothesis: str + exploit_sketch: str + file: str + function: str + category: str + status: str + confirmed: bool | None + severity: str + cvss_score: float | None + cvss_vector: str + cvss_rows: list[dict[str, str]] = field(default_factory=list) + poc_code: str = "" + patch: str = "" + attacker_role: str = "" + prerequisite: str = "" + adversarial_viability: str = "" + profit_model: str = "" + critic_summary: str = "" + + def as_dict(self) -> dict[str, Any]: + return { + "exploit_id": self.exploit_id, + "title": self.title, + "hypothesis": self.hypothesis, + "exploit_sketch": self.exploit_sketch, + "file": self.file, + "function": self.function, + "category": self.category, + "status": self.status, + "confirmed": self.confirmed, + "severity": self.severity, + "cvss_score": self.cvss_score, + "cvss_vector": self.cvss_vector, + "cvss_rows": self.cvss_rows, + "poc_code": self.poc_code, + "patch": self.patch, + "attacker_role": self.attacker_role, + "prerequisite": self.prerequisite, + "adversarial_viability": self.adversarial_viability, + "profit_model": self.profit_model, + "critic_summary": self.critic_summary, + } + + +def _title_of(record: dict[str, Any]) -> str: + """A one-line headline: the first sentence of the hypothesis, else a + `` in `` fallback.""" + + hypothesis = str(record.get("hypothesis") or "").strip() + if hypothesis: + first = hypothesis.replace("\n", " ").split(". ")[0].strip().rstrip(".") + # Cut at a word boundary so a long first sentence stays a scannable + # headline rather than wrapping across table cells / section titles. + if len(first) > 64: + first = first[:64].rsplit(" ", 1)[0] + "…" + return first + category = str(record.get("category") or "finding").replace("_", " ") + fn = str(record.get("function") or "").strip() + return f"{category} in {fn}" if fn else category + + +def _cvss_rows(vector: str, justification: dict[str, str] | None) -> list[dict[str, str]]: + """Expand a CVSS vector into ordered ``{metric, value, why}`` rows.""" + + if not vector: + return [] + try: + metrics = cvss.parse_vector(vector) + except Exception: + return [] + justification = justification or {} + rows: list[dict[str, str]] = [] + for code in _CVSS_ORDER: + if code not in metrics: + continue + value = metrics[code] + rows.append( + { + "metric": code, + "value": _CVSS_LABELS.get(code, {}).get(value, value), + "why": str(justification.get(code, "")), + } + ) + return rows + + +def _severity_of(record: dict[str, Any]) -> str: + """The record's severity, lowercased; derived from the CVSS score when + the field is absent.""" + + severity = str(record.get("severity") or "").strip().lower() + if severity in _SEVERITY_RANK: + return severity + score = record.get("cvss_score") + if isinstance(score, (int, float)): + return cvss.score_to_severity(float(score)).lower() + return "none" + + +def _finding_from_record(record: dict[str, Any]) -> Finding: + return Finding( + exploit_id=str(record.get("exploit_id") or ""), + title=_title_of(record), + hypothesis=str(record.get("hypothesis") or ""), + exploit_sketch=str(record.get("exploit_sketch") or ""), + file=str(record.get("file") or ""), + function=str(record.get("function") or ""), + category=str(record.get("category") or ""), + status=str(record.get("status") or ""), + confirmed=record.get("confirmed"), + severity=_severity_of(record), + cvss_score=record.get("cvss_score"), + cvss_vector=str(record.get("cvss_vector") or ""), + cvss_rows=_cvss_rows( + str(record.get("cvss_vector") or ""), record.get("cvss_justification") + ), + poc_code=str(record.get("poc_code") or ""), + patch=str(record.get("patch") or ""), + attacker_role=str(record.get("attacker_role") or ""), + prerequisite=str(record.get("prerequisite") or record.get("required_privileges") or ""), + adversarial_viability=str(record.get("adversarial_viability") or ""), + profit_model=str(record.get("profit_model") or ""), + critic_summary=str(record.get("critic_summary") or ""), + ) + + +def _sort_key(f: Finding) -> tuple[int, int, float]: + """Confirmed first, then by severity, then CVSS score as the tie-breaker. + + Severity is the secondary key so a high/critical finding that carries a + label but no usable CVSS score (the fixer can emit one without a vector) + still outranks a low finding that happens to have a numeric score. + """ + + confirmed = 1 if f.confirmed else 0 + severity = _SEVERITY_RANK.get(f.severity, 0) + score = f.cvss_score if isinstance(f.cvss_score, (int, float)) else -1.0 + return (confirmed, severity, score) + + +def load_findings(run_dir: Path) -> list[Finding]: + """Read ``/exploits.json`` into sorted :class:`Finding` objects. + + Returns an empty list when the file is absent or unparseable (e.g. a + benchmark rollout dir, which carries ``score.json`` but no findings). + """ + + path = Path(run_dir) / "exploits.json" + if not path.exists(): + return [] + try: + data = json.loads(path.read_text(encoding="utf-8", errors="replace")) + except (OSError, json.JSONDecodeError): + return [] + if not isinstance(data, list): + return [] + findings = [ + _finding_from_record(r) + for r in data + if isinstance(r, dict) and r.get("status") not in _HIDDEN_STATUSES + ] + findings.sort(key=_sort_key, reverse=True) + return findings diff --git a/src/kai/viewer/html.py b/src/kai/viewer/html.py new file mode 100644 index 00000000..64a8db0d --- /dev/null +++ b/src/kai/viewer/html.py @@ -0,0 +1,162 @@ +"""Render a kai run as a single self-contained HTML page. + +Composes kai's security **Findings** panel (severity, CVSS, PoC, patch) onto +the reusable viewer in :mod:`ra.viewer` — which supplies the tabbed shell, the +shared design system, and the built-in **Trace** panel. Findings stay here +because they're domain concepts (CVSS, exploits); the trace viewer and styling +live in ``ra`` so any ra agent can reuse them. + +Every dynamic value is written via ``textContent`` / DOM nodes, so unsanitised +rollout text cannot inject markup. +""" + +from __future__ import annotations + +from pathlib import Path + +from ra.viewer.html import Panel, render_page, trace_panel +from ra.viewer.trace import RunTrace, load_rollout_dir + +from kai.viewer.findings import Finding, load_findings + +# Findings-panel layout: the master-detail split + interactive table rows + +# the detail pane. Shared tokens/primitives come from ra.viewer.style. +_FINDINGS_CSS = """\ + .split { display: grid; grid-template-columns: minmax(360px, 1fr) minmax(420px, 1.3fr); } + @media (max-width: 880px) { .split { grid-template-columns: 1fr; } } + tbody tr { cursor: pointer; } + tbody tr:hover { background: color-mix(in srgb, var(--accent) 5%, transparent); } + tbody tr.sel { background: color-mix(in srgb, var(--accent) 9%, transparent); } + .detail { border-left: 1px solid var(--rule-2); padding: 18px 22px; min-width: 0; } + .detail h2 { margin: 0 0 4px; font-size: 18px; font-weight: 600; line-height: 1.3; } + .detail .where { font-size: 12.5px; color: var(--muted); margin-bottom: 16px; } +""" + +_FINDINGS_SECTION = """\ +
+
+
+ +
CVSSFindingCategoryLocation
+
+
+
""" + +_FINDINGS_JS = r""" +const FINDINGS = DATA.findings || []; +const pct = s => Math.max(0, Math.min(100, Math.round((s || 0) / 10 * 100))); +function fRow(f, i) { + const tr = el("tr", "sev-" + (f.severity || "none") + (f.confirmed ? "" : " unconf")); + tr.dataset.i = i; + const c = el("td", "cvss"); + c.append(el("span", "dot")); + c.append(el("span", "score", f.cvss_score != null ? Number(f.cvss_score).toFixed(1) : "—")); + if (f.cvss_score != null) { + const bar = el("span", "bar"), fill = el("i"); + fill.style.width = pct(f.cvss_score) + "%"; bar.append(fill); c.append(bar); + } + const t = el("td"); t.append(el("div", "ftitle", f.title)); + tr.append(c, t, el("td", "cat", (f.category || "").replace(/_/g, " ")), + el("td", "loc", (f.file ? f.file.split("/").pop() : "") + (f.function ? ":" + f.function : ""))); + tr.addEventListener("click", () => fSelect(i)); + return tr; +} +function diffNode(patch) { + const pre = el("pre", "diff"); + String(patch).split("\n").forEach(line => { + const k = line.startsWith("+") ? "add" : line.startsWith("-") ? "del" : null; + pre.append(el("span", k, line + "\n")); + }); + return pre; +} +function kvRow(dl, k, v) { if (v) { dl.append(el("dt", null, k), el("dd", null, v)); } } +function fSelect(i) { + document.querySelectorAll("#rows tr").forEach(r => r.classList.toggle("sel", +r.dataset.i === i)); + const f = FINDINGS[i], d = document.getElementById("detail"); d.replaceChildren(); + d.append(el("h2", "serif", f.title)); + d.append(el("div", "where", f.file + (f.function ? " · " + f.function + "()" : ""))); + const kv = el("dl", "kv"); + const sevLine = (f.severity || "—") + (f.cvss_score != null ? " · CVSS " + Number(f.cvss_score).toFixed(1) : ""); + kvRow(kv, "Severity", sevLine); + kvRow(kv, "Status", f.status + (f.confirmed ? " · confirmed" : "")); + kvRow(kv, "Category", (f.category || "").replace(/_/g, " ")); + kvRow(kv, "Attacker", f.attacker_role); + kvRow(kv, "Precondition", f.prerequisite); + if (kv.children.length) d.append(kv); + + if (f.hypothesis) { d.append(el("div", "sec-label", "Why it's exploitable")); d.append(el("p", "prose", f.hypothesis)); } + if (f.exploit_sketch) { d.append(el("div", "sec-label", "Exploit sketch")); d.append(el("p", "prose", f.exploit_sketch)); } + + if (f.cvss_rows && f.cvss_rows.length) { + d.append(el("div", "sec-label", "CVSS 3.1 vector")); + if (f.cvss_vector) d.append(el("div", "vector mono", f.cvss_vector)); + const g = el("div", "cvss-grid"); + f.cvss_rows.forEach(r => { g.append(el("span", "m", r.metric), el("span", "v", r.value), el("span", "why", r.why)); }); + d.append(g); + } + if (f.poc_code) { d.append(el("div", "sec-label", "Proof of concept")); d.append(el("pre", "code", f.poc_code)); } + if (f.patch) { d.append(el("div", "sec-label", "Suggested patch")); d.append(diffNode(f.patch)); } + if (f.critic_summary) { d.append(el("div", "sec-label", "Critic")); d.append(el("p", "prose", f.critic_summary)); } +} +function renderFindings() { + // Contribute the findings tally to the header facts (ahead of the models). + const facts = document.getElementById("facts"), ref = facts.firstChild; + const tally = el("span"); tally.append(el("b", null, FINDINGS.length), document.createTextNode(" findings")); + facts.insertBefore(tally, ref); + const crit = FINDINGS.filter(f => f.severity === "critical").length; + if (crit) { const c = el("span", "crit"); c.append(el("b", null, crit), document.createTextNode(" critical")); facts.insertBefore(c, ref); } + + const rows = document.getElementById("rows"); + if (!FINDINGS.length) { + document.getElementById("view-findings").querySelector(".split") + .replaceChildren(el("div", "empty", "No findings recorded for this run.")); + return; + } + FINDINGS.forEach((f, i) => rows.append(fRow(f, i))); + fSelect(0); +} +""" + + +def _findings_panel() -> Panel: + return Panel( + "findings", "Findings", _FINDINGS_SECTION, _FINDINGS_CSS, _FINDINGS_JS, "renderFindings();" + ) + + +def render_html(run: RunTrace, findings: list[Finding] | None = None) -> str: + """Render the full kai page (Findings + Trace) from a trace + findings list. + + ``findings`` defaults to empty (e.g. a benchmark rollout dir has a trace but + no ``exploits.json``); the Findings tab then shows an empty state and the + Trace tab opens first. + """ + + findings = findings or [] + data = { + "title": run.title, + "benchmark": run.benchmark, + "task_id": run.task_id, + "models": run.models, + "run": run.as_dict(), + "findings": [f.as_dict() for f in findings], + } + default_view = "findings" if findings else "trace" + return render_page( + data, [_findings_panel(), trace_panel()], brand="kai", default_view=default_view + ) + + +def write_html(run_dir: Path, out: Path | None = None) -> Path: + """Load ``run_dir`` (trace + findings) and write a single HTML file. + + Defaults to ``/trace.html`` so existing callers that link to that + name keep working. + """ + + run = load_rollout_dir(run_dir) + findings = load_findings(run_dir) + target = out or (Path(run_dir) / "trace.html") + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(render_html(run, findings), encoding="utf-8") + return target diff --git a/src/ra/viewer/__init__.py b/src/ra/viewer/__init__.py new file mode 100644 index 00000000..a38cc784 --- /dev/null +++ b/src/ra/viewer/__init__.py @@ -0,0 +1,29 @@ +"""Reusable HTML viewer for ``ra`` agent runs. + +Renders a run directory's ``*.jsonl`` rollouts into a single offline HTML page +— a tabbed shell with a built-in causal **Trace** panel and a shared design +system (:mod:`ra.viewer.style`). Domain layers compose extra panels on top via +:func:`ra.viewer.html.render_page`; kai-security, for example, adds a security +**Findings** panel. +""" + +from __future__ import annotations + +from ra.viewer.html import ( + Panel, + render_page, + render_trace_html, + trace_panel, + write_trace_html, +) +from ra.viewer.trace import RunTrace, load_rollout_dir + +__all__ = [ + "Panel", + "RunTrace", + "load_rollout_dir", + "render_page", + "render_trace_html", + "trace_panel", + "write_trace_html", +] diff --git a/src/ra/viewer/__main__.py b/src/ra/viewer/__main__.py new file mode 100644 index 00000000..09c72f28 --- /dev/null +++ b/src/ra/viewer/__main__.py @@ -0,0 +1,47 @@ +"""CLI entry point: ``python -m ra.viewer [-o OUT] [--open]``. + +Renders any ``ra`` run's agent trace into a single self-contained HTML file. +Domain tools (e.g. ``kai view``) wrap a richer page on top of this. +""" + +from __future__ import annotations + +import argparse +import sys +import webbrowser +from pathlib import Path + +from ra.viewer.html import write_trace_html + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="python -m ra.viewer", + description="Render an ra run's agent trace to a single HTML file.", + ) + parser.add_argument( + "run_dir", + help="run directory (a dir with *.jsonl rollouts, or state//rollouts/)", + ) + parser.add_argument( + "-o", "--output", help="output HTML path (default: /trace.html)" + ) + parser.add_argument( + "--open", action="store_true", help="open the rendered file in a browser" + ) + args = parser.parse_args(argv) + + run_dir = Path(args.run_dir) + if not run_dir.is_dir(): + print(f"error: {run_dir} is not a directory", file=sys.stderr) + return 2 + + target = write_trace_html(run_dir, Path(args.output) if args.output else None) + print(target) + if args.open: + webbrowser.open(target.resolve().as_uri()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/ra/viewer/html.py b/src/ra/viewer/html.py new file mode 100644 index 00000000..ef29e898 --- /dev/null +++ b/src/ra/viewer/html.py @@ -0,0 +1,289 @@ +"""Reusable, self-contained HTML viewer for any ``ra`` agent run. + +This is framework-level: any agent built on ``ra`` writes per-agent +``*.jsonl`` rollouts, and this module renders them into a single offline HTML +page (no server, no external requests). The page is built from **panels** — a +tabbed shell plus one or more views — so a domain layer can add its own panel +(e.g. kai adds a security **Findings** panel) on top of the built-in +**Trace** panel. + +Every dynamic value is written via ``textContent`` / DOM nodes, never +``innerHTML``, so unsanitised rollout text cannot inject markup. The palette +and shared primitives come from :mod:`ra.viewer.style`. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from ra.viewer import style +from ra.viewer.trace import RunTrace, load_rollout_dir + + +@dataclass(frozen=True) +class Panel: + """One tab in the viewer. + + ``js`` defines a render function over the embedded ``DATA``/``RUN`` globals + (and the shared ``el()`` helper); ``render_call`` invokes it at init. + """ + + id: str + label: str + section: str # the
block + css: str + js: str + render_call: str + + +# Shell chrome only (header / tabs / theme toggle / view switching). Panel- and +# domain-specific styling lives on each Panel; shared tokens + primitives come +# from style.base_css(). +_SHELL_CSS = """\ + header { display: flex; align-items: baseline; gap: 18px; flex-wrap: wrap; + padding: 12px 22px; border-bottom: 1px solid var(--rule-2); + position: sticky; top: 0; background: var(--paper); z-index: 5; } + header h1 { margin: 0; font-size: 16px; font-weight: 600; } + header h1 .sub { color: var(--muted); font-weight: 400; } + .facts { display: flex; gap: 16px; font-size: 12px; color: var(--muted-2); } + .facts b { color: var(--ink); font-weight: 600; } + .facts .crit b { color: var(--accent); } + .spacer { flex: 1 1 auto; } + .tabs { display: flex; gap: 2px; } + .tab { border: 0; background: none; color: var(--muted-2); cursor: pointer; + font: inherit; font-size: 13px; padding: 4px 10px; border-bottom: 2px solid transparent; } + .tab.active { color: var(--ink); border-bottom-color: var(--accent); } + .toggle { border: 1px solid var(--rule-2); background: none; color: var(--muted-2); + border-radius: 5px; cursor: pointer; font-size: 12px; padding: 3px 8px; } + .view { display: none; } + .view.active { display: block; } +""" + +# --------------------------------------------------------------------------- +# Built-in Trace panel: the causal agent spine. +# --------------------------------------------------------------------------- +_TRACE_CSS = """\ + .trace { padding: 14px 22px; max-width: 920px; } + .legend { display: flex; flex-wrap: wrap; gap: 12px; font-size: 12px; color: var(--muted-2); + margin-bottom: 16px; padding-bottom: 12px; border-bottom: 1px solid var(--rule); } + .legend .a { display: inline-flex; align-items: center; gap: 6px; } + .legend .sw { width: 9px; height: 9px; border-radius: 2px; display: inline-block; } + .step { border-left: 2px solid var(--rule-2); padding: 0 0 2px 14px; margin: 0 0 16px; } + .step .h { display: flex; gap: 10px; align-items: baseline; font-size: 12px; color: var(--muted-2); margin-bottom: 5px; } + .step .h .who { color: var(--ink); font-weight: 600; } + .step .h .deleg { color: var(--accent); } + .step .h .ts { margin-left: auto; } + .reason { white-space: pre-wrap; margin: 0 0 8px; } + details.spawn { border-left: 2px dashed var(--rule-2); padding-left: 12px; margin: 4px 0 10px; } + details.spawn > summary { cursor: pointer; font-size: 12.5px; color: var(--muted-2); padding: 3px 0; } + details.spawn > summary .who { color: var(--ink); font-weight: 600; } + details.spawn[open] > summary .ret { display: none; } + .ret-box { border: 1px solid var(--rule-2); background: var(--code-bg); border-radius: 6px; + padding: 8px 10px; margin: 4px 0 9px; white-space: pre-wrap; font-size: 12.5px; max-height: 220px; overflow: auto; } + .childit { padding-left: 10px; border-left: 1px solid var(--rule); margin-bottom: 10px; } + .childhead { font-size: 11px; color: var(--muted-2); margin: 9px 0 4px; } + .missing { color: var(--del); font-size: 12px; padding: 4px 0; } + .sec { font-size: 12px; color: var(--muted-2); margin: 26px 0 10px; border-top: 1px solid var(--rule); padding-top: 12px; } + .result { border: 1px solid var(--rule-2); border-radius: 8px; padding: 11px; background: var(--panel); white-space: pre-wrap; margin: 2px 0 0; } +""" + +_TRACE_SECTION = """\ +
+
+
""" + +_TRACE_JS = r""" +function head(s) { return (s || "").replace(/\s+/g, " ").trim().slice(0, 130); } +function proseNode(text) { + const parts = String(text || "").split("```"); + const prose = parts.filter((_, i) => i % 2 === 0).join("\n") + .replace(/
/g, "").replace(/\n{3,}/g, "\n\n").trim(); + return prose ? el("div", "reason", prose) : null; +} +function childNode(child) { + const det = el("details", "spawn"); + det.dataset.agent = child.agent; + if (child.color) det.style.borderLeftColor = child.color; + const sum = el("summary"); + const who = el("span", "who", "⤷ spawned " + child.agent); + if (child.color) who.style.color = child.color; + sum.append(who); + if (child.missing) { + sum.append(el("span", "ret", " — no rollout captured")); + det.append(sum, el("div", "missing", "(sub-agent file absent or empty)")); + return det; + } + sum.append(el("span", "ret", child.returned ? " — returned: " + head(child.returned) : " — (no return value recorded)")); + det.append(sum); + if (child.returned) det.append(el("div", "ret-box", child.returned)); + (child.iters || []).forEach(it => { + const wrap = el("div", "childit"); + wrap.append(el("div", "childhead", child.agent + " · iter " + it.iter)); + const p = proseNode(it.reasoning); if (p) wrap.append(p); + (it.blocks || []).forEach(b => { + if (b.code && b.code.trim()) wrap.append(el("pre", "code", b.code)); + if (b.output && b.output.trim()) wrap.append(el("pre", "output", b.output)); + }); + det.append(wrap); + }); + return det; +} +function stepNode(step) { + const wrap = el("div", "step"); + const h = el("div", "h"); + h.append(el("span", "who", RUN.root_name), el("span", null, "#" + step.iter)); + if (step.delegated && step.delegated.length) h.append(el("span", "deleg", "⤷ " + step.delegated.join(", "))); + h.append(el("span", "ts", (step.ts || "").replace("T", " ").slice(0, 19))); + wrap.append(h); + const p = proseNode(step.reasoning); if (p) wrap.append(p); + (step.blocks || []).forEach(b => { if (b.code && b.code.trim()) wrap.append(el("pre", "code", b.code)); }); + (step.children || []).forEach(c => wrap.append(childNode(c))); + (step.blocks || []).forEach(b => { if (b.output && b.output.trim()) wrap.append(el("pre", "output", b.output)); }); + return wrap; +} +function renderTrace() { + const t = document.getElementById("trace"); + if (!RUN.root_steps || !RUN.root_steps.length) { + t.replaceChildren(el("div", "empty", "No agent rollouts found for this run.")); + return; + } + const legend = el("div", "legend"); + (RUN.legend || []).forEach(a => { + const span = el("span", "a"); + const sw = el("span", "sw"); sw.style.background = a.color; span.append(sw); + span.append(el("span", null, a.name + " · d" + a.depth + " · " + a.iters + " it")); + legend.append(span); + }); + const nodes = [legend]; + RUN.root_steps.forEach(s => nodes.push(stepNode(s))); + if (RUN.root_result) { nodes.push(el("div", "sec", RUN.root_name + " — final answer")); nodes.push(el("div", "result", RUN.root_result)); } + (RUN.unlinked || []).forEach(c => nodes.push(childNode(c))); + t.replaceChildren(...nodes); +} +""" + + +def trace_panel() -> Panel: + """The built-in causal-trace panel, reusable by any ``ra`` agent.""" + + return Panel("trace", "Trace", _TRACE_SECTION, _TRACE_CSS, _TRACE_JS, "renderTrace();") + + +_SHELL = r""" + + + + +__TITLE__ + + + +
+

__BRAND__

+
+
+
+ +
+ +__SECTIONS__ + + + + + +""" + + +def render_page( + data: dict, + panels: list[Panel], + *, + brand: str = "ra", + default_view: str | None = None, +) -> str: + """Assemble a self-contained page from ``data`` + an ordered list of panels. + + ``data`` is embedded as JSON (the panels' JS reads it via the ``DATA`` / + ``RUN`` globals). ``default_view`` is the panel id shown first; it defaults + to the first panel. + """ + + blob = json.dumps(data).replace(" + css = style.base_css() + _SHELL_CSS + "".join(p.css for p in panels) + sections = "\n".join(p.section for p in panels) + panel_js = "\n".join(p.js for p in panels) + render_calls = "\n ".join(p.render_call for p in panels) + panels_meta = json.dumps([{"id": p.id, "label": p.label} for p in panels]) + default = json.dumps(default_view or (panels[0].id if panels else "")) + return ( + _SHELL.replace("__TITLE__", f"{brand} — run view") + .replace("__BRAND__", brand) + .replace("__STYLE__", css) + .replace("__SECTIONS__", sections) + .replace("__PANELS_META__", panels_meta) + .replace("__PANEL_JS__", panel_js) + .replace("__RENDER_CALLS__", render_calls) + .replace("__DEFAULT_VIEW__", default) + .replace("__DATA__", blob) + ) + + +def render_trace_html(run: RunTrace, *, brand: str = "ra") -> str: + """Render a run's causal agent trace as a standalone single-page viewer.""" + + data = { + "title": run.title, + "task_id": run.task_id, + "models": run.models, + "run": run.as_dict(), + } + return render_page(data, [trace_panel()], brand=brand, default_view="trace") + + +def write_trace_html(run_dir: Path, out: Path | None = None) -> Path: + """Load ``run_dir`` and write the standalone trace viewer to ``out``.""" + + run = load_rollout_dir(run_dir) + target = out or (Path(run_dir) / "trace.html") + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(render_trace_html(run), encoding="utf-8") + return target diff --git a/src/ra/viewer/style.py b/src/ra/viewer/style.py new file mode 100644 index 00000000..e1e0ca5a --- /dev/null +++ b/src/ra/viewer/style.py @@ -0,0 +1,84 @@ +"""Shared design system for kai's HTML surfaces. + +One palette, one set of primitives, used by both the interactive viewer +(:mod:`kai.viewer.html`) and the static report document (``kai report +--format html``). Each surface concatenates ``TOKENS + COMPONENTS`` with its +own layout CSS, so the look (colours, severity treatment, code/diff blocks) +can never drift between them. +""" + +from __future__ import annotations + +# Design tokens: the palette + the single accent. Dark theme overrides the +# same variables, so every component below is theme-aware for free. +TOKENS = """\ + :root { + --paper:#fafaf7; --panel:#fff; --ink:#1a1a1a; --rule:#e3dfd6; --rule-2:#d8d4cc; + --muted:#8a857c; --muted-2:#6b665d; --accent:#b3261e; --add:#2f6f43; --del:#9a2a22; + --gray-bar:#c8c2b5; --code-bg:#f4f1ea; + } + [data-theme="dark"] { + --paper:#14171b; --panel:#1b1f25; --ink:#e7e3da; --rule:#2a3038; --rule-2:#343b44; + --muted:#9aa3ad; --muted-2:#7f8893; --accent:#e5675d; --add:#7ec99a; --del:#e79a92; + --gray-bar:#3a424c; --code-bg:#11151b; + } +""" + +# Shared component primitives: base type, the findings table, the severity +# encoding (dot + score + 0-10 bar), the key/value + CVSS detail blocks, and +# code / diff / output panes. +COMPONENTS = """\ + * { box-sizing: border-box; } + html, body { margin: 0; } + body { background: var(--paper); color: var(--ink); + font: 14px/1.55 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; } + .serif { font-family: Charter, "Iowan Old Style", Georgia, serif; } + code, pre, .mono { font-family: ui-monospace, "SF Mono", Menlo, Consolas, monospace; } + + table { border-collapse: collapse; width: 100%; } + thead th { text-align: left; font-size: 10px; letter-spacing: .07em; text-transform: uppercase; + color: var(--muted-2); font-weight: 600; padding: 10px 14px 8px; border-bottom: 1px solid var(--rule-2); } + thead th.num { text-align: right; } + tbody tr { border-bottom: 1px solid var(--rule); } + td { padding: 11px 14px; vertical-align: top; } + td.cvss { white-space: nowrap; } + + .dot { display:inline-block; width:8px; height:8px; border-radius:50%; vertical-align: middle; margin-right: 7px; background: var(--gray-bar); } + .sev-critical .dot, .sev-high .dot { background: var(--accent); } + .sev-medium .dot { background: var(--muted-2); } + .score { font-family: ui-monospace, monospace; font-weight: 600; font-size: 13px; } + .bar { display:block; height: 3px; width: 64px; background: var(--gray-bar); margin-top: 6px; border-radius: 2px; } + .bar > i { display:block; height: 100%; background: var(--muted-2); border-radius: 2px; } + .sev-critical .bar > i, .sev-high .bar > i { background: var(--accent); } + .ftitle { font-weight: 600; } + .cat { font-size: 11px; color: var(--muted-2); } + .loc { font-size: 12px; color: var(--muted); } + .unconf { opacity: .62; } + + .kv { display: grid; grid-template-columns: 130px 1fr; gap: 5px 14px; font-size: 13px; margin: 0; } + .kv dt { color: var(--muted-2); } + .kv dd { margin: 0; } + .sec-label { font-size: 11px; letter-spacing: .07em; text-transform: uppercase; color: var(--muted-2); + margin: 18px 0 8px; border-top: 1px solid var(--rule); padding-top: 12px; } + .prose { white-space: pre-wrap; margin: 0; } + .cvss-grid { display: grid; grid-template-columns: max-content max-content 1fr; gap: 5px 14px; + font-size: 12.5px; align-items: baseline; } + .cvss-grid .m { color: var(--muted-2); font-family: ui-monospace, monospace; } + .cvss-grid .v { font-weight: 500; } + .cvss-grid .why { color: var(--muted); font-size: 12px; } + .vector { font-size: 12px; color: var(--muted); margin: 0 0 10px; } + + pre.code, pre.diff, pre.output { margin: 0 0 4px; padding: 11px 13px; border: 1px solid var(--rule-2); + border-radius: 6px; background: var(--code-bg); overflow: auto; font-size: 12.5px; line-height: 1.5; } + pre.code, pre.diff { white-space: pre; } + pre.output { white-space: pre-wrap; color: var(--muted-2); max-height: 320px; } + pre.diff .add { color: var(--add); } + pre.diff .del { color: var(--del); } + .empty { color: var(--muted); padding: 40px 22px; } +""" + + +def base_css() -> str: + """The shared stylesheet: tokens + component primitives.""" + + return TOKENS + COMPONENTS diff --git a/src/ra/viewer/trace.py b/src/ra/viewer/trace.py new file mode 100644 index 00000000..4e676b76 --- /dev/null +++ b/src/ra/viewer/trace.py @@ -0,0 +1,353 @@ +"""Load RLM rollout traces from a run directory. + +Reads the per-agent ``.jsonl`` files an RLM run writes via +:mod:`kai.state.hooks` (plus the optional ``score.json`` / ``run.json`` +siblings) and folds them into a :class:`RunTrace` the HTML renderer can draw. + +The view follows **causality, not wall-clock**. The root agent (``exploit``) +is an orchestrator: it reasons, then runs Python, and that Python calls +``spawn_analyzer(...)`` / ``spawn_researcher(...)`` / ``spawn_verifier(...)`` +etc. to delegate a subtask. The sub-agent runs to completion *inside* that +code call and its ``final_answer`` comes back as the call's return value -- +which is why a naive timestamp sort is misleading: the parent iteration is +stamped when it *finishes*, i.e. after the child it spawned has already run, +so the child appears to precede its own cause. + +So we read the root top-to-bottom by iteration number -- reason -> run code +-> observe output -- and attach each spawned sub-agent's full sub-transcript +under the exact ``spawn_*()`` call that caused it (matched per agent in call +order), with the value it returned surfaced at the call site. + +No external dependencies, no server, no spans -- just the rollouts on disk. +Pulled smoke dirs are flat (``*.jsonl`` next to ``score.json``); a fresh run +nests them under ``state//rollouts/``. Both work: we glob for +``*.jsonl`` and skip any file whose lines aren't valid JSON (empty files, or +``cat: ... No such file`` stubs from a partial ``railway ssh`` pull). +""" + +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +ROOT_AGENT = "exploit" +SPAWN_RE = re.compile(r"\bspawn_([a-z][a-z_]*)\s*\(") + +# Per-agent tints, assigned in first-appearance order. +PALETTE = [ + "#7fdbca", + "#c792ea", + "#f78c6c", + "#82aaff", + "#ffcb6b", + "#f07178", + "#addb67", + "#89ddff", +] + + +@dataclass +class Iteration: + """One reason -> act -> observe step of an agent.""" + + n: int + timestamp: str + reasoning: str + blocks: list[dict[str, str]] = field(default_factory=list) + + +@dataclass +class AgentTrace: + """A single (sub-)agent's rollout: its metadata + iterations + result.""" + + name: str + depth: int + model: str + backend: str + iterations: list[Iteration] + result: str | None + first_ts: str + color: str = "" + + def legend_dict(self) -> dict[str, Any]: + return { + "name": self.name, + "depth": self.depth, + "model": self.model, + "iters": len(self.iterations), + "color": self.color, + } + + +@dataclass +class RunTrace: + """A whole run: the causal root spine plus run-level header fields.""" + + title: str + benchmark: str + task_id: str + success: bool | None + failure_reason: str | None + poc_source: str | None + models: list[str] + agents: list[AgentTrace] + root_name: str + root_result: str | None + root_steps: list[dict[str, Any]] + unlinked: list[dict[str, Any]] + + def as_dict(self) -> dict[str, Any]: + return { + "title": self.title, + "benchmark": self.benchmark, + "task_id": self.task_id, + "success": self.success, + "failure_reason": self.failure_reason, + "poc_source": self.poc_source, + "models": self.models, + "legend": [a.legend_dict() for a in self.agents], + "root_name": self.root_name, + "root_result": self.root_result, + "root_steps": self.root_steps, + "unlinked": self.unlinked, + } + + +def _load_jsonl(path: Path) -> list[dict[str, Any]]: + """Parse a ``.jsonl`` file, skipping any line that isn't valid JSON. + + Pulled rollout dirs can contain empty files or a ``cat: ... No such + file`` stub where an agent never ran; those simply yield no records. + """ + + records: list[dict[str, Any]] = [] + try: + text = path.read_text(encoding="utf-8", errors="replace") + except OSError: + return records + for line in text.splitlines(): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(obj, dict): + records.append(obj) + return records + + +def _agent_from_records( + fallback_name: str, records: list[dict[str, Any]] +) -> AgentTrace | None: + """Fold a file's records into one :class:`AgentTrace` (or ``None``).""" + + meta = next((r for r in records if r.get("type") == "metadata"), {}) + iters = [ + Iteration( + n=int(r.get("iteration", 0)), + timestamp=str(r.get("timestamp", "")), + reasoning=str(r.get("response", "")), + blocks=[b for b in (r.get("code_blocks") or []) if isinstance(b, dict)], + ) + for r in records + if r.get("type") == "iteration" + ] + if not iters and not meta: + return None + result_rec = next((r for r in records if r.get("type") == "result"), None) + result = str(result_rec.get("final_answer", "")) if result_rec is not None else None + first_ts = str(meta.get("timestamp", "")) or (iters[0].timestamp if iters else "") + return AgentTrace( + name=str(meta.get("agent") or fallback_name), + depth=int(meta.get("depth", 0)), + model=str(meta.get("model", "")), + backend=str(meta.get("backend", "")), + iterations=iters, + result=result, + first_ts=first_ts, + ) + + +def _read_json(path: Path) -> dict[str, Any]: + if not path.exists(): + return {} + try: + obj = json.loads(path.read_text(encoding="utf-8", errors="replace")) + except (OSError, json.JSONDecodeError): + return {} + return obj if isinstance(obj, dict) else {} + + +def _spawn_sessions(records: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Split a sub-agent's records into one entry per spawn, time-ordered. + + The root re-invokes a sub-agent many times; each invocation is a distinct + ``spawn_id`` whose iteration counter restarts at 1. One session == one + delegation the root can match a ``spawn_*()`` call to. + """ + + order: list[str] = [] + sess: dict[str, dict[str, Any]] = {} + for r in records: + sid = str(r.get("spawn_id", "")) + kind = r.get("type") + if kind == "iteration": + if sid not in sess: + sess[sid] = { + "first_ts": str(r.get("timestamp", "")), + "returned": None, + "iters": [], + } + order.append(sid) + sess[sid]["iters"].append( + { + "iter": int(r.get("iteration", 0)), + "ts": str(r.get("timestamp", "")), + "reasoning": str(r.get("response", "")), + "blocks": [ + b for b in (r.get("code_blocks") or []) if isinstance(b, dict) + ], + } + ) + elif kind == "result" and sid in sess: + sess[sid]["returned"] = str(r.get("final_answer", "")) + out = [sess[s] for s in order] + out.sort(key=lambda s: s["first_ts"]) + return out + + +def _child(name: str, color: str, session: dict[str, Any] | None) -> dict[str, Any]: + if session is None: + return {"agent": name, "color": color, "missing": True, "iters": []} + return { + "agent": name, + "color": color, + "returned": session.get("returned"), + "iters": session["iters"], + } + + +def _build_root_spine( + root: AgentTrace, + sessions_by_agent: dict[str, list[dict[str, Any]]], + color_of: dict[str, str], +) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Walk the root's iterations and hang each spawned sub-agent under the + ``spawn_*()`` call that produced it (FIFO per agent name). + + Returns ``(root_steps, unlinked_children)``. ``unlinked`` holds sub-agent + sessions we couldn't tie to a call (count mismatch) so nothing is lost. + """ + + cursor = {name: 0 for name in sessions_by_agent if name != root.name} + steps: list[dict[str, Any]] = [] + for it in root.iterations: + code = "\n".join(b.get("code", "") for b in it.blocks) + children: list[dict[str, Any]] = [] + for name in SPAWN_RE.findall(code): + sessions = sessions_by_agent.get(name) + session = None + if sessions is not None and cursor.get(name, 0) < len(sessions): + session = sessions[cursor[name]] + cursor[name] += 1 + children.append(_child(name, color_of.get(name, "#8a99ad"), session)) + steps.append( + { + "iter": it.n, + "ts": it.timestamp, + "reasoning": it.reasoning, + "blocks": it.blocks, + "delegated": [c["agent"] for c in children], + "children": children, + } + ) + + unlinked: list[dict[str, Any]] = [] + for name, sessions in sessions_by_agent.items(): + if name == root.name: + continue + for session in sessions[cursor.get(name, 0) :]: + unlinked.append(_child(name, color_of.get(name, "#8a99ad"), session)) + return steps, unlinked + + +def load_rollout_dir(path: Path) -> RunTrace: + """Build a :class:`RunTrace` (root spine + causal nesting) from a dir.""" + + path = Path(path) + if not path.is_dir(): + raise NotADirectoryError(f"{path} is not a directory") + + agents: list[AgentTrace] = [] + records_by_agent: dict[str, list[dict[str, Any]]] = {} + for jf in sorted(path.rglob("*.jsonl")): + if jf.name == "status_updates.jsonl": + continue + records = _load_jsonl(jf) + agent = _agent_from_records(jf.stem, records) + if agent is not None and agent.iterations: + agents.append(agent) + records_by_agent[agent.name] = records + + agents.sort(key=lambda a: (a.depth, a.first_ts, a.name)) + color_of = {a.name: PALETTE[i % len(PALETTE)] for i, a in enumerate(agents)} + for a in agents: + a.color = color_of[a.name] + + root = _pick_root(agents) + sessions_by_agent = { + name: _spawn_sessions(records) for name, records in records_by_agent.items() + } + if root is not None: + root_steps, unlinked = _build_root_spine(root, sessions_by_agent, color_of) + else: + root_steps, unlinked = [], [] + + score = _read_json(path / "score.json") + details = score.get("details") or {} + task_ref = score.get("task_ref") or {} + run = _read_json(path / "run.json") + + benchmark = str(task_ref.get("benchmark") or _guess_benchmark(path.name)) + task_id = str(task_ref.get("task_id") or details.get("task_id") or path.name) + models = sorted({a.model for a in agents if a.model}) + if not models and run.get("root_model"): + models = [str(run["root_model"])] + + return RunTrace( + title=path.name, + benchmark=benchmark, + task_id=task_id, + success=score.get("success"), + failure_reason=score.get("failure_reason"), + poc_source=details.get("poc_source"), + models=models, + agents=agents, + root_name=root.name if root else "", + root_result=root.result if root else None, + root_steps=root_steps, + unlinked=unlinked, + ) + + +def _pick_root(agents: list[AgentTrace]) -> AgentTrace | None: + """The depth-0 orchestrator (prefer the conventional ``exploit``).""" + + if not agents: + return None + named = next((a for a in agents if a.name == ROOT_AGENT and a.depth == 0), None) + if named is not None: + return named + return min(agents, key=lambda a: (a.depth, a.first_ts)) + + +def _guess_benchmark(dir_name: str) -> str: + for known in ("cybergym", "bountybench", "evmbench", "noop"): + if dir_name.startswith(known): + return known + return "rollout" diff --git a/tests/test_ra_viewer.py b/tests/test_ra_viewer.py new file mode 100644 index 00000000..90e7bfb1 --- /dev/null +++ b/tests/test_ra_viewer.py @@ -0,0 +1,59 @@ +"""Tests for the reusable ra.viewer (framework-level trace viewer + composer). + +These exercise the viewer with NO kai/findings involvement, proving any ra +agent can render its run trace. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from ra.viewer import Panel, load_rollout_dir, render_page, render_trace_html +from ra.viewer.trace import RunTrace + + +def _write_rollout(dir_path: Path) -> None: + rollouts = dir_path / "rollouts" + rollouts.mkdir() + rows = [ + {"type": "metadata", "agent": "root", "depth": 0, "spawn_id": "r1", + "timestamp": "2026-06-03T00:00:00+00:00", "model": "some/model"}, + {"type": "iteration", "agent": "root", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:01:00+00:00", "response": "thinking", "code_blocks": []}, + {"type": "result", "agent": "root", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:02:00+00:00", "final_answer": "done"}, + ] + (rollouts / "root.jsonl").write_text( + "\n".join(json.dumps(r) for r in rows), encoding="utf-8" + ) + + +def test_render_trace_html_is_self_contained(tmp_path: Path) -> None: + _write_rollout(tmp_path) + html = render_trace_html(load_rollout_dir(tmp_path)) + + assert html.startswith("") + assert "http://" not in html and "https://" not in html + # Trace tab present; no kai Findings tab when used standalone. + assert 'id="view-trace"' in html + assert 'id="view-findings"' not in html + assert "renderTrace();" in html + + +def test_render_page_composes_arbitrary_panels() -> None: + run = RunTrace( + title="t", benchmark=None, task_id="t", success=None, failure_reason=None, + poc_source=None, models=["m"], agents=[], root_name="root", + root_result=None, root_steps=[], unlinked=[], + ) + custom = Panel( + id="notes", label="Notes", + section='

', + css=".notes{}", js="function renderNotes(){document.getElementById('n').textContent='hi';}", + render_call="renderNotes();", + ) + html = render_page({"title": "t", "run": run.as_dict()}, [custom], default_view="notes") + assert 'id="view-notes"' in html + assert "renderNotes();" in html + assert '"id": "notes"' in html or '"id":"notes"' in html diff --git a/tests/test_viewer.py b/tests/test_viewer.py new file mode 100644 index 00000000..b0986ddc --- /dev/null +++ b/tests/test_viewer.py @@ -0,0 +1,147 @@ +"""Tests for the kai run viewer (findings loader + HTML render).""" + +from __future__ import annotations + +import json +from pathlib import Path + +from ra.viewer.trace import RunTrace, load_rollout_dir + +from kai.viewer import load_findings, render_html, write_html + +_EXPLOITS = [ + { + "exploit_id": "e2", + "status": "rejected", + "confirmed": False, + "hypothesis": "Fee truncation rounds small trades to zero.", + "file": "contracts/Fees.sol", + "function": "calcFee", + "category": "theoretical_bounds", + "cvss_score": 4.3, + }, + { + "exploit_id": "e1", + "status": "verified", + "confirmed": True, + "hypothesis": ( + "Reentrancy in withdraw drains the vault. The external call " + "precedes the balance update and there is no guard." + ), + "file": "contracts/Vault.sol", + "function": "withdraw", + "category": "active_exploit", + "severity": "critical", + "cvss_score": 9.1, + "cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H", + "cvss_justification": {"AV": "remote attacker", "AC": "no special conditions"}, + "poc_code": "contract Attacker { function pwn() external {} }", + "patch": "- msg.sender.call{value: amount}(\"\");\n+ balances[msg.sender] -= amount;", + "attacker_role": "anyone", + "prerequisite": "a non-zero deposit", + }, +] + + +def _write_run(dir_path: Path) -> None: + (dir_path / "exploits.json").write_text(json.dumps(_EXPLOITS), encoding="utf-8") + rollouts = dir_path / "rollouts" + rollouts.mkdir() + exploit = [ + {"type": "metadata", "agent": "exploit", "depth": 0, "spawn_id": "r1", + "timestamp": "2026-06-03T00:00:00+00:00", "model": "anthropic/claude-opus-4.8"}, + {"type": "iteration", "agent": "exploit", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:01:00+00:00", + "response": "Analyzing the vault.", "code_blocks": []}, + {"type": "result", "agent": "exploit", "iteration": 1, "spawn_id": "r1", + "timestamp": "2026-06-03T00:02:00+00:00", "final_answer": "done"}, + ] + (rollouts / "exploit.jsonl").write_text( + "\n".join(json.dumps(r) for r in exploit), encoding="utf-8" + ) + + +def test_load_findings_drops_deduplicated(tmp_path: Path) -> None: + records = [ + {"exploit_id": "keep", "status": "verified_and_fixed", "confirmed": True, + "hypothesis": "real bug", "file": "A.sol", "function": "f", + "category": "active_exploit", "severity": "high", "cvss_score": 8.0}, + {"exploit_id": "dup", "status": "deduplicated", "confirmed": None, + "hypothesis": "merged duplicate", "file": "A.sol", "function": "f", + "category": "active_exploit"}, + ] + (tmp_path / "exploits.json").write_text(json.dumps(records), encoding="utf-8") + findings = load_findings(tmp_path) + # The deduplicated bookkeeping shell is hidden; the real finding remains. + assert [f.exploit_id for f in findings] == ["keep"] + + +def test_load_findings_sorts_severity_over_missing_score(tmp_path: Path) -> None: + # A critical finding with a severity label but no CVSS score must still + # outrank a low finding that happens to carry a numeric score. + records = [ + {"exploit_id": "low_scored", "status": "verified", "confirmed": True, + "hypothesis": "low but scored", "severity": "low", "cvss_score": 3.1}, + {"exploit_id": "crit_unscored", "status": "verified", "confirmed": True, + "hypothesis": "critical, no vector", "severity": "critical"}, + ] + (tmp_path / "exploits.json").write_text(json.dumps(records), encoding="utf-8") + assert [f.exploit_id for f in load_findings(tmp_path)] == ["crit_unscored", "low_scored"] + + +def test_load_findings_sorts_and_derives(tmp_path: Path) -> None: + _write_run(tmp_path) + findings = load_findings(tmp_path) + + # Confirmed critical sorts ahead of the unconfirmed lower-severity finding. + assert [f.exploit_id for f in findings] == ["e1", "e2"] + e1, e2 = findings + assert e1.severity == "critical" + assert e1.title.startswith("Reentrancy in withdraw") + # Severity is derived from the CVSS score when the field is absent. + assert e2.severity == "medium" + # The CVSS vector is expanded into ordered, human-readable rows. + assert [r["metric"] for r in e1.cvss_rows] == ["AV", "AC", "PR", "UI", "S", "C", "I", "A"] + assert e1.cvss_rows[0] == {"metric": "AV", "value": "Network", "why": "remote attacker"} + + +def test_load_findings_missing_file_is_empty(tmp_path: Path) -> None: + assert load_findings(tmp_path) == [] + + +def test_render_is_self_contained_and_has_findings(tmp_path: Path) -> None: + _write_run(tmp_path) + html = render_html(load_rollout_dir(tmp_path), load_findings(tmp_path)) + + assert html.startswith("") + # Fully offline: no external resources. + assert "http://" not in html and "https://" not in html + for needle in ( + "Reentrancy in withdraw", + "contracts/Vault.sol", + "active_exploit", + "critical", + "Attacker", # poc_code + "balances[msg.sender]", # patch diff body + ): + assert needle in html + + +def test_render_without_findings_still_renders(tmp_path: Path) -> None: + # A benchmark-style dir: a trace but no exploits.json. + (tmp_path / "rollouts").mkdir() + run = RunTrace( + title="t", benchmark="rollout", task_id="t", success=None, + failure_reason=None, poc_source=None, models=[], agents=[], + root_name="", root_result=None, root_steps=[], unlinked=[], + ) + html = render_html(run) + assert html.startswith("") + assert "No findings recorded" in html or "view-findings" in html + + +def test_write_html_creates_file(tmp_path: Path) -> None: + _write_run(tmp_path) + out = write_html(tmp_path) + assert out == tmp_path / "trace.html" + assert "Reentrancy in withdraw" in out.read_text(encoding="utf-8")