Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions kaievolve/viewer/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@
.dash-list td { padding:5px 10px; }
.dash-list tr.sel td { background:#eef4fb; }
.dash-list tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); }
/* shared row accents (programs table + dash list): best row tinted, new-best
steps carry a left accent bar on the first cell */
tr.sel td { background:#eef4fb; }
tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); }
th.sort a { color:var(--accent); }
th .arrow { color:var(--accent); font-size:0.78rem; }
.dash-list a.row { display:block; color:inherit; text-decoration:none; }
.dash-list tr:hover td { background:#f4f7fb; }
.detail { min-width:0; }
Expand Down Expand Up @@ -207,6 +213,10 @@
each attempt{% if has_std %} · whiskers = ±1σ score noise{% endif %}. Models:
{% for m in models %}<b>{{ m.name }}</b> {{ m.count }}{% if not loop.last %} · {% endif %}{% endfor %}</p>

<div class="btn-row">
<a class="btn ghost sm" href="/setup/{{ label }}/run/{{ idx }}/programs">browse all {{ steps_n }} programs as a table &rarr;</a>
</div>

<div class="dash">
<div class="dash-list">
<table>
Expand Down Expand Up @@ -251,6 +261,31 @@
{% endblock %}
"""

# Jinja template for the per-run "all programs" table page.
# Context it reads: label, idx, short, seed, task, sort_label, dir,
#   columns -> list of dicts with keys label / num / active / arrow / href
#   rows    -> list of dicts with keys best / improved / cells, where each
#              cell is a dict with keys text / num / cls / href.
# An empty `rows` renders the "No programs recorded" note instead of a table.
_PROGRAMS = """{% extends "base" %}{% block title %}{{ short }} run {{ seed }} programs - kai{% endblock %}
{% block content %}
<h1>{{ short }} · run {{ seed }} · all programs</h1>
<p class="sub">{{ task }} · {{ rows|length }} programs · sorted by {{ sort_label }} ({{ dir }})</p>
<div class="btn-row">
<a class="btn ghost sm" href="/setup/{{ label }}/run/{{ idx }}">&larr; back to dashboard</a>
</div>
{% if rows %}
<table>
<tr>{% for c in columns %}<th class="{{ 'num' if c.num }}{{ ' sort' if c.active }}"><a href="{{ c.href }}">{{ c.label }}</a>{% if c.arrow %} <span class="arrow">{{ c.arrow }}</span>{% endif %}</th>{% endfor %}</tr>
{% for r in rows %}
<tr class="{{ 'sel' if r.best }} {{ 'imp' if r.improved }}">
{% for cell in r.cells %}<td class="{{ 'num' if cell.num }} {{ cell.cls }}">{% if cell.href %}<a class="row" href="{{ cell.href }}">{{ cell.text }}</a>{% else %}{{ cell.text }}{% endif %}</td>{% endfor %}
</tr>
{% endfor %}
</table>
<p class="note">Each row links to that program in the dashboard. The tinted row is
the run's best; a bar on the left marks a step that set a new best. Click a column
header to re-sort.</p>
{% else %}
<p class="note">No programs recorded for this run yet.</p>
{% endif %}
{% endblock %}
"""

_COMPARE = """{% extends "base" %}{% block title %}compare - kai{% endblock %}
{% block content %}
<h1>compare setups</h1>
Expand Down Expand Up @@ -300,6 +335,7 @@
"overview": _OVERVIEW,
"setup": _SETUP,
"dashboard": _DASHBOARD,
"programs": _PROGRAMS,
"compare": _COMPARE,
"glossary": _GLOSSARY,
}
Expand Down Expand Up @@ -342,6 +378,37 @@ def _fmt_cost(v: float) -> str:
return f"${v:.3f}" if v else "-"


# Metric keys that never get their own column in the programs table: they are
# either folded into the score column or constant across a run.
# Kept immutable because _PROG_EXCLUDE doubles as a default argument below and
# must not be mutated by accident.
_PROG_EXCLUDE = frozenset(
    {"combined_score", "combined_score_std", "stage", "runs_successfully"}
)
# Sort keys that aren't task metrics; everything else must be a metric column.
_PROG_FIXED_SORTS = frozenset({"step", "score", "delta", "model"})


def _metric_columns(steps, exclude=_PROG_EXCLUDE, cap: int = 4) -> list:
    """Return the task-specific numeric metric keys to show as columns.

    Keys are collected in first-seen order across ``steps`` (e.g. raw_C,
    sequence_length, wall_seconds). A key qualifies the first time one of its
    values is an int or float; bools are not treated as numeric.

    Args:
        steps: iterable of objects exposing a ``metrics`` mapping (or None).
        exclude: keys that must never become a column.
        cap: maximum number of keys returned.

    Returns:
        A list of at most ``cap`` metric keys.
    """
    # A dict keeps insertion order and gives O(1) "already seen" checks.
    seen: dict = {}
    for step in steps:
        for key, value in (step.metrics or {}).items():
            if key in exclude or key in seen:
                continue
            # bool is a subclass of int, so rule it out explicitly.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                seen[key] = None
    return list(seen)[:cap]


def _fmt_metric(v) -> str:
    """Format one metric value for a table cell.

    None becomes "-"; floats with no fractional part are shown as integers;
    other floats get four decimals; everything else (ints, bools, strings, ...)
    is passed through ``str``.
    """
    if v is None:
        return "-"
    if isinstance(v, float):
        # inf/nan are not "integer" floats, so they take the :.4f branch.
        if v.is_integer():
            return str(int(v))
        return f"{v:.4f}"
    return str(v)


# ─── app factory ─────────────────────────────────────────────────────────────


Expand Down Expand Up @@ -579,6 +646,119 @@ def run_page(label: str, idx: int, step: Optional[int] = None):
def step_page(label: str, idx: int, iteration: int):
return RedirectResponse(f"/setup/{label}/run/{idx}?step={iteration}", status_code=307)

# ── programs browser: full-width, sortable table of every program in a run ──
@app.get("/setup/{label}/run/{idx}/programs", response_class=HTMLResponse)
def programs_page(label: str, idx: int, sort: str = "score", dir: str = "desc"):
    """Render every program of one run as a sortable table.

    ``sort`` is one of the fixed keys (step, score, delta, model) or one of
    the task-metric columns shown for this run; ``dir`` is "asc" or "desc".
    Unknown values fall back to "score" / "desc" rather than erroring.
    Steps that have no usable value for the active sort key (None,
    non-numeric, NaN) are always listed last, whatever the direction.

    Raises a 404 ``HTTPException`` when ``idx`` is outside the arm's run list
    (``arm_or_404`` presumably does the same for an unknown ``label``).

    NOTE: ``dir`` shadows the builtin, but it is the public query-parameter
    name and therefore cannot be renamed.
    """
    import math
    from urllib.parse import quote

    arm = arm_or_404(label)
    if idx < 0 or idx >= len(arm.run_dirs):
        raise HTTPException(404, f"run {idx} out of range for {label}")
    rd = arm.run_dirs[idx]
    rs = explore.run_state(rd)
    steps = explore.steps_for_run(rd)

    metric_keys = _metric_columns(steps)
    allowed = _PROG_FIXED_SORTS | set(metric_keys)
    if sort not in allowed:
        sort = "score"
    if dir not in ("asc", "desc"):
        dir = "desc"
    descending = dir == "desc"

    # Running-best bookkeeping, in run order (independent of the display
    # sort). `improved` holds iterations that beat the best-so-far by more
    # than a tiny epsilon; `best_iter` is the first iteration to reach the max.
    improved, best_iter, best_sc = set(), None, float("-inf")
    for s in steps:
        if s.score is None:
            continue
        if s.score > best_sc + 1e-12:
            improved.add(s.iteration)
        if s.score > best_sc:
            best_sc, best_iter = s.score, s.iteration

    def _sortval(s):
        """Value to order ``s`` by, or None when it has no usable value."""
        if sort == "step":
            return s.iteration
        if sort == "model":
            return explore.short_model(s.model)
        if sort == "score":
            v = s.score
        elif sort == "delta":
            v = s.delta
        else:
            v = (s.metrics or {}).get(sort)
        if not isinstance(v, (int, float)):
            return None
        # NaN compares False against everything and would scramble the sort.
        if isinstance(v, float) and math.isnan(v):
            return None
        return v

    # Sort only the steps that have a value; the rest keep their run order and
    # go at the end, so an ascending sort is not flooded with blanks.
    keyed = [(_sortval(s), s) for s in steps]
    present = [pair for pair in keyed if pair[0] is not None]
    missing = [s for v, s in keyed if v is None]
    present.sort(key=lambda pair: pair[0], reverse=descending)
    steps_sorted = [s for _, s in present] + missing

    # column descriptors: step, score, Δ, <task metrics…>, model — all sortable
    col_defs = [("step", "step", True), ("score", "score", True), ("delta", "Δ", True)]
    col_defs += [(k, k, True) for k in metric_keys]
    col_defs += [("model", "model", False)]
    # step/model read most naturally ascending; scores/metrics descending
    natural = {"step": "asc", "model": "asc"}

    base = f"/setup/{label}/run/{idx}"

    def _href(key):
        """Link for a column header: toggles direction on the active column."""
        if key == sort:
            nd = "asc" if descending else "desc"
        else:
            nd = natural.get(key, "desc")
        # Metric keys are arbitrary strings; encode them for the query string.
        qkey = quote(str(key), safe="")
        return f"{base}/programs?sort={qkey}&dir={nd}"

    columns = [
        {
            "label": lbl,
            "num": num,
            "active": key == sort,
            "arrow": ("▼" if descending else "▲") if key == sort else "",
            "href": _href(key),
        }
        for key, lbl, num in col_defs
    ]

    rows = []
    for s in steps_sorted:
        metrics = s.metrics or {}
        # Score cell: append the ±σ noise when the run recorded one.
        sc = _fmt_score(s.score)
        sd = metrics.get("combined_score_std")
        if isinstance(sd, (int, float)) and sd > 0:
            sc = f"{sc} ±{sd:.3f}"
        # Δ cell stays blank for missing or effectively-zero deltas.
        d, dcls = "", ""
        if s.delta is not None and abs(s.delta) > 1e-9:
            d, dcls = f"{s.delta:+.4f}", ("pos" if s.delta > 0 else "neg")
        # First and last cells link back to this step on the dashboard.
        href = f"{base}?step={s.iteration}"
        cells = [
            {"text": s.iteration, "num": True, "cls": "", "href": href},
            {"text": sc, "num": True, "cls": "", "href": ""},
            {"text": d, "num": True, "cls": dcls, "href": ""},
        ]
        cells += [
            {"text": _fmt_metric(metrics.get(k)), "num": True, "cls": "", "href": ""}
            for k in metric_keys
        ]
        cells.append(
            {"text": explore.short_model(s.model), "num": False, "cls": "", "href": href}
        )
        rows.append(
            {
                "cells": cells,
                "best": s.iteration == best_iter,
                "improved": s.iteration in improved,
            }
        )

    return render(
        "programs",
        crumbs=[
            ("overview", "/"),
            (_short(label), f"/setup/{label}"),
            (f"run {rs.seed}", base),
            ("programs", None),
        ],
        label=label,
        idx=idx,
        short=_short(label),
        task=_task_of(label),
        seed=rs.seed,
        columns=columns,
        rows=rows,
        sort_label="Δ" if sort == "delta" else sort,
        dir=dir,
    )

# ── compare ───────────────────────────────────────────────────────────────
@app.get("/compare", response_class=HTMLResponse)
def compare_page():
Expand Down
24 changes: 23 additions & 1 deletion tests/test_viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def _make_tree(root: Path):
"id": "p1",
"parent_id": None,
"iteration_found": 1,
"metrics": {"combined_score": 0.42},
"metrics": {"combined_score": 0.42, "raw_C": 1.55, "stage": 2.0},
"code": "x = 1 < 2 # ok\n",
"summary": {
"hypothesis": "try X",
Expand Down Expand Up @@ -173,6 +173,28 @@ def test_code_is_escaped_in_step(self):
r = self.client.get("/setup/auto_full/run/0/step/1")
self.assertIn("1 &lt; 2", r.text)

def test_programs_table(self):
    """The programs page renders, shows task-metric columns and links back."""
    resp = self.client.get("/setup/auto_full/run/0/programs")
    self.assertEqual(resp.status_code, 200)
    page = resp.text
    self.assertIn("all programs", page)
    # a task metric becomes a column; folded/constant universal keys do not
    self.assertIn("raw_C", page)
    self.assertNotIn("runs_successfully", page)
    # 'stage' is excluded as a column header
    self.assertNotIn(">stage<", page)
    # rows link back into the dashboard's ?step= selector
    self.assertIn("?step=1", page)
    # the dashboard offers a link into this table
    dashboard = self.client.get("/setup/auto_full/run/0").text
    self.assertIn("/programs", dashboard)

def test_programs_sort_params(self):
    """Every sort key resolves; unknown sort/dir values fall back to score/desc."""
    base = "/setup/auto_full/run/0/programs"
    # (query string, effective sort shown in the page subtitle)
    cases = (
        ("sort=step&dir=asc", "sorted by step (asc)"),
        ("sort=raw_C&dir=desc", "sorted by raw_C (desc)"),
        # bogus values must fall back rather than 500
        ("sort=bogus&dir=sideways", "sorted by score (desc)"),
    )
    for query, subtitle in cases:
        # subTest reports each URL separately instead of stopping at the first
        with self.subTest(query=query):
            r = self.client.get(f"{base}?{query}")
            self.assertEqual(r.status_code, 200, query)
            self.assertIn(subtitle, r.text)


# Allow running this test module directly, e.g. `python tests/test_viewer.py`.
if __name__ == "__main__":
    unittest.main()
Loading