From 5523ad39d8c5fad24d95344a71772ec12d363985 Mon Sep 17 00:00:00 2001
From: aktasbatuhan
Date: Wed, 3 Jun 2026 22:21:11 +0100
Subject: [PATCH] feat(viewer): sortable programs browser for a run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds /setup/{label}/run/{idx}/programs — a full-width, sortable table of
every program in a run, modeled on AlphaEvolve's Programs browser. It
surfaces the task-specific metric columns (e.g. raw_C, sequence_length,
wall_seconds for autocorrelation; circles_n, sum_of_radii for packing)
that the narrow dashboard step-list hides, auto-detecting which numeric
metrics appear in the run (the first four, in first-seen order) and
excluding the folded/constant ones (combined_score, combined_score_std,
stage, runs_successfully).

Sorting is server-side (no JS): each column header links to
?sort={col}&dir={asc|desc}, defaulting to score desc to surface the best
program first; an unknown sort key falls back rather than erroring. Every
row links back into the dashboard at ?step={n}; the run's best row is
tinted and new-best steps keep the left accent bar. The dashboard gains a
"browse all N programs as a table" link.
Tests cover the route, metric-column selection (task metric in, universal
keys out), the dashboard link, ?step row links, and sort-param fallback.
---
kaievolve/viewer/server.py | 180 +++++++++++++++++++++++++++++++++++++
tests/test_viewer.py | 24 ++++-
2 files changed, 203 insertions(+), 1 deletion(-)
diff --git a/kaievolve/viewer/server.py b/kaievolve/viewer/server.py
index d9779fc..baaf565 100644
--- a/kaievolve/viewer/server.py
+++ b/kaievolve/viewer/server.py
@@ -122,6 +122,12 @@
.dash-list td { padding:5px 10px; }
.dash-list tr.sel td { background:#eef4fb; }
.dash-list tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); }
+ /* shared row accents (programs table + dash list): best row tinted, new-best
+ steps carry a left accent bar on the first cell */
+ tr.sel td { background:#eef4fb; }
+ tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); }
+ th.sort a { color:var(--accent); }
+ th .arrow { color:var(--accent); font-size:0.78rem; }
.dash-list a.row { display:block; color:inherit; text-decoration:none; }
.dash-list tr:hover td { background:#f4f7fb; }
.detail { min-width:0; }
@@ -207,6 +213,10 @@
each attempt{% if has_std %} · whiskers = ±1σ score noise{% endif %}. Models:
{% for m in models %}{{ m.name }} {{ m.count }}{% if not loop.last %} · {% endif %}{% endfor %}
+
+
@@ -251,6 +261,31 @@
{% endblock %}
"""
+# NOTE(review): the element markup in this template was reconstructed from the
+# context programs_page passes in (columns / rows / cells) and the shared CSS
+# hooks (tr.sel, tr.imp, th.sort, .arrow) — confirm tag and class names against
+# the other templates in this module.
+_PROGRAMS = """{% extends "base" %}{% block title %}{{ short }} run {{ seed }} programs - kai{% endblock %}
+{% block content %}
+<h1>{{ short }} · run {{ seed }} · all programs</h1>
+<p>{{ task }} · {{ rows|length }} programs · sorted by {{ sort_label }} ({{ dir }})</p>
+
+{% if rows %}
+<table>
+<tr>{% for c in columns %}<th class="{% if c.num %}num{% endif %}{% if c.active %} sort{% endif %}"><a href="{{ c.href }}">{{ c.label }}</a>{% if c.arrow %} <span class="arrow">{{ c.arrow }}</span>{% endif %}</th>{% endfor %}</tr>
+{% for r in rows %}
+<tr class="{% if r.best %}sel{% endif %}{% if r.improved %} imp{% endif %}">
+{% for cell in r.cells %}<td class="{% if cell.num %}num{% endif %}{% if cell.cls %} {{ cell.cls }}{% endif %}">{% if cell.href %}<a href="{{ cell.href }}">{{ cell.text }}</a>{% else %}{{ cell.text }}{% endif %}</td>{% endfor %}
+</tr>
+{% endfor %}
+</table>
+<p>Each row links to that program in the dashboard. The tinted row is
+the run's best; a bar on the left marks a step that set a new best. Click a column
+header to re-sort.</p>
+{% else %}
+<p>No programs recorded for this run yet.</p>
+{% endif %}
+{% endblock %}
+"""
+
_COMPARE = """{% extends "base" %}{% block title %}compare - kai{% endblock %}
{% block content %}
compare setups
@@ -300,6 +335,7 @@
"overview": _OVERVIEW,
"setup": _SETUP,
"dashboard": _DASHBOARD,
+ "programs": _PROGRAMS,
"compare": _COMPARE,
"glossary": _GLOSSARY,
}
@@ -342,6 +378,37 @@ def _fmt_cost(v: float) -> str:
return f"${v:.3f}" if v else "-"
+# columns folded into the score column or constant across a run — never their
+# own column in the programs table.
+_PROG_EXCLUDE = {"combined_score", "combined_score_std", "stage", "runs_successfully"}
+# sort keys that aren't task metrics; everything else must be a metric column.
+_PROG_FIXED_SORTS = {"step", "score", "delta", "model"}
+
+
+def _metric_columns(steps, exclude=_PROG_EXCLUDE, cap: int = 4) -> list:
+    """Return the task-specific numeric metric keys to show as table columns.
+
+    Keys are collected in first-seen order across ``steps`` (e.g. raw_C,
+    sequence_length, wall_seconds). A key qualifies as soon as any step
+    carries an int or float (not bool) value for it. Keys in ``exclude`` are
+    never returned. Variation across steps is NOT checked: the folded/constant
+    keys are dropped only because they are listed in ``exclude``.
+
+    Args:
+        steps: iterable of step objects exposing a ``metrics`` mapping (or None).
+        exclude: metric keys that must never become a column.
+        cap: number of leading keys to keep.
+
+    Returns:
+        The first ``cap`` qualifying keys, in first-seen order.
+    """
+    # a dict doubles as an insertion-ordered set: O(1) membership, stable order
+    seen: dict = {}
+    for s in steps:
+        for k, v in (s.metrics or {}).items():
+            if k in exclude or k in seen:
+                continue
+            # bool is an int subclass; flags are not metric columns
+            if isinstance(v, (int, float)) and not isinstance(v, bool):
+                seen[k] = None
+                if len(seen) == cap:
+                    # later keys would be sliced off anyway; stop scanning
+                    return list(seen)
+    return list(seen)[:cap]
+
+
+def _fmt_metric(v) -> str:
+    """Format one metric value for a table cell.
+
+    None renders as "-"; whole-number floats render without a fractional
+    part; other floats get four decimals; everything else (ints, bools,
+    strings, ...) is passed through ``str``.
+    """
+    if v is None:
+        return "-"
+    if isinstance(v, float):
+        if v.is_integer():
+            return str(int(v))
+        return f"{v:.4f}"
+    return str(v)
+
+
# ─── app factory ─────────────────────────────────────────────────────────────
@@ -579,6 +646,119 @@ def run_page(label: str, idx: int, step: Optional[int] = None):
def step_page(label: str, idx: int, iteration: int):
return RedirectResponse(f"/setup/{label}/run/{idx}?step={iteration}", status_code=307)
+ # ── programs browser: full-width, sortable table of every program in a run ──
+    @app.get("/setup/{label}/run/{idx}/programs", response_class=HTMLResponse)
+    def programs_page(label: str, idx: int, sort: str = "score", dir: str = "desc"):
+        """Render every program of one run as a server-side sortable table.
+
+        Args:
+            label: setup label from the URL; resolved via ``arm_or_404``.
+            idx: index into the setup's ``run_dirs``.
+            sort: sort key — one of the fixed keys (step/score/delta/model) or
+                a detected metric column; anything else falls back to "score".
+            dir: "asc" or "desc"; anything else falls back to "desc". The name
+                shadows the builtin because it is the public query parameter.
+
+        Raises:
+            HTTPException: 404 when ``idx`` is outside the setup's run list.
+        """
+        from urllib.parse import urlencode
+
+        arm = arm_or_404(label)
+        if idx < 0 or idx >= len(arm.run_dirs):
+            raise HTTPException(404, f"run {idx} out of range for {label}")
+        rd = arm.run_dirs[idx]
+        rs = explore.run_state(rd)
+        steps = explore.steps_for_run(rd)
+
+        metric_keys = _metric_columns(steps)
+        allowed = _PROG_FIXED_SORTS | set(metric_keys)
+        if sort not in allowed:
+            sort = "score"
+        if dir not in ("asc", "desc"):
+            dir = "desc"
+
+        # running-best bookkeeping (independent of the display sort): a step is
+        # "improved" when it beats the best so far by more than 1e-12; the best
+        # row is the first step to reach the maximum score
+        improved, best_iter, best_sc = set(), None, float("-inf")
+        for s in steps:
+            if s.score is None:
+                continue
+            if s.score > best_sc + 1e-12:
+                improved.add(s.iteration)
+            if s.score > best_sc:
+                best_sc, best_iter = s.score, s.iteration
+
+        floor = float("-inf")
+
+        def _or_floor(v):
+            # None and NaN sort as the lowest value; NaN != NaN, and a NaN sort
+            # key would leave the order produced by sorted() undefined
+            return floor if v is None or v != v else v
+
+        def _sortval(s):
+            if sort == "step":
+                return s.iteration
+            if sort == "score":
+                return _or_floor(s.score)
+            if sort == "delta":
+                return _or_floor(s.delta)
+            if sort == "model":
+                return explore.short_model(s.model)
+            v = (s.metrics or {}).get(sort)
+            return _or_floor(v) if isinstance(v, (int, float)) else floor
+
+        steps_sorted = sorted(steps, key=_sortval, reverse=(dir == "desc"))
+
+        # column descriptors (key, header label, numeric?): step, score, Δ, the
+        # detected metric columns, then model — every column is sortable
+        col_defs = [("step", "step", True), ("score", "score", True), ("delta", "Δ", True)]
+        col_defs += [(k, k, True) for k in metric_keys]
+        col_defs += [("model", "model", False)]
+        # step/model read most naturally ascending; scores/metrics descending
+        natural = {"step": "asc", "model": "asc"}
+        base = f"/setup/{label}/run/{idx}"
+
+        def _href(key):
+            # clicking the active column flips its direction; any other column
+            # starts from its natural direction
+            if key == sort:
+                nd = "asc" if dir == "desc" else "desc"
+            else:
+                nd = natural.get(key, "desc")
+            # metric keys come from run data, so encode them instead of
+            # splicing them raw into the query string
+            return f"{base}/programs?" + urlencode({"sort": key, "dir": nd})
+
+        columns = [
+            {
+                "label": lbl,
+                "num": num,
+                "active": key == sort,
+                "arrow": ("▼" if dir == "desc" else "▲") if key == sort else "",
+                "href": _href(key),
+            }
+            for key, lbl, num in col_defs
+        ]
+
+        rows = []
+        for s in steps_sorted:
+            sc = _fmt_score(s.score)
+            # the score noise is folded into the score cell rather than shown
+            # as its own column
+            sd = (s.metrics or {}).get("combined_score_std")
+            if isinstance(sd, (int, float)) and sd > 0:
+                sc = f"{sc} ±{sd:.3f}"
+            d, dcls = "", ""
+            if s.delta is not None and abs(s.delta) > 1e-9:
+                d, dcls = f"{s.delta:+.4f}", ("pos" if s.delta > 0 else "neg")
+            # step and model cells link back to the dashboard's ?step= selector
+            href = f"{base}?step={s.iteration}"
+            cells = [
+                {"text": s.iteration, "num": True, "cls": "", "href": href},
+                {"text": sc, "num": True, "cls": "", "href": ""},
+                {"text": d, "num": True, "cls": dcls, "href": ""},
+            ]
+            cells += [
+                {"text": _fmt_metric((s.metrics or {}).get(k)), "num": True, "cls": "", "href": ""}
+                for k in metric_keys
+            ]
+            cells.append(
+                {"text": explore.short_model(s.model), "num": False, "cls": "", "href": href}
+            )
+            rows.append(
+                {
+                    "cells": cells,
+                    "best": s.iteration == best_iter,
+                    "improved": s.iteration in improved,
+                }
+            )
+
+        return render(
+            "programs",
+            crumbs=[
+                ("overview", "/"),
+                (_short(label), f"/setup/{label}"),
+                (f"run {rs.seed}", base),
+                ("programs", None),
+            ],
+            label=label,
+            idx=idx,
+            short=_short(label),
+            task=_task_of(label),
+            seed=rs.seed,
+            columns=columns,
+            rows=rows,
+            sort_label="Δ" if sort == "delta" else sort,
+            dir=dir,
+        )
+
# ── compare ───────────────────────────────────────────────────────────────
@app.get("/compare", response_class=HTMLResponse)
def compare_page():
diff --git a/tests/test_viewer.py b/tests/test_viewer.py
index 2a6d74d..efb60eb 100644
--- a/tests/test_viewer.py
+++ b/tests/test_viewer.py
@@ -118,7 +118,7 @@ def _make_tree(root: Path):
"id": "p1",
"parent_id": None,
"iteration_found": 1,
- "metrics": {"combined_score": 0.42},
+ "metrics": {"combined_score": 0.42, "raw_C": 1.55, "stage": 2.0},
"code": "x = 1 < 2 # ok\n",
"summary": {
"hypothesis": "try X",
@@ -173,6 +173,28 @@ def test_code_is_escaped_in_step(self):
r = self.client.get("/setup/auto_full/run/0/step/1")
self.assertIn("1 < 2", r.text)
+    def test_programs_table(self):
+        """The programs page renders, shows task metrics only, and is linked both ways."""
+        page = self.client.get("/setup/auto_full/run/0/programs")
+        self.assertEqual(page.status_code, 200)
+        html = page.text
+        self.assertIn("all programs", html)
+        # task metrics get a column ...
+        self.assertIn("raw_C", html)
+        # ... while the folded/constant universal keys never do
+        self.assertNotIn("runs_successfully", html)
+        self.assertNotIn(">stage<", html)  # 'stage' is excluded as a column header
+        # each row points back at the dashboard's ?step= selector
+        self.assertIn("?step=1", html)
+        # and the dashboard links forward into this table
+        dashboard = self.client.get("/setup/auto_full/run/0")
+        self.assertIn("/programs", dashboard.text)
+
+    def test_programs_sort_params(self):
+        """Known sort keys resolve, and an unknown key/direction still returns 200."""
+        urls = (
+            "/setup/auto_full/run/0/programs?sort=step&dir=asc",
+            "/setup/auto_full/run/0/programs?sort=raw_C&dir=desc",
+            "/setup/auto_full/run/0/programs?sort=bogus&dir=sideways",
+        )
+        for url in urls:
+            response = self.client.get(url)
+            # the URL is the failure message so the offending query is visible
+            self.assertEqual(response.status_code, 200, url)
+
if __name__ == "__main__":
unittest.main()