From 5523ad39d8c5fad24d95344a71772ec12d363985 Mon Sep 17 00:00:00 2001 From: aktasbatuhan Date: Wed, 3 Jun 2026 22:21:11 +0100 Subject: [PATCH] feat(viewer): sortable programs browser for a run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds /setup/{label}/run/{idx}/programs — a full-width, sortable table of every program in a run, modeled on AlphaEvolve's Programs browser. It surfaces the task-specific metric columns (e.g. raw_C, sequence_length, wall_seconds for autocorrelation; circles_n, sum_of_radii for packing) that the narrow dashboard step-list hides, auto-detecting the numeric metric keys present in the run (in first-seen order, capped at four columns) and excluding by name the folded/constant ones (combined_score, combined_score_std, stage, runs_successfully). Sorting is server-side (no JS): each column header links to ?sort={col}&dir={asc|desc}, defaulting to score desc to surface the best program first; an unknown sort key falls back rather than erroring. Every row links back into the dashboard at ?step={n}; the run's best row is tinted and new-best steps keep the left accent bar. The dashboard gains a "browse all N programs as a table" link. Tests cover the route, metric-column selection (task metric in, universal keys out), the dashboard link, ?step row links, and sort-param fallback. 
--- kaievolve/viewer/server.py | 180 +++++++++++++++++++++++++++++++++++++ tests/test_viewer.py | 24 ++++- 2 files changed, 203 insertions(+), 1 deletion(-) diff --git a/kaievolve/viewer/server.py b/kaievolve/viewer/server.py index d9779fc..baaf565 100644 --- a/kaievolve/viewer/server.py +++ b/kaievolve/viewer/server.py @@ -122,6 +122,12 @@ .dash-list td { padding:5px 10px; } .dash-list tr.sel td { background:#eef4fb; } .dash-list tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); } + /* shared row accents (programs table + dash list): best row tinted, new-best + steps carry a left accent bar on the first cell */ + tr.sel td { background:#eef4fb; } + tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); } + th.sort a { color:var(--accent); } + th .arrow { color:var(--accent); font-size:0.78rem; } .dash-list a.row { display:block; color:inherit; text-decoration:none; } .dash-list tr:hover td { background:#f4f7fb; } .detail { min-width:0; } @@ -207,6 +213,10 @@ each attempt{% if has_std %} · whiskers = ±1σ score noise{% endif %}. Models: {% for m in models %}{{ m.name }} {{ m.count }}{% if not loop.last %} · {% endif %}{% endfor %}

+
+ browse all {{ steps_n }} programs as a table → +
+
@@ -251,6 +261,31 @@ {% endblock %} """ +_PROGRAMS = """{% extends "base" %}{% block title %}{{ short }} run {{ seed }} programs - kai{% endblock %} +{% block content %} +

{{ short }} · run {{ seed }} · all programs

+

{{ task }} · {{ rows|length }} programs · sorted by {{ sort_label }} ({{ dir }})

+ +{% if rows %} +
+{% for c in columns %}{% endfor %} +{% for r in rows %} + +{% for cell in r.cells %}{% endfor %} + +{% endfor %} +
{{ c.label }}{% if c.arrow %} {{ c.arrow }}{% endif %}
{% if cell.href %}{{ cell.text }}{% else %}{{ cell.text }}{% endif %}
+

Each row links to that program in the dashboard. The tinted row is +the run's best; a bar on the left marks a step that set a new best. Click a column +header to re-sort.

+{% else %} +

No programs recorded for this run yet.

+{% endif %} +{% endblock %} +""" + _COMPARE = """{% extends "base" %}{% block title %}compare - kai{% endblock %} {% block content %}

compare setups

@@ -300,6 +335,7 @@ "overview": _OVERVIEW, "setup": _SETUP, "dashboard": _DASHBOARD, + "programs": _PROGRAMS, "compare": _COMPARE, "glossary": _GLOSSARY, } @@ -342,6 +378,37 @@ def _fmt_cost(v: float) -> str: return f"${v:.3f}" if v else "-" +# columns folded into the score column or constant across a run — never their +# own column in the programs table. +_PROG_EXCLUDE = {"combined_score", "combined_score_std", "stage", "runs_successfully"} +# sort keys that aren't task metrics; everything else must be a metric column. +_PROG_FIXED_SORTS = {"step", "score", "delta", "model"} + + +def _metric_columns(steps, exclude=_PROG_EXCLUDE, cap: int = 4) -> list: + """Task-specific numeric metric keys that actually appear, in first-seen + order (e.g. raw_C, sequence_length, wall_seconds). Constant/universal keys + are excluded so the table only carries columns with real signal.""" + seen: list = [] + for s in steps: + for k, v in (s.metrics or {}).items(): + if k in exclude or k in seen: + continue + if isinstance(v, (int, float)) and not isinstance(v, bool): + seen.append(k) + return seen[:cap] + + +def _fmt_metric(v) -> str: + if isinstance(v, bool): + return str(v) + if isinstance(v, int): + return str(v) + if isinstance(v, float): + return str(int(v)) if v.is_integer() else f"{v:.4f}" + return "-" if v is None else str(v) + + # ─── app factory ───────────────────────────────────────────────────────────── @@ -579,6 +646,119 @@ def run_page(label: str, idx: int, step: Optional[int] = None): def step_page(label: str, idx: int, iteration: int): return RedirectResponse(f"/setup/{label}/run/{idx}?step={iteration}", status_code=307) + # ── programs browser: full-width, sortable table of every program in a run ── + @app.get("/setup/{label}/run/{idx}/programs", response_class=HTMLResponse) + def programs_page(label: str, idx: int, sort: str = "score", dir: str = "desc"): + arm = arm_or_404(label) + if idx < 0 or idx >= len(arm.run_dirs): + raise HTTPException(404, f"run {idx} out 
of range for {label}") + rd = arm.run_dirs[idx] + rs = explore.run_state(rd) + steps = explore.steps_for_run(rd) + + metric_keys = _metric_columns(steps) + allowed = _PROG_FIXED_SORTS | set(metric_keys) + if sort not in allowed: + sort = "score" + if dir not in ("asc", "desc"): + dir = "desc" + + # running-best bookkeeping (independent of the display sort) + improved, b, best_iter, best_sc = set(), float("-inf"), None, float("-inf") + for s in steps: + if s.score is not None: + if s.score > b + 1e-12: + improved.add(s.iteration) + b = max(b, s.score) + if s.score > best_sc: + best_sc, best_iter = s.score, s.iteration + + def _sortval(s): + if sort == "step": + return s.iteration + if sort == "score": + return s.score if s.score is not None else float("-inf") + if sort == "delta": + return s.delta if s.delta is not None else float("-inf") + if sort == "model": + return explore.short_model(s.model) + v = (s.metrics or {}).get(sort) + return v if isinstance(v, (int, float)) else float("-inf") + + steps_sorted = sorted(steps, key=_sortval, reverse=(dir == "desc")) + + # column descriptors: step, score, Δ, , model — all sortable + col_defs = [("step", "step", True), ("score", "score", True), ("delta", "Δ", True)] + col_defs += [(k, k, True) for k in metric_keys] + col_defs += [("model", "model", False)] + # step/model read most naturally ascending; scores/metrics descending + natural = {"step": "asc", "model": "asc"} + + def _href(key): + nd = ("asc" if dir == "desc" else "desc") if sort == key else natural.get(key, "desc") + return f"/setup/{label}/run/{idx}/programs?sort={key}&dir={nd}" + + columns = [ + { + "label": lbl, + "num": num, + "active": key == sort, + "arrow": ("▼" if dir == "desc" else "▲") if key == sort else "", + "href": _href(key), + } + for key, lbl, num in col_defs + ] + + base = f"/setup/{label}/run/{idx}" + rows = [] + for s in steps_sorted: + sc = _fmt_score(s.score) + sd = (s.metrics or {}).get("combined_score_std") + if isinstance(sd, (int, 
float)) and sd > 0: + sc = f"{sc} ±{sd:.3f}" + d, dcls = "", "" + if s.delta is not None and abs(s.delta) > 1e-9: + d, dcls = f"{s.delta:+.4f}", ("pos" if s.delta > 0 else "neg") + href = f"{base}?step={s.iteration}" + cells = [ + {"text": s.iteration, "num": True, "cls": "", "href": href}, + {"text": sc, "num": True, "cls": "", "href": ""}, + {"text": d, "num": True, "cls": dcls, "href": ""}, + ] + cells += [ + {"text": _fmt_metric((s.metrics or {}).get(k)), "num": True, "cls": "", "href": ""} + for k in metric_keys + ] + cells.append( + {"text": explore.short_model(s.model), "num": False, "cls": "", "href": href} + ) + rows.append( + { + "cells": cells, + "best": s.iteration == best_iter, + "improved": s.iteration in improved, + } + ) + + return render( + "programs", + crumbs=[ + ("overview", "/"), + (_short(label), f"/setup/{label}"), + (f"run {rs.seed}", f"/setup/{label}/run/{idx}"), + ("programs", None), + ], + label=label, + idx=idx, + short=_short(label), + task=_task_of(label), + seed=rs.seed, + columns=columns, + rows=rows, + sort_label="Δ" if sort == "delta" else sort, + dir=dir, + ) + # ── compare ─────────────────────────────────────────────────────────────── @app.get("/compare", response_class=HTMLResponse) def compare_page(): diff --git a/tests/test_viewer.py b/tests/test_viewer.py index 2a6d74d..efb60eb 100644 --- a/tests/test_viewer.py +++ b/tests/test_viewer.py @@ -118,7 +118,7 @@ def _make_tree(root: Path): "id": "p1", "parent_id": None, "iteration_found": 1, - "metrics": {"combined_score": 0.42}, + "metrics": {"combined_score": 0.42, "raw_C": 1.55, "stage": 2.0}, "code": "x = 1 < 2 # ok\n", "summary": { "hypothesis": "try X", @@ -173,6 +173,28 @@ def test_code_is_escaped_in_step(self): r = self.client.get("/setup/auto_full/run/0/step/1") self.assertIn("1 < 2", r.text) + def test_programs_table(self): + r = self.client.get("/setup/auto_full/run/0/programs") + self.assertEqual(r.status_code, 200) + self.assertIn("all programs", r.text) + # a task 
metric becomes a column; folded/constant universal keys do not + self.assertIn("raw_C", r.text) + self.assertNotIn("runs_successfully", r.text) + self.assertNotIn(">stage<", r.text) # 'stage' is excluded as a column header + # rows link back into the dashboard's ?step= selector + self.assertIn("?step=1", r.text) + # the dashboard offers a link into this table + self.assertIn("/programs", self.client.get("/setup/auto_full/run/0").text) + + def test_programs_sort_params(self): + # every sort key resolves; an unknown key falls back rather than 500ing + for url in ( + "/setup/auto_full/run/0/programs?sort=step&dir=asc", + "/setup/auto_full/run/0/programs?sort=raw_C&dir=desc", + "/setup/auto_full/run/0/programs?sort=bogus&dir=sideways", + ): + self.assertEqual(self.client.get(url).status_code, 200, url) + if __name__ == "__main__": unittest.main()