From 5523ad39d8c5fad24d95344a71772ec12d363985 Mon Sep 17 00:00:00 2001
From: aktasbatuhan <aktasbatuhann@gmail.com>
Date: Wed, 3 Jun 2026 22:21:11 +0100
Subject: [PATCH] feat(viewer): sortable programs browser for a run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds /setup/{label}/run/{idx}/programs — a full-width, sortable table of
every program in a run, modeled on AlphaEvolve's Programs browser. It
surfaces the task-specific metric columns (e.g. raw_C, sequence_length,
wall_seconds for autocorrelation; circles_n, sum_of_radii for packing)
that the narrow dashboard step-list hides: the first four numeric metric
keys seen across the run's steps become columns, skipping a fixed list of
folded/constant ones (combined_score, combined_score_std, stage,
runs_successfully).

Sorting is server-side (no JS): each column header links to
?sort={col}&dir={asc|desc}, defaulting to score desc to surface the best
program first; an unknown sort key falls back rather than erroring. Every
row links back into the dashboard at ?step={n}; the run's best row is
tinted and new-best steps keep the left accent bar. The dashboard gains a
"browse all N programs as a table" link.

Tests cover the route, metric-column selection (task metric in, universal
keys out), the dashboard link, ?step row links, and sort-param fallback.
---
 kaievolve/viewer/server.py | 180 +++++++++++++++++++++++++++++++++++++
 tests/test_viewer.py       |  24 ++++-
 2 files changed, 203 insertions(+), 1 deletion(-)
diff --git a/kaievolve/viewer/server.py b/kaievolve/viewer/server.py
index d9779fc..baaf565 100644
--- a/kaievolve/viewer/server.py
+++ b/kaievolve/viewer/server.py
@@ -122,6 +122,12 @@
   .dash-list td { padding:5px 10px; }
   .dash-list tr.sel td { background:#eef4fb; }
   .dash-list tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); }
+  /* shared row accents (programs table + dash list): best row tinted, new-best
+     steps carry a left accent bar on the first cell */
+  tr.sel td { background:#eef4fb; }
+  tr.imp td:first-child { box-shadow:inset 3px 0 0 var(--accent); }
+  th.sort a { color:var(--accent); }
+  th .arrow { color:var(--accent); font-size:0.78rem; }
   .dash-list a.row { display:block; color:inherit; text-decoration:none; }
   .dash-list tr:hover td { background:#f4f7fb; }
   .detail { min-width:0; }
@@ -207,6 +213,10 @@
 each attempt{% if has_std %} · whiskers = ±1σ score noise{% endif %}. Models:
 {% for m in models %}<b>{{ m.name }}</b> {{ m.count }}{% if not loop.last %} · {% endif %}{% endfor %}</p>
 
+<div class="btn-row">
+  <a class="btn ghost sm" href="/setup/{{ label }}/run/{{ idx }}/programs">browse all {{ steps_n }} programs as a table &rarr;</a>
+</div>
+
 <div class="dash">
   <div class="dash-list">
     <table>
@@ -251,6 +261,31 @@
 {% endblock %}
 """
 
+_PROGRAMS = """{% extends "base" %}{% block title %}{{ short }} run {{ seed }} programs - kai{% endblock %}
+{% block content %}
+<h1>{{ short }} · run {{ seed }} · all programs</h1>
+<p class="sub">{{ task }} · {{ rows|length }} programs · sorted by {{ sort_label }} ({{ dir }})</p>
+<div class="btn-row">
+  <a class="btn ghost sm" href="/setup/{{ label }}/run/{{ idx }}">&larr; back to dashboard</a>
+</div>
+{% if rows %}
+<table>
+<tr>{% for c in columns %}<th class="{{ 'num' if c.num }}{{ ' sort' if c.active }}"><a href="{{ c.href }}">{{ c.label }}</a>{% if c.arrow %} <span class="arrow">{{ c.arrow }}</span>{% endif %}</th>{% endfor %}</tr>
+{% for r in rows %}
+<tr class="{{ 'sel' if r.best }} {{ 'imp' if r.improved }}">
+{% for cell in r.cells %}<td class="{{ 'num' if cell.num }} {{ cell.cls }}">{% if cell.href %}<a class="row" href="{{ cell.href }}">{{ cell.text }}</a>{% else %}{{ cell.text }}{% endif %}</td>{% endfor %}
+</tr>
+{% endfor %}
+</table>
+<p class="note">Each row links to that program in the dashboard. The tinted row is
+the run's best; a bar on the left marks a step that set a new best. Click a column
+header to re-sort.</p>
+{% else %}
+<p class="note">No programs recorded for this run yet.</p>
+{% endif %}
+{% endblock %}
+"""
+
 _COMPARE = """{% extends "base" %}{% block title %}compare - kai{% endblock %}
 {% block content %}
 <h1>compare setups</h1>
@@ -300,6 +335,7 @@
     "overview": _OVERVIEW,
     "setup": _SETUP,
     "dashboard": _DASHBOARD,
+    "programs": _PROGRAMS,
     "compare": _COMPARE,
     "glossary": _GLOSSARY,
 }
@@ -342,6 +378,37 @@ def _fmt_cost(v: float) -> str:
     return f"${v:.3f}" if v else "-"
 
 
+# metric keys already folded into the score column, or constant across a run —
+# they never get their own column in the programs table.
+_PROG_EXCLUDE = {"combined_score", "combined_score_std", "stage", "runs_successfully"}
+# sort keys read from step attributes; any other accepted key is a metric column.
+_PROG_FIXED_SORTS = {"step", "score", "delta", "model"}
+
+
+def _metric_columns(steps, exclude=_PROG_EXCLUDE, cap: int = 4) -> list:
+    """Return at most ``cap`` metric keys to show as columns, in first-seen
+    order across ``steps``. A key qualifies once any step holds a real number
+    (int/float, not bool) for it; keys in ``exclude`` never qualify."""
+    ordered: dict = {}
+    for step in steps:
+        for key, value in (step.metrics or {}).items():
+            if key in exclude or isinstance(value, bool):
+                continue
+            if isinstance(value, (int, float)):
+                ordered.setdefault(key, None)
+    return list(ordered)[:cap]
+
+
+def _fmt_metric(v) -> str:
+    """Render one metric cell: "-" for None, whole-valued floats without a
+    fractional part, other floats to 4 decimals, everything else via str()."""
+    if v is None:
+        return "-"
+    if isinstance(v, float):
+        return f"{int(v)}" if v.is_integer() else f"{v:.4f}"
+    return str(v)
+
+
 # ─── app factory ─────────────────────────────────────────────────────────────
 
 
@@ -579,6 +646,119 @@ def run_page(label: str, idx: int, step: Optional[int] = None):
     def step_page(label: str, idx: int, iteration: int):
         return RedirectResponse(f"/setup/{label}/run/{idx}?step={iteration}", status_code=307)
 
+    # ── programs browser: full-width, sortable table of every program in a run ──
+    @app.get("/setup/{label}/run/{idx}/programs", response_class=HTMLResponse)
+    def programs_page(label: str, idx: int, sort: str = "score", dir: str = "desc"):
+        """Render every program of one run as a server-side sortable table.
+
+        Unknown ``sort``/``dir`` values fall back to score/desc; a bad ``idx`` is a 404.
+        """
+        from urllib.parse import quote  # function-local: no module-level import needed
+
+        arm = arm_or_404(label)
+        if idx < 0 or idx >= len(arm.run_dirs):
+            raise HTTPException(404, f"run {idx} out of range for {label}")
+        rd = arm.run_dirs[idx]
+        rs = explore.run_state(rd)
+        steps = explore.steps_for_run(rd)
+
+        metric_keys = _metric_columns(steps)
+        allowed = _PROG_FIXED_SORTS | set(metric_keys)
+        if sort not in allowed:
+            sort = "score"
+        if dir not in ("asc", "desc"):
+            dir = "desc"
+
+        # running-best bookkeeping (independent of the display sort)
+        improved, best_iter, best_sc = set(), None, float("-inf")
+        for s in steps:
+            if s.score is None:
+                continue
+            if s.score > best_sc + 1e-12:
+                improved.add(s.iteration)
+            if s.score > best_sc:
+                best_sc, best_iter = s.score, s.iteration
+
+        def _sortval(s):
+            if sort == "step":
+                return s.iteration
+            if sort == "model":
+                return explore.short_model(s.model)
+            if sort in ("score", "delta"):
+                v = s.score if sort == "score" else s.delta
+            else:
+                v = (s.metrics or {}).get(sort)
+                v = v if isinstance(v, (int, float)) else None
+            # Missing values and NaN (the only v with v != v) both rank as -inf;
+            # a raw NaN key compares False both ways and scrambles the ordering.
+            return float("-inf") if v is None or v != v else v
+
+        steps_sorted = sorted(steps, key=_sortval, reverse=(dir == "desc"))
+
+        # column descriptors: step, score, Δ, <task metrics…>, model — all sortable
+        col_defs = [("step", "step", True), ("score", "score", True), ("delta", "Δ", True)]
+        col_defs += [(k, k, True) for k in metric_keys]
+        col_defs += [("model", "model", False)]
+        # step/model read most naturally ascending; scores/metrics descending
+        natural = {"step": "asc", "model": "asc"}
+        base = f"/setup/{label}/run/{idx}"
+
+        def _href(key):
+            nd = ("asc" if dir == "desc" else "desc") if sort == key else natural.get(key, "desc")
+            # Metric names are arbitrary evaluator output: percent-encode them so
+            # characters such as & # + % or spaces cannot corrupt the query string.
+            return f"{base}/programs?sort={quote(key, safe='')}&dir={nd}"
+
+        columns = [
+            {
+                "label": lbl,
+                "num": num,
+                "active": key == sort,
+                "arrow": ("▼" if dir == "desc" else "▲") if key == sort else "",
+                "href": _href(key),
+            }
+            for key, lbl, num in col_defs
+        ]
+
+        def _cell(text, num=True, cls="", href=""):
+            return {"text": text, "num": num, "cls": cls, "href": href}
+
+        rows = []
+        for s in steps_sorted:
+            metrics = s.metrics or {}
+            sc = _fmt_score(s.score)
+            sd = metrics.get("combined_score_std")
+            if isinstance(sd, (int, float)) and sd > 0:
+                sc = f"{sc} ±{sd:.3f}"
+            d, dcls = "", ""
+            if s.delta is not None and abs(s.delta) > 1e-9:
+                d, dcls = f"{s.delta:+.4f}", ("pos" if s.delta > 0 else "neg")
+            href = f"{base}?step={s.iteration}"
+            cells = [_cell(s.iteration, href=href), _cell(sc), _cell(d, cls=dcls)]
+            cells += [_cell(_fmt_metric(metrics.get(k))) for k in metric_keys]
+            cells.append(_cell(explore.short_model(s.model), num=False, href=href))
+            best, imp = s.iteration == best_iter, s.iteration in improved
+            rows.append({"cells": cells, "best": best, "improved": imp})
+
+        return render(
+            "programs",
+            crumbs=[
+                ("overview", "/"),
+                (_short(label), f"/setup/{label}"),
+                (f"run {rs.seed}", base),
+                ("programs", None),
+            ],
+            label=label,
+            idx=idx,
+            short=_short(label),
+            task=_task_of(label),
+            seed=rs.seed,
+            columns=columns,
+            rows=rows,
+            sort_label="Δ" if sort == "delta" else sort,
+            dir=dir,
+        )
+
     # ── compare ───────────────────────────────────────────────────────────────
     @app.get("/compare", response_class=HTMLResponse)
     def compare_page():
diff --git a/tests/test_viewer.py b/tests/test_viewer.py
index 2a6d74d..efb60eb 100644
--- a/tests/test_viewer.py
+++ b/tests/test_viewer.py
@@ -118,7 +118,7 @@ def _make_tree(root: Path):
             "id": "p1",
             "parent_id": None,
             "iteration_found": 1,
-            "metrics": {"combined_score": 0.42},
+            "metrics": {"combined_score": 0.42, "raw_C": 1.55, "stage": 2.0},
             "code": "x = 1 < 2  # ok\n",
             "summary": {
                 "hypothesis": "try X",
@@ -173,6 +173,28 @@ def test_code_is_escaped_in_step(self):
         r = self.client.get("/setup/auto_full/run/0/step/1")
         self.assertIn("1 &lt; 2", r.text)
 
+    def test_programs_table(self):
+        """The programs browser renders, picks the right columns, and is linked."""
+        resp = self.client.get("/setup/auto_full/run/0/programs")
+        self.assertEqual(resp.status_code, 200)
+        page = resp.text
+        # task metric becomes a column and rows link to ?step=; universal keys stay out
+        for present in ("all programs", "raw_C", "?step=1"):
+            self.assertIn(present, page)
+        for absent in ("runs_successfully", ">stage<"):
+            self.assertNotIn(absent, page)
+        # the dashboard offers a link into this table
+        self.assertIn("/programs", self.client.get("/setup/auto_full/run/0").text)
+
+    def test_programs_sort_params(self):
+        # every sort key resolves; an unknown key/dir falls back to score desc, no 500
+        base = "/setup/auto_full/run/0/programs"
+        for key in ("step", "score", "delta", "model", "raw_C"):
+            self.assertEqual(self.client.get(f"{base}?sort={key}&dir=asc").status_code, 200, key)
+        r = self.client.get(f"{base}?sort=bogus&dir=sideways")
+        self.assertEqual(r.status_code, 200)
+        self.assertIn("sorted by score (desc)", r.text)
+
 
 if __name__ == "__main__":
     unittest.main()