OpenHands · neubig · May 10, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/.github/workflows/build-programbench-images.yml b/.github/workflows/build-programbench-images.yml
@@ -0,0 +1,139 @@
+name: Build ProgramBench Images
+
+run-name: >-
+  Build ProgramBench Images
+  ${{ github.event_name == 'workflow_dispatch' && inputs.sdk-commit != '' && format('(SDK: {0})', inputs.sdk-commit) || '' }}
+
+# ProgramBench task images are published by the upstream project at
+# programbench/<owner>_1776_<repo>.<sha>:task_cleanroom on Docker Hub. There
+# is nothing to build on our side at orchestrator time — we layer
+# openhands-agent-server onto each cleanroom image at runtime via
+# DockerDevWorkspace inside `programbench-infer`.
+#
+# This reusable workflow exists purely to satisfy the eval-job.yml shape in
+# the OpenHands/evaluation repo (each benchmark must contribute a
+# build-<benchmark>-images.yml that the orchestrator depends on). It runs a
+# cheap reachability check against Docker Hub for the requested instances
+# and exits successfully, so the downstream eval phase can proceed.
+
+on:
+  workflow_call:
+    inputs:
+      sdk-commit:
+        description: 'Software Agent SDK commit/ref to use (for parity with other build workflows; not consumed by ProgramBench)'
+        required: false
+        type: string
+        default: ''
+      n-limit:
+        description: 'Number of instances to verify reachability for (0 = all)'
+        required: false
+        type: string
+        default: '5'
+      instance-ids:
+        description: 'Comma-separated list of instance IDs to verify (overrides n-limit)'
+        required: false
+        type: string
+        default: ''
+      benchmarks-ref:
+        description: 'Benchmarks repo ref to checkout (for cross-repo calls)'
+        required: false
+        type: string
+        default: ''
+  workflow_dispatch:
+    inputs:
+      # Label-only input. `run-name` checks `inputs.sdk-commit` specifically
+      # for workflow_dispatch events and the concurrency group keys on it, so
+      # it has to be declared here for manual runs to be able to set it.
+      sdk-commit:
+        description: 'Software Agent SDK commit/ref to use (for parity with other build workflows; not consumed by ProgramBench)'
+        required: false
+        type: string
+        default: ''
+      n-limit:
+        description: 'Number of instances to verify reachability for (0 = all)'
+        required: false
+        type: string
+        default: '5'
+      instance-ids:
+        description: 'Comma-separated list of instance IDs to verify (overrides n-limit)'
+        required: false
+        type: string
+        default: ''
+
+
+concurrency:
+  group: build-programbench-${{ inputs.sdk-commit || github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  verify-cleanroom-images:
+    name: Verify ProgramBench cleanroom images on Docker Hub
+    runs-on: ubuntu-24.04
+    timeout-minutes: 30
+    permissions:
+      contents: read
+    steps:
+      - name: Determine checkout ref
+        id: ref
+        env:
+          BENCHMARKS_REF: ${{ inputs.benchmarks-ref }}
+        # Publish the requested benchmarks ref as the `ref` step output. An
+        # unset or empty input produces an empty `ref` value.
+        run: |
+          printf 'ref=%s\n' "${BENCHMARKS_REF:-}" >> "$GITHUB_OUTPUT"
+
+
+      # Check out OpenHands/benchmarks (with submodules) at the ref published
+      # by the "Determine checkout ref" step. That output may be empty, in
+      # which case actions/checkout chooses the ref itself — see the action's
+      # `ref` input documentation for the exact fallback.
+      - name: Checkout benchmarks repo
+        uses: actions/checkout@v6
+        with:
+          repository: OpenHands/benchmarks
+          ref: ${{ steps.ref.outputs.ref }}
+          submodules: recursive
+          token: ${{ github.token }}
+
+      # Install uv (cache enabled); it is used by the dependency sync below
+      # and by the `uv run python` call that resolves instance ids.
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      # Install the project environment so `benchmarks.programbench` can be
+      # imported later. `--frozen` presumably keeps uv from touching the
+      # lockfile — confirm against the uv documentation.
+      - name: Sync deps
+        run: uv sync --frozen
+
+
+      - name: Resolve instance ids
+        id: resolve
+        env:
+          INSTANCE_IDS: ${{ inputs.instance-ids }}
+          N_LIMIT: ${{ inputs.n-limit }}
+        # Writes one instance id per line to instance_ids.txt, which the
+        # verification step reads. An explicit `instance-ids` list wins;
+        # otherwise the first N_LIMIT upstream instances are used (0 = all).
+        run: |
+          set -eo pipefail
+          if [ -n "${INSTANCE_IDS:-}" ]; then
+            # Split on commas, trim whitespace around each id (so "a, b" does
+            # not yield the id " b"), and drop empty entries.
+            tr ',' '\n' <<< "${INSTANCE_IDS}" \
+              | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e '/^$/d' \
+              > instance_ids.txt
+          else
+            uv run python - <<'PY' > instance_ids.txt
+          import os
+          from benchmarks.programbench.run_infer import _load_upstream_instances
+
+          n = int(os.environ.get("N_LIMIT") or "0")
+          ins = _load_upstream_instances()
+          if n > 0:
+              ins = ins[:n]
+          for entry in ins:
+              print(entry["instance_id"])
+          PY
+          fi
+          # An empty list would make the verification step succeed without
+          # checking anything, so treat it as an error instead.
+          if [ ! -s instance_ids.txt ]; then
+            echo "::error::No instance ids resolved; nothing to verify."
+            exit 1
+          fi
+          echo "Will verify $(wc -l < instance_ids.txt) instance(s):"
+          cat instance_ids.txt
+
+
+      - name: Verify each cleanroom image is reachable
+        # We query the Docker Hub tag API instead of pulling — pulling the
+        # full image is multi-GiB per task and not needed at this stage.
+        # The downstream eval-job will pull on demand inside its DIND pod.
+        run: |
+          set -eo pipefail
+          fail=0
+          while IFS= read -r iid; do
+            [ -z "$iid" ] && continue
+            # Instance ids use `__` between owner and repo; the published
+            # image names use `_1776_` in that position.
+            tagged="${iid//__/_1776_}"
+            image="programbench/${tagged}"
+            url="https://hub.docker.com/v2/repositories/${image}/tags/task_cleanroom"
+            # --retry covers transient failures (timeouts, 429/5xx); a 404
+            # for a missing tag is not retried. The `|| status="000"` keeps a
+            # transport-level curl failure from tripping `set -e`, so the
+            # loop still reports every image before exiting non-zero.
+            status=$(
+              curl -sS --retry 3 --retry-delay 2 \
+                --connect-timeout 10 --max-time 30 \
+                -o /dev/null -w '%{http_code}' "$url"
+            ) || status="000"
+            if [ "$status" = "200" ]; then
+              echo "✅ ${image}:task_cleanroom"
+            else
+              echo "❌ ${image}:task_cleanroom (HTTP ${status})"
+              fail=1
+            fi
+          done < instance_ids.txt
+          exit "$fail"
diff --git a/AGENTS.md b/AGENTS.md
@@ -107,6 +107,65 @@ When converting between OpenHands format and benchmark-specific formats:
 - The Harbor dataset name used in CI is `terminal-bench@2.0`.
 - For CI smoke tests, pass `--n-limit <count>` to `terminalbench-infer` so Harbor only runs the requested subset.
 
+# ProgramBench Notes
+- Upstream package is `programbench` (PyPI). Constrained to `>=1.0,<2.0` in `pyproject.toml` (the dependency is skipped on macOS because upstream images are linux/amd64 only).
+- Task images live at `programbench/<owner>_1776_<repo>.<sha>:<tag>` on Docker Hub. The agent runs against `:task_cleanroom`; evaluation runs against `:task`.
+- The `__` separator in instance ids is replaced with `_1776_` for Docker tag compatibility (see `_instance_to_image`).
+- **Strict offline isolation is not yet enforced** (known limitation). `--network=none` breaks the SDK's HTTP control channel and `docker network create --internal` breaks `-p` port mapping; the proper fix is in-container egress filtering with `CAP_NET_ADMIN` + iptables in an init step. Until that lands, the agent container uses the default Docker bridge and we rely on the system prompt + cleanroom image to keep the agent honest. `--allow-network` is reserved so future strict-offline runs are distinguishable in metadata. Treat current results as engineering-grade, not leaderboard-faithful.
+- `programbench-infer` writes submission tarballs to `<eval_output_dir>/run/<instance_id>/submission.tar.gz`; this matches the layout the upstream `programbench eval` CLI consumes.
+- The 200-task base set is loaded via `programbench.utils.load_data.load_all_instances(include_tests=False)`. Use `include_tests=False` during inference because the tests blob is large and only needed by the eval harness.
+- CI smoke runs the first 5 instances (matches `benchmarks/programbench/instances.txt`).
+- **Cleanroom workspace layout** (verified by inspecting agent runtime in retry-21):
+    - `/workspace/.git/`, `/workspace/README.md`, etc. — cloned reference repo (sources only).
+    - `/workspace/executable` — **the reference binary**, mode `---x--x--x` (execute-only,
+      NOT readable). The `binary_path` rendered into `prompts/default.j2` (currently
+      `/workspace/<repo_name>`) is **wrong**; the agent always finds the real binary at
+      `/workspace/executable` via its own `ls`.
+    - `/workspace/project/` — initially empty placeholder (legacy / unused).
+    - The agent's working directory is `/workspace/`. `compile.sh` lives at
+      `/workspace/compile.sh` and produces `/workspace/executable` — i.e. the agent's
+      build literally **overwrites the reference binary** at `/workspace/executable`.
+      By the time any Stop hook fires (end of conversation), the reference is gone.
+- **Reference-diffs hook gotcha** (retry-21 lesson): a Stop hook that diffs
+  `$REF --help` against `./executable --help` cannot work if it tries to use
+  `/workspace/executable` as `$REF` — because the agent's compile.sh has replaced
+  it. Two paths forward:
+    1. Capture `executable --help` / `executable -h` into a hidden, read-only
+       location (e.g. `/opt/programbench-ref/`) **before the conversation starts**
+       (e.g. via a pre-conversation `WorkspaceClient.bash` call in `run_infer.py`),
+       then have the Stop hook diff against those captured outputs.
+    2. Tell the agent in the prompt to `mv /workspace/executable
+       /workspace/executable.ref` before building (some agents already do this
+       spontaneously; we observed it in zoxide retry-21).
+  Approach (1) is robust to agent behaviour; approach (2) keeps the hook simple
+  but depends on agent compliance. **Retry-22 shipped approach (2)** with a
+  Step-0 prominent block at the top of `prompts/default.j2`; Sonnet 4.5
+  complied 3-for-3 on the smoke set.
+
+- **Reference-diffs hook v2** (retry-22 -> retry-23): the v1 hook only diffed
+  top-level `--help` and `-h`. Bucketing R22's residual 352 failures showed
+  68% are reachable by expanding the probe set. v2 adds:
+    1. **Top-level invalid flag probe** (`<bin> --__bogus__`) — catches argv
+       parser leaks (agent silently accepts unknown flags rc=0 where ref rc=2).
+    2. **Subcommand discovery** via awk parsing of the reference's
+       `Commands:` / `Subcommands:` / `Available Commands:` /
+       `Available subcommands:` section. Capped at
+       `PB_REFERENCE_DIFFS_MAX_SUBCMDS` (default 8).
+    3. **Per-subcommand probes**: `<sub> --help` (drift detection),
+       `<sub> --__bogus__` (validation gap), `<sub> /<bogus-path>`
+       (validation gap). Compares both rc and stderr/stdout.
+    4. **argv[0] normalization** via `bash -c 'exec -a "$1" "${@:2}"' _
+       executable "$bin" "$@"`. Both ref and agent see argv[0]="executable",
+       so binaries that derive `Usage:` from argv[0] (clap default) don't
+       false-positive on basename drift. **Note:** this only works for ELF
+       binaries — shell scripts get $0 from the kernel exec path, not from
+       `exec -a`. ProgramBench reference binaries are always compiled, so
+       we're safe in production.
+  Hook timeout was bumped 120s -> 240s to fit the worst-case probe count
+  (3 top-level @ 30s + 8 subs * 3 probes @ 5s = 210s).
+  Smoke-tested with synthetic gcc-built C binaries; see
+  `tests/test_programbench.py::TestReferenceDiffsHookV2`.
+
 # SWE-Bench Multimodal Notes
 - The default `swebenchmultimodal-infer` selection now comes from `benchmarks/swebenchmultimodal/resolved_instances.txt`.
 - `resolved_instances.txt` is generated from `ambiguity_annotations.json` and contains all instances annotated with the `SOLVEABLE` keyword.

diff --git a/README.md b/README.md
@@ -12,6 +12,7 @@ This repository contains benchmark evaluation infrastructure for [OpenHands](htt
 | [GAIA](benchmarks/gaia/) | General AI assistant tasks requiring multi-step reasoning | ✅ Active |
 | [Commit0](benchmarks/commit0/) | Python function implementation tasks with unit tests | ✅ Active |
 | [OpenAgentSafety](benchmarks/openagentsafety/) | AI agent safety evaluation in workplace scenarios with NPC interactions | ✅ Active |
+| [ProgramBench](benchmarks/programbench/) | Rebuild a program from scratch given only its compiled binary and docs | ✅ Active |
 
 See the individual benchmark directories for detailed usage instructions.
 

diff --git a/benchmarks/programbench/README.md b/benchmarks/programbench/README.md
@@ -0,0 +1,102 @@
+# ProgramBench
+
+[ProgramBench](https://programbench.com) (Yang et al., 2026) asks: *can a
+language-model agent rebuild a program from scratch given only the compiled
+binary and its public documentation?* The benchmark ships 200 cleanroom
+tasks (and an extended set), each as a Docker image containing the binary
+plus its docs.
+
+This module wraps the upstream
+[facebookresearch/ProgramBench](https://github.com/facebookresearch/ProgramBench)
+harness so the OpenHands [Software Agent SDK](https://github.com/OpenHands/software-agent-sdk)
+can be used as the inference agent.
+
+## How it works
+
+1. **Inference (`programbench-infer`)** loads the upstream task list, layers
+   `openhands-agent-server` on top of each `programbench/<id>:task_cleanroom`
+   image, and runs the SDK agent inside it. The agent is expected to work
+   offline, but network isolation is not yet enforced (see Caveats). After
+   the agent finishes, `/workspace` is tarred up into
+   `<eval_output_dir>/run/<instance_id>/submission.tar.gz` — exactly the
+   layout `programbench eval` expects.
+2. **Evaluation (`programbench-eval`)** shells out to the upstream
+   `programbench eval <run_dir>` CLI, then aggregates the per-instance
+   `<id>/<id>.eval.json` files into our standard report format (`resolved`,
+   `almost_resolved`, `error`, …).
+
+## Prerequisites
+
+- Linux x86_64 host. The upstream task images are built for `linux/amd64`
+  only and emulating them via QEMU is impractically slow.
+- Docker daemon running and accessible to the user invoking the script.
+- The `programbench` Python package (added as a dependency in this repo's
+  `pyproject.toml`).
+- An LLM config under `.llm_config/`.
+
+## Usage
+
+### Inference
+
+```bash
+# Smoke test — first 5 tasks
+uv run programbench-infer .llm_config/claude.json --n-limit 5
+
+# Selected subset of tasks: pass a newline-separated instance-id file
+uv run programbench-infer .llm_config/claude.json \
+    --select my_instances.txt
+
+# Higher concurrency, more iterations
+uv run programbench-infer .llm_config/claude.json \
+    --n-limit 20 --num-workers 4 --max-iterations 300
+```
+
+### Evaluation
+
+```bash
+uv run programbench-eval ./eval_outputs/.../output.jsonl
+```
+
+Pass `--skip-eval` to re-aggregate an already-graded run without rerunning
+the upstream harness, and `--force` to regrade everything.
+
+## Output layout
+
+```
+eval_outputs/
+└── programbench__ProgramBench-test/
+    └── <model>_sdk_<sha>_maxiter_1000/
+        ├── metadata.json
+        ├── output.jsonl
+        ├── output.report.json
+        └── run/
+            ├── abishekvashok__cmatrix.5c082c6/
+            │   ├── submission.tar.gz
+            │   └── abishekvashok__cmatrix.5c082c6.eval.json
+            └── …
+```
+
+## Caveats
+
+- **Offline inference (known limitation).** ProgramBench's leaderboard
+  rules require the agent to have no internet access during inference.
+  Enforcing that via Docker is harder than it sounds: `--network none`
+  breaks the SDK's HTTP control channel (Docker port mapping needs a
+  network interface), and `docker network create --internal` blocks
+  the `-p` mapping too. We currently rely on the system prompt + the
+  cleanroom image (which ships everything the task needs) and leave
+  the container on the default Docker bridge. Strict in-container
+  egress filtering (iptables in an init step with `CAP_NET_ADMIN`) is
+  tracked as follow-up work in `AGENTS.md`. The `--allow-network` flag
+  is reserved so that future strict-offline runs are distinguishable in
+  metadata. **Until that lands, treat results as engineering-grade, not
+  leaderboard-faithful.**
+- **Image pulls are large.** Each task image is multiple GiB. Plan disk
+  budget accordingly.
+- **Remote workspace** is not yet wired up for ProgramBench because we
+  have no reliable network-isolation hook for the runtime API. PRs welcome.
+
+## References
+
+- ProgramBench paper & leaderboard: <https://programbench.com>
+- Upstream harness: <https://github.com/facebookresearch/ProgramBench>
+- Upstream usage guide: <https://github.com/facebookresearch/ProgramBench/blob/main/docs/README.md>
diff --git a/benchmarks/programbench/__init__.py b/benchmarks/programbench/__init__.py
diff --git a/benchmarks/programbench/config.py b/benchmarks/programbench/config.py
@@ -0,0 +1,76 @@
+"""ProgramBench configuration defaults.
+
+ProgramBench (https://programbench.com / https://github.com/facebookresearch/ProgramBench)
+ships its task metadata inside the upstream ``programbench`` PyPI package and the
+per-task Docker images under the ``programbench`` Docker Hub org with the
+``task_cleanroom`` tag (e.g. ``programbench/abishekvashok_1776_cmatrix.5c082c6:task_cleanroom``).
+"""
+
+from typing import TypedDict
+
+
+class _InferDefaults(TypedDict):
+    """Typed shape of ``INFER_DEFAULTS`` (default inference settings)."""
+
+    dataset: str  # label that ends up in the structured output dir name
+    split: str
+    output_dir: str
+    num_workers: int
+    workspace_dir: str  # directory whose contents become the submission tarball
+    task_image_tag: str  # Docker tag of the task image used for inference
+    build_target: str  # build target for layering openhands-agent-server
+    max_iterations: int  # conversation iteration budget
+
+
+class _EvalDefaults(TypedDict):
+    """Typed shape of ``EVAL_DEFAULTS`` (default evaluation settings)."""
+
+    image_tag: str  # image tag ``programbench eval`` uses for its containers
+    workers: int  # parallelism for ``programbench eval``
+    branch_workers: int  # parallelism for ``programbench eval``
+    docker_cpus: int  # CPU cores allotted per docker container during eval
+
+
+# Default inference settings (only include values actually used by argparse).
+INFER_DEFAULTS: _InferDefaults = {
+    # ProgramBench has a single canonical 200-task split shipped with the
+    # ``programbench`` package. We expose this purely as a label that ends up
+    # in the structured output dir name.
+    "dataset": "programbench/ProgramBench",
+    "split": "test",
+    "output_dir": "./eval_outputs",
+    "num_workers": 1,
+    # Submission tarballs default to using the agent's /workspace contents.
+    "workspace_dir": "/workspace",
+    # The cleanroom task image tag used for inference. ProgramBench tags
+    # cleanroom variants with ``task_cleanroom`` (binary + docs only, no
+    # internet). Other tags exist on Docker Hub but should not be used for
+    # inference.
+    "task_image_tag": "task_cleanroom",
+    # Build target for layering openhands-agent-server on top of the
+    # cleanroom image. ``source-minimal`` keeps the image small.
+    "build_target": "source-minimal",
+    # Conversation iteration budget. ProgramBench tasks tend to be larger
+    # than typical SWE-Bench instances since the agent rebuilds an entire
+    # codebase from scratch. We allow up to 1000 because:
+    #   1. The full rebuild loop (read docs → infer interface → implement
+    #      → run probes against the reference binary → diagnose failures
+    #      → patch) can chain many bash + edit + test iterations on
+    #      non-trivial CLIs.
+    #   2. Stop hooks (compile-contract + reference-diffs) may reject
+    #      the agent's first attempt to finish and demand more work; we
+    #      want a generous budget for those retries.
+    # Lowering this for cost only makes sense for quick smoke runs (pass
+    # ``--max-iterations`` explicitly).
+    "max_iterations": 1000,
+}
+
+# Default evaluation settings.
+EVAL_DEFAULTS: _EvalDefaults = {
+    # Image tag used by ``programbench eval`` to spin up evaluation
+    # containers. ``task`` is the upstream default and corresponds to the
+    # full task image (binary + tests scaffolding).
+    "image_tag": "task",
+    # Parallelism for ``programbench eval``.
+    "workers": 1,
+    "branch_workers": 1,
+    # CPU cores allotted per docker container during evaluation. Mirrors
+    # programbench.constants.DOCKER_CPUS default (10).
+    "docker_cpus": 10,
+}