OpenHands · neubig · May 29, 2026 · May 7, 2026 · May 7, 2026 · May 16, 2026
diff --git a/.agents/skills/manage-evals/SKILL.md b/.agents/skills/manage-evals/SKILL.md
@@ -70,6 +70,7 @@ done
 | `commit0` | Commit0 — commit generation tasks |
 | `swebenchmultimodal` | SWE-bench Multimodal — tasks with images |
 | `terminalbench` | TerminalBench — terminal interaction tasks |
+| `programbench` | ProgramBench — program-repair tasks against gold-standard test binaries |
 
 ### Trigger Options
 
@@ -130,7 +131,7 @@ Each line is a run path. Match by benchmark and model to find the run.
 ### Step 2: Identify the Run Path Components
 
 A run path has three components:
-- **benchmark**: `swebench`, `swebenchpro`, `gaia`, `swtbench`, `commit0`, `swebenchmultimodal`, `terminalbench`
+- **benchmark**: `swebench`, `swebenchpro`, `gaia`, `swtbench`, `commit0`, `swebenchmultimodal`, `terminalbench`, `programbench`
 - **model_slug**: Derived from model name with `/:@.` replaced by `-` (e.g., `litellm_proxy-claude-sonnet-4-5-20250929`)
 - **run_id**: The GitHub Actions workflow run ID from the `OpenHands/evaluation` repo
 

diff --git a/.agents/skills/manage-evals/references/eval-infrastructure.md b/.agents/skills/manage-evals/references/eval-infrastructure.md
@@ -42,7 +42,7 @@ and served via CDN at `https://results.eval.all-hands.dev/`.
 {benchmark}/{model_slug}/{github_run_id}/
 ```
 
-- **benchmark**: `swebench`, `swebenchpro`, `gaia`, `swtbench`, `commit0`, `swebenchmultimodal`, `terminalbench`
+- **benchmark**: `swebench`, `swebenchpro`, `gaia`, `swtbench`, `commit0`, `swebenchmultimodal`, `terminalbench`, `programbench`
 - **model_slug**: Model name with `/:@.` replaced by `-`
   - Example: `litellm_proxy/claude-sonnet-4-5-20250929` → `litellm_proxy-claude-sonnet-4-5-20250929`
 - **github_run_id**: The GitHub Actions run ID from the `OpenHands/evaluation` repo

diff --git a/.agents/skills/manage-evals/scripts/manage_evals.py b/.agents/skills/manage-evals/scripts/manage_evals.py
@@ -43,6 +43,7 @@
     "commit0",
     "swebenchmultimodal",
     "terminalbench",
+    "programbench",
 ]
 TOOL_PRESETS = ["default", "gemini", "gpt5", "planning"]
 AGENT_TYPES = ["default", "acp-claude", "acp-codex"]

diff --git a/.agents/skills/run-eval.md b/.agents/skills/run-eval.md
@@ -31,7 +31,7 @@ curl -X POST \
 ```
 
 **Key parameters:**
-- `benchmark`: `swebench`, `swebenchpro`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`, `terminalbench`
+- `benchmark`: `swebench`, `swebenchpro`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`, `terminalbench`, `programbench`
 - `eval_limit`: Any positive integer (e.g., `1`, `10`, `50`, `200`)
 - `model_ids`: See `.github/run-eval/resolve_model_config.py` for available models
 - `benchmarks_branch`: Use feature branch from the benchmarks repo to test benchmark changes before merging

diff --git a/.github/workflows/run-eval.yml b/.github/workflows/run-eval.yml
@@ -22,6 +22,7 @@ on:
                     - commit0
                     - swebenchmultimodal
                     - terminalbench
+                    - programbench
             sdk_ref:
                 description: SDK commit/ref to evaluate (must be a semantic version like v1.0.0 unless 'Allow unreleased branches' is checked)
                 required: true

diff --git a/AGENTS.md b/AGENTS.md
@@ -383,4 +383,9 @@ Note: This is separate from `persistence_dir` which is used for conversation sta
 - Repository guidance lives in the project root AGENTS.md (loaded as a third-party skill file).
 </REPO_CONFIG_NOTES>
 
+<KNOWN_RACES_AND_GOTCHAS>
+- **`RemoteConversation._wait_for_run_completion` and stop hooks**: Per-field WebSocket `FINISHED` status events are *hints*, not authoritative termination. The server-side `LocalConversation.run` loop releases its state lock at the end of each iteration, so a `FINISHED` status set by `agent.step()` is visible to clients before the *next* loop iteration runs stop hooks (`hook_processor.run_stop`). If a stop hook returns rc=2 (denying the stop), status flips back to RUNNING and the agent gets another iteration. The client's `_wait_for_run_completion` therefore must **not** return on the first WS-delivered FINISHED. Instead, post-run full-state WebSocket snapshots are authoritative; if that snapshot is missing, the time-based hard-fallback path (`TERMINAL_HARD_FALLBACK_SECS = 30.0`) accepts REST-confirmed terminal status after 30 continuous seconds. ERROR/STUCK still raise immediately through `_handle_conversation_status`. Empirically this caused agents to consume just 0–1 iterations after a hook block on programbench retry-16; fix shipped in `feat/programbench`.
+- **Hook events vs `state.events`**: `HookExecutionEvent` is emitted via `hook_processor.original_callback` (the chained `_on_event`), so it *should* land in `state.events` when the run is allowed to complete. But because the WS-FINISHED race above used to make the client snapshot `list(conversation.state.events)` *before* the server-side hook eval ran, `output.jsonl` history could miss hook events while on-disk persisted events under `/workspace/conversations/.../events/` had them — useful as a forensic signal that the race fired.
+</KNOWN_RACES_AND_GOTCHAS>
+
 </REPO>
diff --git a/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py
@@ -641,7 +641,8 @@ class RemoteConversation(BaseConversation):
     workspace: RemoteWorkspace
     _client: httpx.Client
     _cleanup_initiated: bool
-    _terminal_status_queue: Queue[str]  # Thread-safe queue for terminal status from WS
+    _terminal_status_queue: Queue[str]
+    _run_armed: threading.Event
     _conversation_info_base_path: str
     _conversation_action_base_path: str
     delete_on_close: bool = False
@@ -706,6 +707,7 @@ def __init__(
         self._conversation_action_base_path = LEGACY_CONVERSATIONS_PATH
         self._cleanup_initiated = False
         self._terminal_status_queue: Queue[str] = Queue()
+        self._run_armed = threading.Event()
 
         should_create = conversation_id is None
         if conversation_id is not None:
@@ -832,17 +834,47 @@ def __init__(
             # No visualization (visualizer is None)
             self._visualizer = None
 
-        # Add a callback that signals when run completes via WebSocket
-        # This ensures we wait for all events to be delivered before run() returns
+        # Add a callback that signals when run completes via WebSocket.
+        # The server's post-run full-state snapshot is the only authoritative
+        # WebSocket success signal. Per-field FINISHED is a hint because stop
+        # hooks can still revert it; per-field ERROR/STUCK remain immediate.
         def run_complete_callback(event: Event) -> None:
-            if isinstance(event, ConversationStateUpdateEvent):
-                if event.key == "execution_status":
-                    try:
-                        status = ConversationExecutionStatus(event.value)
-                        if status.is_terminal():
-                            self._terminal_status_queue.put(event.value)
-                    except ValueError:
-                        pass  # Unknown status value, ignore
+            if not isinstance(event, ConversationStateUpdateEvent):
+                return
+
+            if event.key == "execution_status":
+                try:
+                    status = ConversationExecutionStatus(event.value)
+                except ValueError:
+                    return
+                if status in (
+                    ConversationExecutionStatus.ERROR,
+                    ConversationExecutionStatus.STUCK,
+                ):
+                    self._terminal_status_queue.put(status.value)
+                return
+
+            if event.key != FULL_STATE_KEY:
+                return
+
+            # Only accept full-state snapshots as run-completion signals when a
+            # run is actually in progress. The WS subscription delivers an
+            # initial full-state snapshot during connect(); if that snapshot
+            # carries a non-RUNNING status (e.g. "idle"), it could be picked up
+            # by _wait_for_run_completion() as the completion signal for the
+            # *next* run() invocation, causing blocking=True to return before the
+            # server has actually finished.
+            if not self._run_armed.is_set():
+                return
+
+            raw_status = event.value.get("execution_status")
+            try:
+                status = ConversationExecutionStatus(raw_status)
+            except ValueError:
+                return
+
+            if status != ConversationExecutionStatus.RUNNING:
+                self._terminal_status_queue.put(status.value)
 
         # Compose all callbacks into a single callback
         all_callbacks = self._callbacks + [run_complete_callback]
@@ -985,13 +1017,12 @@ def run(
         Raises:
             ConversationRunError: If the run fails or times out.
         """
-        # Drain any stale terminal status events from previous runs.
-        # This prevents stale events from causing early returns.
-        while True:
-            try:
-                self._terminal_status_queue.get_nowait()
-            except Empty:
-                break
+        # Disarm and drain any stale terminal status events from previous runs
+        # before arming for the new one. _run_armed gates full-state WS snapshots
+        # in run_complete_callback; clearing it here prevents the initial WS
+        # subscription snapshot from being mistaken for the post-run snapshot.
+        self._run_armed.clear()
+        self._drain_terminal_status_queue()
 
         # Trigger a run on the server using the dedicated run endpoint.
         # Let the server tell us if it's already running (409), avoiding an extra GET.
@@ -1013,7 +1044,13 @@ def run(
             logger.info(f"run() triggered successfully: {resp}")
 
         if blocking:
-            self._wait_for_run_completion(poll_interval, timeout)
+            # Arm after POST so that only WS full-state snapshots arriving
+            # after the run was triggered are treated as run-completion signals.
+            self._run_armed.set()
+            try:
+                self._wait_for_run_completion(poll_interval, timeout)
+            finally:
+                self._run_armed.clear()
 
     def _wait_for_run_completion(
         self,
@@ -1028,9 +1065,9 @@ def _wait_for_run_completion(
         status before WebSocket delivers the final events.
 
         As a fallback, it also polls the server periodically. If the WebSocket
-        is delayed or disconnected, we return after multiple consecutive polls
-        show a terminal status, and reconcile events to catch any that were
-        missed via WebSocket.
+        is delayed or disconnected, polling still surfaces ERROR/STUCK promptly.
+        A REST-only FINISHED status is not authoritative because stop hooks can
+        still revert it to RUNNING before the server-side run task exits.
 
         Args:
             poll_interval: Time in seconds between status polls (fallback).
@@ -1042,14 +1079,19 @@ def _wait_for_run_completion(
                 responses are retried until timeout.
         """
         start_time = time.monotonic()
-        consecutive_terminal_polls = 0
-        # Return after this many consecutive terminal polls (fallback for WS issues).
-        # We use 3 polls to balance latency vs reliability:
-        # - 1 poll could be a transient state during shutdown
-        # - 2 polls might still catch a race condition
-        # - 3 polls (with default 1s interval = 3s total) provides high confidence
-        #   that the run is truly complete while keeping fallback latency reasonable
-        TERMINAL_POLL_THRESHOLD = 3
+        terminal_poll_count = 0
+        # Log a warning after this many consecutive REST terminal polls without a
+        # post-run WS snapshot. This is a health signal, not a return path —
+        # returning immediately on REST FINISHED would reintroduce the stop-hook race.
+        TERMINAL_POLL_WARNING_THRESHOLD = 3
+        # Time-based hard fallback: accept REST-confirmed terminal status after
+        # the server has been reporting terminal for at least this many seconds
+        # without a post-run WS snapshot. Stop hooks are fast (seconds); 30 s
+        # is a safe bound regardless of poll_interval. This prevents an
+        # indefinite hang when the WS snapshot is never delivered (e.g., socket
+        # dropped after the run finishes on the server).
+        TERMINAL_HARD_FALLBACK_SECS = 30.0
+        terminal_first_seen_at: float | None = None
 
         while True:
             elapsed = time.monotonic() - start_time
@@ -1063,20 +1105,23 @@ def _wait_for_run_completion(
                 )
 
             # Wait for either:
-            # 1. WebSocket delivers terminal status event (preferred)
-            # 2. Poll interval expires (fallback - check status via REST)
+            # 1. WebSocket delivers a run-completion signal
+            # 2. Poll interval expires (fall through to REST poll)
             try:
                 ws_status = self._terminal_status_queue.get(timeout=poll_interval)
-                # Handle ERROR/STUCK states - raises ConversationRunError
+                # Raises ConversationRunError on ERROR/STUCK; no-op otherwise.
                 self._handle_conversation_status(ws_status)
-
                 logger.info(
-                    "Run completed via WebSocket notification "
+                    "Run completed via post-run WebSocket state update "
                     "(status: %s, elapsed: %.1fs)",
                     ws_status,
                     elapsed,
                 )
-                self._state.refresh_from_server()
+                # The server publishes the full ConversationStateUpdateEvent after
+                # conversation.run()/arun() exits and pending events are flushed,
+                # so non-running statuses from that snapshot are authoritative
+                # run-complete signals.
+                self._state.events.reconcile()
                 return
             except Empty:
                 pass  # Queue.get() timed out, fall through to REST polling
@@ -1088,36 +1133,65 @@ def _wait_for_run_completion(
                 status = self._poll_status_once()
             except Exception as exc:
                 self._handle_poll_exception(exc)
-                consecutive_terminal_polls = 0  # Reset on error
+                # Reset on error: we cannot confirm the server is still in a
+                # terminal state after a failed poll, so conservatively restart
+                # the hard-fallback timer. In the degenerate case where polls
+                # alternate between terminal and exception, the 30 s threshold
+                # slides; this is intentional — we prefer a false-negative wait
+                # over a false-positive early return.
+                terminal_poll_count = 0
+                terminal_first_seen_at = None
             else:
                 # Raises ConversationRunError for ERROR/STUCK states
                 self._handle_conversation_status(status)
 
-                # Track consecutive terminal polls as a fallback for WS issues.
-                # If WebSocket is delayed/disconnected, we return after multiple
-                # consecutive polls confirm the terminal status.
                 if status and ConversationExecutionStatus(status).is_terminal():
-                    consecutive_terminal_polls += 1
-                    if consecutive_terminal_polls >= TERMINAL_POLL_THRESHOLD:
-                        logger.info(
-                            "Run completed via REST fallback after %d consecutive "
-                            "terminal polls (status: %s, elapsed: %.1fs). "
-                            "Refreshing final state and reconciling events...",
-                            consecutive_terminal_polls,
+                    # ERROR/STUCK have already been handled above. FINISHED from
+                    # REST is advisory because stop hooks can still veto it;
+                    # prefer waiting for the server's post-run WebSocket state update.
+                    terminal_poll_count += 1
+                    now = time.monotonic()
+                    if terminal_first_seen_at is None:
+                        terminal_first_seen_at = now
+                    if terminal_poll_count == TERMINAL_POLL_WARNING_THRESHOLD:
+                        logger.warning(
+                            "REST has reported terminal status %s for %d polls "
+                            "without a post-run WebSocket snapshot; continuing "
+                            "to wait for the authoritative snapshot "
+                            "(elapsed: %.1fs)",
                             status,
+                            terminal_poll_count,
                             elapsed,
                         )
-                        final_info = self._state.refresh_from_server()
-                        self._handle_conversation_status(
-                            final_info.get("execution_status")
+                    terminal_secs = now - terminal_first_seen_at
+                    if terminal_secs >= TERMINAL_HARD_FALLBACK_SECS:
+                        logger.warning(
+                            "REST has reported terminal status %s for %.0fs "
+                            "with no post-run WebSocket snapshot; accepting as "
+                            "final to avoid an indefinite hang (elapsed: %.1fs). "
+                            "This may indicate a WebSocket delivery issue.",
+                            status,
+                            terminal_secs,
+                            elapsed,
                         )
-                        # Reconcile events to catch any that were missed via WS.
-                        # This is only called in the fallback path, so it doesn't
-                        # add overhead in the common case where WS works.
+                        self._state.refresh_from_server()
                         self._state.events.reconcile()
                         return
                 else:
-                    consecutive_terminal_polls = 0
+                    terminal_poll_count = 0
+                    terminal_first_seen_at = None
+
+    def _drain_terminal_status_queue(self) -> None:
+        """Empty the WS terminal-status hint queue.
+
+        Called at the start of run() (before arming) to discard any stale
+        signals left over from a previous run invocation.
+        """
+        while True:
+            try:
+                self._terminal_status_queue.get_nowait()
+            except Empty:
+                break
 
     def _poll_status_once(self) -> str | None:
         """Fetch the current execution status from the remote conversation."""