Skip to content

Commit bcf80c6

Browse files
fix: add robust process exit detection for child processes
Port robust process exit detection from PR anomalyco#15757 to fix zombie/stuck child processes in containers where Bun fails to deliver exit events.

- Add polling watchdog to bash tool and Process.spawn that detects process exit via kill(pid, 0) when event-loop events are missed
- Add process registry (active map) with stale/reap exports for server-level watchdog to detect and clean up stuck bash processes
- Improve Shell.killTree with alive() helper and proper SIGKILL escalation after SIGTERM timeout
- Add session-level watchdog interval in prompt loop to periodically reap stale bash processes

Based on the work in anomalyco#15757.

Co-Authored-By: Nacho F. Lizaur <NachoFLizaur@users.noreply.github.com>
1 parent 5124d2a commit bcf80c6

File tree

4 files changed

+166
-19
lines changed

4 files changed

+166
-19
lines changed

packages/opencode/src/session/prompt.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ import { LLM } from "./llm"
4747
import { iife } from "@/util/iife"
4848
import { Shell } from "@/shell/shell"
4949
import { Truncate } from "@/tool/truncation"
50+
import { stale, reap } from "@/tool/bash"
5051

5152
// @ts-ignore
5253
globalThis.AI_SDK_LOG_WARNINGS = false
@@ -291,6 +292,13 @@ export namespace SessionPrompt {
291292

292293
using _ = defer(() => cancel(sessionID))
293294

295+
const watchdog = setInterval(() => {
296+
for (const id of stale()) {
297+
reap(id)
298+
}
299+
}, 5000)
300+
using _watchdog = defer(() => clearInterval(watchdog))
301+
294302
// Structured output state
295303
// Note: On session resumption, state is reset but outputFormat is preserved
296304
// on the user message and will be retrieved from lastUser below

packages/opencode/src/shell/shell.ts

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ import { setTimeout as sleep } from "node:timers/promises"
99
const SIGKILL_TIMEOUT_MS = 200
1010

1111
export namespace Shell {
12+
function alive(pid: number): boolean {
13+
try {
14+
process.kill(pid, 0)
15+
return true
16+
} catch {
17+
return false
18+
}
19+
}
20+
1221
export async function killTree(proc: ChildProcess, opts?: { exited?: () => boolean }): Promise<void> {
1322
const pid = proc.pid
1423
if (!pid || opts?.exited?.()) return
@@ -27,17 +36,24 @@ export namespace Shell {
2736

2837
try {
2938
process.kill(-pid, "SIGTERM")
30-
await sleep(SIGKILL_TIMEOUT_MS)
31-
if (!opts?.exited?.()) {
32-
process.kill(-pid, "SIGKILL")
33-
}
34-
} catch (_e) {
35-
proc.kill("SIGTERM")
36-
await sleep(SIGKILL_TIMEOUT_MS)
37-
if (!opts?.exited?.()) {
39+
} catch {
40+
try {
41+
proc.kill("SIGTERM")
42+
} catch {}
43+
}
44+
45+
await sleep(SIGKILL_TIMEOUT_MS)
46+
47+
if (opts?.exited?.() || !alive(pid)) return
48+
try {
49+
process.kill(-pid, "SIGKILL")
50+
} catch {
51+
try {
3852
proc.kill("SIGKILL")
39-
}
53+
} catch {}
4054
}
55+
56+
await sleep(SIGKILL_TIMEOUT_MS)
4157
}
4258
const BLACKLIST = new Set(["fish", "nu"])
4359

packages/opencode/src/tool/bash.ts

Lines changed: 93 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,40 @@ const DEFAULT_TIMEOUT = Flag.OPENCODE_EXPERIMENTAL_BASH_DEFAULT_TIMEOUT_MS || 2
2323

2424
export const log = Log.create({ service: "bash-tool" })
2525

26+
// Registry for active bash processes — enables server-level watchdog
27+
const active = new Map<
28+
string,
29+
{
30+
pid: number
31+
timeout: number
32+
started: number
33+
kill: () => void
34+
done: () => void
35+
}
36+
>()
37+
38+
export function stale() {
39+
const result: string[] = []
40+
const now = Date.now()
41+
for (const [id, entry] of active) {
42+
if (now - entry.started > entry.timeout + 5000) result.push(id)
43+
}
44+
return result
45+
}
46+
47+
export function reap(id: string) {
48+
const entry = active.get(id)
49+
if (!entry) return
50+
log.info("reaping stuck process", {
51+
callID: id,
52+
pid: entry.pid,
53+
age: Date.now() - entry.started,
54+
})
55+
entry.kill()
56+
entry.done()
57+
active.delete(id)
58+
}
59+
2660
const resolveWasm = (asset: string) => {
2761
if (asset.startsWith("file://")) return fileURLToPath(asset)
2862
if (asset.startsWith("/") || /^[a-z]:/i.test(asset)) return asset
@@ -176,6 +210,14 @@ export const BashTool = Tool.define("bash", async () => {
176210
windowsHide: process.platform === "win32",
177211
})
178212

213+
if (!proc.pid) {
214+
if (proc.exitCode !== null) {
215+
log.info("process exited before pid could be read", { exitCode: proc.exitCode })
216+
} else {
217+
throw new Error(`Failed to spawn process: pid is undefined for command "${params.command}"`)
218+
}
219+
}
220+
179221
const MAX_OUTPUT_BYTES = 10 * 1024 * 1024 // 10 MB cap
180222
const outputChunks: Buffer[] = []
181223
let outputLen = 0
@@ -232,25 +274,72 @@ export const BashTool = Tool.define("bash", async () => {
232274
void kill()
233275
}, timeout + 100)
234276

277+
const callID = ctx.callID
278+
if (callID) {
279+
active.set(callID, {
280+
pid: proc.pid!,
281+
timeout,
282+
started: Date.now(),
283+
kill: () => Shell.killTree(proc, { exited: () => exited }),
284+
done: () => {},
285+
})
286+
}
287+
235288
await new Promise<void>((resolve, reject) => {
289+
let resolved = false
290+
236291
const cleanup = () => {
292+
if (resolved) return
293+
resolved = true
237294
clearTimeout(timeoutTimer)
295+
clearInterval(poll)
238296
ctx.abort.removeEventListener("abort", abortHandler)
239297
}
240298

241-
proc.once("exit", () => {
299+
const done = () => {
300+
if (resolved) return
242301
exited = true
243302
cleanup()
244303
resolve()
245-
})
304+
}
246305

247-
proc.once("error", (error) => {
306+
// Update the active entry with the real done callback
307+
if (callID) {
308+
const entry = active.get(callID)
309+
if (entry) entry.done = done
310+
}
311+
312+
const fail = (error: Error) => {
313+
if (resolved) return
248314
exited = true
249315
cleanup()
250316
reject(error)
251-
})
317+
}
318+
319+
proc.once("exit", done)
320+
proc.once("close", done)
321+
proc.once("error", fail)
322+
323+
// Polling watchdog: detect process exit when Bun's event loop
324+
// fails to deliver the "exit" event (confirmed Bun bug in containers)
325+
const poll = setInterval(() => {
326+
if (proc.exitCode !== null || proc.signalCode !== null) {
327+
done()
328+
return
329+
}
330+
if (proc.pid && process.platform !== "win32") {
331+
try {
332+
process.kill(proc.pid, 0)
333+
} catch {
334+
done()
335+
return
336+
}
337+
}
338+
}, 1000)
252339
})
253340

341+
if (callID) active.delete(callID)
342+
254343
let output = Buffer.concat(outputChunks).toString()
255344
// Free the chunks array
256345
outputChunks.length = 0

packages/opencode/src/util/process.ts

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,20 +79,54 @@ export namespace Process {
7979
}
8080

8181
const exited = new Promise<number>((resolve, reject) => {
82-
const done = () => {
82+
let resolved = false
83+
84+
const cleanup = () => {
85+
if (resolved) return
86+
resolved = true
8387
opts.abort?.removeEventListener("abort", abort)
8488
if (timer) clearTimeout(timer)
89+
clearInterval(poll)
90+
}
91+
92+
const finish = (code: number) => {
93+
if (resolved) return
94+
cleanup()
95+
resolve(code)
96+
}
97+
98+
const fail = (error: Error) => {
99+
if (resolved) return
100+
cleanup()
101+
reject(error)
85102
}
86103

87104
proc.once("exit", (code, signal) => {
88-
done()
89-
resolve(code ?? (signal ? 1 : 0))
105+
finish(code ?? (signal ? 1 : 0))
90106
})
91107

92-
proc.once("error", (error) => {
93-
done()
94-
reject(error)
108+
proc.once("close", (code, signal) => {
109+
finish(code ?? (signal ? 1 : 0))
95110
})
111+
112+
proc.once("error", fail)
113+
114+
// Polling watchdog: detect process exit when Bun's event loop
115+
// fails to deliver the "exit" event (confirmed Bun bug in containers)
116+
const poll = setInterval(() => {
117+
if (proc.exitCode !== null || proc.signalCode !== null) {
118+
finish(proc.exitCode ?? (proc.signalCode ? 1 : 0))
119+
return
120+
}
121+
if (proc.pid && process.platform !== "win32") {
122+
try {
123+
process.kill(proc.pid, 0)
124+
} catch {
125+
finish(proc.exitCode ?? 1)
126+
return
127+
}
128+
}
129+
}, 1000)
96130
})
97131

98132
if (opts.abort) {

0 commit comments

Comments
 (0)