From 56efc0bad6f4461613852e088d88909a5ba4fedf Mon Sep 17 00:00:00 2001 From: Lukas May Date: Fri, 6 Mar 2026 21:23:19 +0100 Subject: [PATCH] fix: detect hung agent processes via defensive signal.json polling Claude CLI occasionally writes signal.json and then hangs without ever exiting. Add an optional signal check to pollForCompletion: after a 60s grace period, check signal.json every 30s. If a valid completion signal is found while the process is still alive, send it SIGTERM and proceed to normal completion handling. --- apps/server/agent/manager.ts | 20 ++++++++++++ apps/server/agent/output-handler.ts | 2 +- apps/server/agent/process-manager.ts | 48 ++++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/apps/server/agent/manager.ts b/apps/server/agent/manager.ts index 5c4fc11..3e022bc 100644 --- a/apps/server/agent/manager.ts +++ b/apps/server/agent/manager.ts @@ -369,6 +369,7 @@ export class MultiProviderAgentManager implements AgentManager { agentId, pid, () => this.handleDetachedAgentCompletion(agentId), () => this.activeAgents.get(agentId)?.tailer, + this.createEarlyCompletionChecker(agentId), ); activeEntry.cancelPoll = cancel; @@ -406,6 +407,20 @@ export class MultiProviderAgentManager implements AgentManager { return this.toAgentInfo(agent); } + /** + * Create a callback that checks if an agent has a valid signal.json, + * used by pollForCompletion to detect hung processes. + */ + private createEarlyCompletionChecker(agentId: string): () => Promise { + return async () => { + const agent = await this.repository.findById(agentId); + if (!agent?.worktreeId) return false; + const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId); + const signal = await this.outputHandler.readSignalCompletion(agentWorkdir); + return signal !== null; + }; + } + /** * Handle completion of a detached agent. 
*/ @@ -525,6 +540,7 @@ export class MultiProviderAgentManager implements AgentManager { agentId, pid, () => this.handleDetachedAgentCompletion(agentId), () => this.activeAgents.get(agentId)?.tailer, + this.createEarlyCompletionChecker(agentId), ); commitActiveEntry.cancelPoll = commitCancel; @@ -633,6 +649,7 @@ export class MultiProviderAgentManager implements AgentManager { agentId, pid, () => this.handleDetachedAgentCompletion(agentId), () => this.activeAgents.get(agentId)?.tailer, + this.createEarlyCompletionChecker(agentId), ); activeEntry.cancelPoll = cancel; @@ -704,6 +721,7 @@ export class MultiProviderAgentManager implements AgentManager { agentId, pid, () => this.handleDetachedAgentCompletion(agentId), () => this.activeAgents.get(agentId)?.tailer, + this.createEarlyCompletionChecker(agentId), ); activeEntry.cancelPoll = cancel; @@ -890,6 +908,7 @@ export class MultiProviderAgentManager implements AgentManager { agentId, pid, () => this.handleDetachedAgentCompletion(agentId), () => this.activeAgents.get(agentId)?.tailer, + this.createEarlyCompletionChecker(agentId), ); resumeActiveEntry.cancelPoll = resumeCancel; } @@ -1013,6 +1032,7 @@ export class MultiProviderAgentManager implements AgentManager { agentId, pid, () => this.handleDetachedAgentCompletion(agentId), () => this.activeAgents.get(agentId)?.tailer, + this.createEarlyCompletionChecker(agentId), ); const active = this.activeAgents.get(agentId); if (active) active.cancelPoll = cancel; diff --git a/apps/server/agent/output-handler.ts b/apps/server/agent/output-handler.ts index 28fdaf6..4da2d0c 100644 --- a/apps/server/agent/output-handler.ts +++ b/apps/server/agent/output-handler.ts @@ -1133,7 +1133,7 @@ export class OutputHandler { * Uses SignalManager for atomic read-and-validate when available. * Returns the raw JSON string on success, null if missing/invalid. 
*/ - private async readSignalCompletion(agentWorkdir: string): Promise { + async readSignalCompletion(agentWorkdir: string): Promise { // Prefer SignalManager (unified implementation with proper validation) if (this.signalManager) { const signal = await this.signalManager.readSignal(agentWorkdir); diff --git a/apps/server/agent/process-manager.ts b/apps/server/agent/process-manager.ts index 144b348..105de84 100644 --- a/apps/server/agent/process-manager.ts +++ b/apps/server/agent/process-manager.ts @@ -328,27 +328,63 @@ export class ProcessManager { * When the process exits, calls onComplete callback. * Returns a cancel handle to stop polling (e.g. on agent cleanup or re-resume). * + * Optionally checks signal.json after a grace period to detect hung processes + * that completed work but failed to exit. If a valid signal is found while the + * process is still alive, SIGTERM is sent and normal completion proceeds. + * * @param onComplete - Called when the process is no longer alive * @param getTailer - Function to get the current tailer for final flush + * @param checkEarlyCompletion - Optional callback that returns true if signal.json indicates completion */ pollForCompletion( agentId: string, pid: number, onComplete: () => Promise, getTailer: () => FileTailer | undefined, + checkEarlyCompletion?: () => Promise, ): { cancel: () => void } { let cancelled = false; + const startTime = Date.now(); + const GRACE_PERIOD_MS = 60_000; + const SIGNAL_CHECK_INTERVAL_MS = 30_000; + let lastSignalCheck = 0; + + const finalize = async () => { + const tailer = getTailer(); + if (tailer) { + await new Promise((resolve) => setTimeout(resolve, 500)); + await tailer.stop(); + } + if (!cancelled) await onComplete(); + }; + const check = async () => { if (cancelled) return; if (!isPidAlive(pid)) { - const tailer = getTailer(); - if (tailer) { - await new Promise((resolve) => setTimeout(resolve, 500)); - await tailer.stop(); - } - if (!cancelled) await onComplete(); + await 
finalize(); return; } + + // Defensive signal check: after grace period, periodically check signal.json + if (checkEarlyCompletion) { + const elapsed = Date.now() - startTime; + if (elapsed >= GRACE_PERIOD_MS && Date.now() - lastSignalCheck >= SIGNAL_CHECK_INTERVAL_MS) { + lastSignalCheck = Date.now(); + try { + const hasSignal = await checkEarlyCompletion(); + if (hasSignal) { + log.warn({ agentId, pid, elapsedMs: elapsed }, 'signal.json found but process still alive — sending SIGTERM'); + try { process.kill(pid, 'SIGTERM'); } catch { /* already dead */ } + await new Promise((resolve) => setTimeout(resolve, 2000)); + await finalize(); + return; + } + } catch (err) { + log.debug({ agentId, err: err instanceof Error ? err.message : String(err) }, 'early completion check failed'); + } + } + } + if (!cancelled) setTimeout(check, 1000); }; check();