fix: detect hung agent processes via defensive signal.json polling

Claude CLI occasionally hangs after writing signal.json and never exits.
Add an optional signal check to pollForCompletion: after a 60s grace
period, check signal.json every 30s. If a valid completion signal is
found while the process is still alive, SIGTERM it and proceed to
normal completion handling.
This commit is contained in:
Lukas May
2026-03-06 21:23:19 +01:00
parent 388befd7c3
commit 56efc0bad6
3 changed files with 63 additions and 7 deletions

View File

@@ -369,6 +369,7 @@ export class MultiProviderAgentManager implements AgentManager {
agentId, pid,
() => this.handleDetachedAgentCompletion(agentId),
() => this.activeAgents.get(agentId)?.tailer,
this.createEarlyCompletionChecker(agentId),
);
activeEntry.cancelPoll = cancel;
@@ -406,6 +407,20 @@ export class MultiProviderAgentManager implements AgentManager {
return this.toAgentInfo(agent);
}
/**
 * Build the "early completion" probe handed to pollForCompletion.
 *
 * The returned function looks the agent up in the repository, resolves its
 * working directory from the worktree id, and asks the output handler for a
 * completion signal in that directory.
 *
 * @param agentId - Id of the agent whose signal should be probed
 * @returns An async predicate that resolves to true when
 *          readSignalCompletion yields a non-null result, and to false when
 *          the agent is missing, has no worktree id, or no signal was read.
 */
private createEarlyCompletionChecker(agentId: string): () => Promise<boolean> {
  const hasCompletionSignal = async (): Promise<boolean> => {
    const record = await this.repository.findById(agentId);
    const worktreeId = record?.worktreeId;
    // Without a worktree there is no directory to inspect.
    if (!worktreeId) {
      return false;
    }

    const workdir = this.processManager.getAgentWorkdir(worktreeId);
    const rawSignal = await this.outputHandler.readSignalCompletion(workdir);
    return rawSignal !== null;
  };

  return hasCompletionSignal;
}
/**
* Handle completion of a detached agent.
*/
@@ -525,6 +540,7 @@ export class MultiProviderAgentManager implements AgentManager {
agentId, pid,
() => this.handleDetachedAgentCompletion(agentId),
() => this.activeAgents.get(agentId)?.tailer,
this.createEarlyCompletionChecker(agentId),
);
commitActiveEntry.cancelPoll = commitCancel;
@@ -633,6 +649,7 @@ export class MultiProviderAgentManager implements AgentManager {
agentId, pid,
() => this.handleDetachedAgentCompletion(agentId),
() => this.activeAgents.get(agentId)?.tailer,
this.createEarlyCompletionChecker(agentId),
);
activeEntry.cancelPoll = cancel;
@@ -704,6 +721,7 @@ export class MultiProviderAgentManager implements AgentManager {
agentId, pid,
() => this.handleDetachedAgentCompletion(agentId),
() => this.activeAgents.get(agentId)?.tailer,
this.createEarlyCompletionChecker(agentId),
);
activeEntry.cancelPoll = cancel;
@@ -890,6 +908,7 @@ export class MultiProviderAgentManager implements AgentManager {
agentId, pid,
() => this.handleDetachedAgentCompletion(agentId),
() => this.activeAgents.get(agentId)?.tailer,
this.createEarlyCompletionChecker(agentId),
);
resumeActiveEntry.cancelPoll = resumeCancel;
}
@@ -1013,6 +1032,7 @@ export class MultiProviderAgentManager implements AgentManager {
agentId, pid,
() => this.handleDetachedAgentCompletion(agentId),
() => this.activeAgents.get(agentId)?.tailer,
this.createEarlyCompletionChecker(agentId),
);
const active = this.activeAgents.get(agentId);
if (active) active.cancelPoll = cancel;

View File

@@ -1133,7 +1133,7 @@ export class OutputHandler {
* Uses SignalManager for atomic read-and-validate when available.
* Returns the raw JSON string on success, null if missing/invalid.
*/
private async readSignalCompletion(agentWorkdir: string): Promise<string | null> {
async readSignalCompletion(agentWorkdir: string): Promise<string | null> {
// Prefer SignalManager (unified implementation with proper validation)
if (this.signalManager) {
const signal = await this.signalManager.readSignal(agentWorkdir);

View File

@@ -328,27 +328,63 @@ export class ProcessManager {
* When the process exits, calls onComplete callback.
* Returns a cancel handle to stop polling (e.g. on agent cleanup or re-resume).
*
* Optionally checks signal.json after a grace period to detect hung processes
* that completed work but failed to exit. If a valid signal is found while the
* process is still alive, SIGTERM is sent and normal completion proceeds.
*
* @param onComplete - Called when the process is no longer alive
* @param getTailer - Function to get the current tailer for final flush
* @param checkEarlyCompletion - Optional callback that returns true if signal.json indicates completion
*/
pollForCompletion(
agentId: string,
pid: number,
onComplete: () => Promise<void>,
getTailer: () => FileTailer | undefined,
checkEarlyCompletion?: () => Promise<boolean>,
): { cancel: () => void } {
let cancelled = false;
const check = async () => {
if (cancelled) return;
if (!isPidAlive(pid)) {
const startTime = Date.now();
const GRACE_PERIOD_MS = 60_000;
const SIGNAL_CHECK_INTERVAL_MS = 30_000;
let lastSignalCheck = 0;
const finalize = async () => {
const tailer = getTailer();
if (tailer) {
await new Promise((resolve) => setTimeout(resolve, 500));
await tailer.stop();
}
if (!cancelled) await onComplete();
};
const check = async () => {
if (cancelled) return;
if (!isPidAlive(pid)) {
await finalize();
return;
}
// Defensive signal check: after grace period, periodically check signal.json
if (checkEarlyCompletion) {
const elapsed = Date.now() - startTime;
if (elapsed >= GRACE_PERIOD_MS && Date.now() - lastSignalCheck >= SIGNAL_CHECK_INTERVAL_MS) {
lastSignalCheck = Date.now();
try {
const hasSignal = await checkEarlyCompletion();
if (hasSignal) {
log.warn({ agentId, pid, elapsedMs: elapsed }, 'signal.json found but process still alive — sending SIGTERM');
try { process.kill(pid, 'SIGTERM'); } catch { /* already dead */ }
await new Promise((resolve) => setTimeout(resolve, 2000));
await finalize();
return;
}
} catch (err) {
log.debug({ agentId, err: err instanceof Error ? err.message : String(err) }, 'early completion check failed');
}
}
}
if (!cancelled) setTimeout(check, 1000);
};
check();