fix: detect hung agent processes via defensive signal.json polling
The Claude CLI occasionally hangs after writing signal.json and never exits. Add an optional signal check to pollForCompletion: after a 60s grace period, check signal.json every 30s. If a valid completion signal is found while the process is still alive, send it SIGTERM and proceed to normal completion handling.
This commit is contained in:
@@ -369,6 +369,7 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
agentId, pid,
|
||||
() => this.handleDetachedAgentCompletion(agentId),
|
||||
() => this.activeAgents.get(agentId)?.tailer,
|
||||
this.createEarlyCompletionChecker(agentId),
|
||||
);
|
||||
activeEntry.cancelPoll = cancel;
|
||||
|
||||
@@ -406,6 +407,20 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
return this.toAgentInfo(agent);
|
||||
}
|
||||
|
||||
/**
 * Build the hung-process probe that is handed to pollForCompletion.
 *
 * Each invocation of the returned function looks the agent up again, resolves
 * its working directory and asks the output handler for a completion signal.
 * It resolves to true when that read yields a non-null result, and to false
 * when the agent record is missing or has no worktreeId.
 *
 * @param agentId - Id used for the repository lookup on every probe
 * @returns Async predicate; failures of the lookup or the read reject the returned promise
 */
private createEarlyCompletionChecker(agentId: string): () => Promise<boolean> {
  const hasCompletionSignal = async (): Promise<boolean> => {
    const record = await this.repository.findById(agentId);
    const worktreeId = record?.worktreeId;
    if (!worktreeId) {
      return false;
    }

    const workdir = this.processManager.getAgentWorkdir(worktreeId);
    const completion = await this.outputHandler.readSignalCompletion(workdir);
    return completion !== null;
  };

  return hasCompletionSignal;
}
|
||||
|
||||
/**
|
||||
* Handle completion of a detached agent.
|
||||
*/
|
||||
@@ -525,6 +540,7 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
agentId, pid,
|
||||
() => this.handleDetachedAgentCompletion(agentId),
|
||||
() => this.activeAgents.get(agentId)?.tailer,
|
||||
this.createEarlyCompletionChecker(agentId),
|
||||
);
|
||||
commitActiveEntry.cancelPoll = commitCancel;
|
||||
|
||||
@@ -633,6 +649,7 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
agentId, pid,
|
||||
() => this.handleDetachedAgentCompletion(agentId),
|
||||
() => this.activeAgents.get(agentId)?.tailer,
|
||||
this.createEarlyCompletionChecker(agentId),
|
||||
);
|
||||
activeEntry.cancelPoll = cancel;
|
||||
|
||||
@@ -704,6 +721,7 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
agentId, pid,
|
||||
() => this.handleDetachedAgentCompletion(agentId),
|
||||
() => this.activeAgents.get(agentId)?.tailer,
|
||||
this.createEarlyCompletionChecker(agentId),
|
||||
);
|
||||
activeEntry.cancelPoll = cancel;
|
||||
|
||||
@@ -890,6 +908,7 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
agentId, pid,
|
||||
() => this.handleDetachedAgentCompletion(agentId),
|
||||
() => this.activeAgents.get(agentId)?.tailer,
|
||||
this.createEarlyCompletionChecker(agentId),
|
||||
);
|
||||
resumeActiveEntry.cancelPoll = resumeCancel;
|
||||
}
|
||||
@@ -1013,6 +1032,7 @@ export class MultiProviderAgentManager implements AgentManager {
|
||||
agentId, pid,
|
||||
() => this.handleDetachedAgentCompletion(agentId),
|
||||
() => this.activeAgents.get(agentId)?.tailer,
|
||||
this.createEarlyCompletionChecker(agentId),
|
||||
);
|
||||
const active = this.activeAgents.get(agentId);
|
||||
if (active) active.cancelPoll = cancel;
|
||||
|
||||
@@ -1133,7 +1133,7 @@ export class OutputHandler {
|
||||
* Uses SignalManager for atomic read-and-validate when available.
|
||||
* Returns the raw JSON string on success, null if missing/invalid.
|
||||
*/
|
||||
private async readSignalCompletion(agentWorkdir: string): Promise<string | null> {
|
||||
async readSignalCompletion(agentWorkdir: string): Promise<string | null> {
|
||||
// Prefer SignalManager (unified implementation with proper validation)
|
||||
if (this.signalManager) {
|
||||
const signal = await this.signalManager.readSignal(agentWorkdir);
|
||||
|
||||
@@ -328,27 +328,63 @@ export class ProcessManager {
|
||||
* When the process exits, calls onComplete callback.
|
||||
* Returns a cancel handle to stop polling (e.g. on agent cleanup or re-resume).
|
||||
*
|
||||
* Optionally checks signal.json after a grace period to detect hung processes
|
||||
* that completed work but failed to exit. If a valid signal is found while the
|
||||
* process is still alive, SIGTERM is sent and normal completion proceeds.
|
||||
*
|
||||
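* @param onComplete - Called once the process has exited, or after a still-running process with a valid signal has been sent SIGTERM; skipped if polling was cancelled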
|
||||
* @param getTailer - Function to get the current tailer for final flush
|
||||
* @param checkEarlyCompletion - Optional callback that returns true if signal.json indicates completion
|
||||
*/
|
||||
pollForCompletion(
|
||||
agentId: string,
|
||||
pid: number,
|
||||
onComplete: () => Promise<void>,
|
||||
getTailer: () => FileTailer | undefined,
|
||||
checkEarlyCompletion?: () => Promise<boolean>,
|
||||
): { cancel: () => void } {
|
||||
let cancelled = false;
|
||||
const startTime = Date.now();
|
||||
const GRACE_PERIOD_MS = 60_000;
|
||||
const SIGNAL_CHECK_INTERVAL_MS = 30_000;
|
||||
let lastSignalCheck = 0;
|
||||
|
||||
// Shared tail of every completion path: stop the tailer if there is one,
// then report completion unless the poll has been cancelled in the meantime.
const finalize = async (): Promise<void> => {
  const activeTailer = getTailer();
  if (activeTailer) {
    // Wait 500ms before stopping the tailer
    // (presumably so trailing output can still be picked up — confirm).
    await new Promise((resolve) => setTimeout(resolve, 500));
    await activeTailer.stop();
  }

  if (cancelled) return;
  await onComplete();
};
|
||||
|
||||
/**
 * Single polling tick; re-arms itself every second until the poll finishes or
 * is cancelled.
 *
 * - Process gone: hand off to finalize() and stop.
 * - Process alive and checkEarlyCompletion supplied: once GRACE_PERIOD_MS has
 *   passed since polling started, probe at most once per
 *   SIGNAL_CHECK_INTERVAL_MS. A positive probe means a completion signal
 *   exists while the process is still running, so the process is sent SIGTERM
 *   and finalize() runs.
 *
 * A probe that throws is logged at debug level and treated as "no signal".
 * Errors from finalize()/onComplete are NOT swallowed here; they propagate
 * exactly as they do on the normal exit path.
 */
const check = async (): Promise<void> => {
  if (cancelled) return;

  if (!isPidAlive(pid)) {
    // finalize() already flushes the tailer and invokes onComplete; repeating
    // that work inline before calling it would run onComplete twice.
    await finalize();
    return;
  }

  // Defensive signal check: after grace period, periodically check signal.json
  if (checkEarlyCompletion) {
    const now = Date.now();
    const elapsed = now - startTime;
    const graceOver = elapsed >= GRACE_PERIOD_MS;
    const probeDue = now - lastSignalCheck >= SIGNAL_CHECK_INTERVAL_MS;

    if (graceOver && probeDue) {
      lastSignalCheck = now;

      // Only the probe itself is best-effort. Keeping the kill and finalize()
      // outside this try prevents a completion-handler failure from being
      // mislabelled as a probe failure and from letting polling resume (which
      // would later complete the agent a second time).
      let hasSignal = false;
      try {
        hasSignal = await checkEarlyCompletion();
      } catch (err) {
        log.debug({ agentId, err: err instanceof Error ? err.message : String(err) }, 'early completion check failed');
      }

      // The probe is asynchronous: the poll may have been cancelled while it
      // was in flight, in which case this pid must not be signalled.
      if (cancelled) return;

      if (hasSignal) {
        log.warn({ agentId, pid, elapsedMs: elapsed }, 'signal.json found but process still alive — sending SIGTERM');
        try { process.kill(pid, 'SIGTERM'); } catch { /* already dead */ }
        // Give the process 2s to act on the signal before finalizing.
        // NOTE(review): a process that ignores SIGTERM is still running when
        // finalize() is called; no SIGKILL escalation is attempted.
        await new Promise((resolve) => setTimeout(resolve, 2000));
        await finalize();
        return;
      }
    }
  }

  if (!cancelled) setTimeout(check, 1000);
};
|
||||
check();
|
||||
|
||||
Reference in New Issue
Block a user