Files
Codewalkers/apps/server/agent/process-manager.ts
Lukas May 56efc0bad6 fix: detect hung agent processes via defensive signal.json polling
Claude CLI occasionally hangs after writing signal.json but never exits.
Add an optional signal check to pollForCompletion: after a 60s grace
period, check signal.json every 30s. If a valid completion signal is
found while the process is still alive, SIGTERM it and proceed to
normal completion handling.
2026-03-06 21:23:19 +01:00

432 lines
13 KiB
TypeScript

/**
* ProcessManager — Subprocess lifecycle, worktree creation, command building.
*
* Extracted from MultiProviderAgentManager. Manages the spawning of detached
* subprocesses, worktree creation per project, and provider-specific command
* construction.
*/
import { spawn } from 'node:child_process';
import { openSync, closeSync, existsSync } from 'node:fs';
import { mkdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import type { ProjectRepository } from '../db/repositories/project-repository.js';
import type { AgentProviderConfig } from './providers/types.js';
import type { StreamEvent } from './providers/parsers/index.js';
import { getStreamParser } from './providers/parsers/index.js';
import { SimpleGitWorktreeManager } from '../git/manager.js';
import { ensureProjectClone, getProjectCloneDir } from '../git/project-clones.js';
import { FileTailer } from './file-tailer.js';
import { createModuleLogger } from '../logger/index.js';
const log = createModuleLogger('process-manager');
/**
 * Check if a process with the given PID is still alive.
 *
 * Sends signal 0, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param pid - Process ID to probe. Non-integer or non-positive values are
 *   rejected up front: 0 and negative PIDs address process groups rather than
 *   a single process, so probing them would not answer the question asked.
 * @returns true when the process exists (including when it exists but belongs
 *   to another user), false otherwise.
 */
export function isPidAlive(pid: number): boolean {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM means the process exists but we are not allowed to signal it.
    // Every other failure (ESRCH, invalid argument, ...) is treated as "not alive".
    return typeof err === 'object' && err !== null && (err as { code?: unknown }).code === 'EPERM';
  }
}
export class ProcessManager {
  /**
   * @param workspaceRoot - Root directory under which agent workdirs and agent logs are created
   * @param projectRepository - Repository used to look up the projects linked to an initiative
   */
  constructor(
    private readonly workspaceRoot: string,
    private readonly projectRepository: ProjectRepository,
  ) {}

  /**
   * Resolve the agent's working directory path:
   * `<workspaceRoot>/agent-workdirs/<alias>`.
   */
  getAgentWorkdir(alias: string): string {
    return join(this.workspaceRoot, 'agent-workdirs', alias);
  }

  /**
   * Create worktrees for all projects linked to an initiative.
   *
   * Falls back to {@link createStandaloneWorktree} when the initiative has no
   * linked projects.
   *
   * @param alias - Agent alias; used for the workdir and the default branch name `agent/<alias>`
   * @param initiativeId - Initiative whose linked projects get a worktree each
   * @param baseBranch - Optional base branch; defaults to each project's `defaultBranch`
   * @param branchName - Optional branch name; defaults to `agent/<alias>`
   * @returns The base agent workdir path (or the standalone worktree path on fallback)
   * @throws Error when a worktree path does not exist after creation
   */
  async createProjectWorktrees(
    alias: string,
    initiativeId: string,
    baseBranch?: string,
    branchName?: string,
  ): Promise<string> {
    const projects = await this.projectRepository.findProjectsByInitiativeId(initiativeId);
    const agentWorkdir = this.getAgentWorkdir(alias);

    log.debug({
      alias,
      initiativeId,
      projectCount: projects.length,
      agentWorkdir,
      baseBranch
    }, 'creating project worktrees');

    // No linked projects — fall back to standalone worktree so the agent
    // always has a git-backed working directory.
    if (projects.length === 0) {
      log.info({ alias, initiativeId }, 'initiative has no linked projects, falling back to standalone worktree');
      return this.createStandaloneWorktree(alias);
    }

    // Worktrees are created one project at a time, in repository order.
    for (const project of projects) {
      const clonePath = await ensureProjectClone(project, this.workspaceRoot);
      const worktreeManager = new SimpleGitWorktreeManager(clonePath, undefined, agentWorkdir);
      const effectiveBaseBranch = baseBranch ?? project.defaultBranch;
      const worktree = await worktreeManager.create(project.name, branchName ?? `agent/${alias}`, effectiveBaseBranch);
      const worktreePath = worktree.path;
      const pathExists = existsSync(worktreePath);

      log.debug({
        alias,
        agentWorkdir,
        projectName: project.name,
        worktreePath,
        pathExists
      }, 'worktree created');

      if (!pathExists) {
        log.error({ worktreePath }, 'Worktree path does not exist after creation!');
        throw new Error(`Worktree creation failed: ${worktreePath}`);
      }
    }

    return agentWorkdir;
  }

  /**
   * Fallback: create a single "workspace" worktree for standalone agents.
   *
   * @returns The path of the created worktree (not the base agent workdir)
   * @throws Error when the worktree path does not exist after creation
   */
  async createStandaloneWorktree(alias: string): Promise<string> {
    const agentWorkdir = this.getAgentWorkdir(alias);
    const worktreeManager = new SimpleGitWorktreeManager(this.workspaceRoot, undefined, agentWorkdir);

    log.debug({ alias, agentWorkdir }, 'creating standalone worktree');

    const worktree = await worktreeManager.create('workspace', `agent/${alias}`);
    const worktreePath = worktree.path;
    const pathExists = existsSync(worktreePath);

    log.debug({
      alias,
      agentWorkdir,
      worktreePath,
      pathExists
    }, 'standalone worktree created');

    if (!pathExists) {
      log.error({ worktreePath }, 'Standalone worktree path does not exist after creation!');
      throw new Error(`Standalone worktree creation failed: ${worktreePath}`);
    }

    return worktreePath;
  }

  /**
   * Append the prompt and output-format arguments shared by the spawn and
   * resume commands. Mutates `args` in place.
   */
  private appendPromptAndOutputArgs(
    provider: AgentProviderConfig,
    args: string[],
    prompt: string,
  ): void {
    if (provider.promptMode === 'native') {
      args.push('-p', prompt);
    } else if (provider.promptMode === 'flag' && provider.nonInteractive?.promptFlag) {
      args.push(provider.nonInteractive.promptFlag, prompt);
    }

    const outputFlag = provider.nonInteractive?.outputFlag;
    if (outputFlag) {
      // Split on any run of whitespace so repeated or leading/trailing spaces
      // never produce empty-string arguments.
      args.push(...outputFlag.split(/\s+/).filter((token) => token.length > 0));
    }
  }

  /**
   * Build the spawn command for a given provider configuration.
   *
   * Argument order: optional non-interactive subcommand, the provider's own
   * args, the prompt (per `promptMode`), then the output flag tokens.
   * The provider's `args` and `env` are copied, never mutated.
   */
  buildSpawnCommand(
    provider: AgentProviderConfig,
    prompt: string,
  ): { command: string; args: string[]; env: Record<string, string> } {
    const args = [...provider.args];
    const env: Record<string, string> = { ...provider.env };

    if (provider.nonInteractive?.subcommand) {
      args.unshift(provider.nonInteractive.subcommand);
    }

    this.appendPromptAndOutputArgs(provider, args, prompt);

    return { command: provider.command, args, env };
  }

  /**
   * Build the resume command for a given provider configuration.
   *
   * @throws Error when the provider's `resumeStyle` is `'none'`, or when the
   *   style requires a `resumeFlag` that is not configured
   */
  buildResumeCommand(
    provider: AgentProviderConfig,
    sessionId: string,
    prompt: string,
  ): { command: string; args: string[]; env: Record<string, string> } {
    const args = [...provider.args];
    const env: Record<string, string> = { ...provider.env };

    // Fail early instead of pushing `undefined` into the argument vector.
    const requireResumeFlag = (): string => {
      if (!provider.resumeFlag) {
        throw new Error(
          `Provider '${provider.name}' uses resumeStyle '${provider.resumeStyle}' but has no resumeFlag configured`,
        );
      }
      return provider.resumeFlag;
    };

    switch (provider.resumeStyle) {
      case 'flag':
        args.push(requireResumeFlag(), sessionId);
        break;
      case 'subcommand':
        if (provider.nonInteractive?.subcommand) {
          args.unshift(provider.nonInteractive.subcommand);
        }
        args.push(requireResumeFlag(), sessionId);
        break;
      case 'none':
        throw new Error(`Provider '${provider.name}' does not support resume`);
    }

    this.appendPromptAndOutputArgs(provider, args, prompt);

    return { command: provider.command, args, env };
  }

  /**
   * Extract session ID from CLI output based on provider config.
   *
   * - `extractFrom === 'result'`: the whole output is parsed as one JSON value
   *   and the configured field is read from it.
   * - `extractFrom === 'event'`: the output is scanned line by line; the first
   *   JSON line whose `type` equals the configured `eventType` supplies the field.
   *
   * @returns The extracted value, or null when the provider has no session-ID
   *   config, nothing matches, or parsing fails
   */
  extractSessionId(
    provider: AgentProviderConfig,
    output: string,
  ): string | null {
    const sessionConfig = provider.sessionId;
    if (!sessionConfig) return null;

    try {
      if (sessionConfig.extractFrom === 'result') {
        const parsed = JSON.parse(output);
        return parsed[sessionConfig.field] ?? null;
      }

      if (sessionConfig.extractFrom === 'event') {
        for (const line of output.trim().split('\n')) {
          try {
            const event = JSON.parse(line);
            if (event.type === sessionConfig.eventType) {
              return event[sessionConfig.field] ?? null;
            }
          } catch {
            // Skip non-JSON lines
          }
        }
      }
    } catch {
      // Parse failure — fall through to null
    }

    return null;
  }

  /**
   * Open the stdout/stderr log files, spawn the detached child with its output
   * redirected into them, and close the parent's copies of the descriptors —
   * also when opening the second file or spawning throws.
   */
  private spawnWithRedirectedOutput(
    command: string,
    args: string[],
    cwd: string,
    env: Record<string, string>,
    stdoutPath: string,
    stderrPath: string,
  ) {
    const stdoutFd = openSync(stdoutPath, 'w');
    try {
      const stderrFd = openSync(stderrPath, 'w');
      try {
        return spawn(command, args, {
          cwd,
          env: { ...process.env, ...env },
          detached: true,
          stdio: ['ignore', stdoutFd, stderrFd],
        });
      } finally {
        closeSync(stderrFd);
      }
    } finally {
      closeSync(stdoutFd);
    }
  }

  /**
   * Spawn a detached subprocess with file redirection for crash resilience.
   * The subprocess writes directly to files and survives server crashes.
   * A FileTailer watches the output file and emits events in real-time.
   *
   * Logs are written to `<workspaceRoot>/.cw/agent-logs/<agentName>/`
   * (`output.jsonl`, `stderr.log`, and `PROMPT.md` when a prompt is given).
   *
   * @param agentId - Agent identifier, used for logging and passed to the tailer
   * @param agentName - Agent name, used as the log directory name
   * @param command - Executable to run
   * @param args - Arguments for the executable
   * @param cwd - Working directory for the subprocess; must already exist
   * @param env - Extra environment variables, layered over `process.env`
   * @param providerName - Provider name used to select the stream parser
   * @param prompt - Optional prompt text, persisted as `PROMPT.md` next to the logs
   * @param onEvent - Callback for stream events from the tailer
   * @param onRawContent - Optional callback handed to the tailer for raw content
   * @returns The child PID, the stdout file path, and the started tailer
   * @throws Error when `cwd` does not exist or the process could not be spawned
   */
  async spawnDetached(
    agentId: string,
    agentName: string,
    command: string,
    args: string[],
    cwd: string,
    env: Record<string, string>,
    providerName: string,
    prompt?: string,
    onEvent?: (event: StreamEvent) => void,
    onRawContent?: (content: string) => void,
  ): Promise<{ pid: number; outputFilePath: string; tailer: FileTailer }> {
    // Pre-spawn validation and logging
    const cwdExists = existsSync(cwd);
    const commandWithArgs = [command, ...args].join(' ');

    // Log environment variables that might affect working directory
    const environmentInfo = {
      PWD: process.env.PWD,
      HOME: process.env.HOME,
      CLAUDE_CONFIG_DIR: env.CLAUDE_CONFIG_DIR,
      CW_CONFIG_DIR: env.CW_CONFIG_DIR
    };

    log.info({
      agentId,
      cwd,
      cwdExists,
      commandWithArgs,
      providerName,
      environmentInfo
    }, 'spawning detached process with workdir validation');

    if (!cwdExists) {
      log.error({ cwd }, 'CWD does not exist before spawn!');
      throw new Error(`Agent working directory does not exist: ${cwd}`);
    }

    const logDir = join(this.workspaceRoot, '.cw', 'agent-logs', agentName);
    await mkdir(logDir, { recursive: true });
    const outputFilePath = join(logDir, 'output.jsonl');
    const stderrFilePath = join(logDir, 'stderr.log');

    if (prompt) {
      await writeFile(join(logDir, 'PROMPT.md'), prompt, 'utf-8');
    }

    const child = this.spawnWithRedirectedOutput(command, args, cwd, env, outputFilePath, stderrFilePath);

    // Spawn failures (e.g. command not found) are reported asynchronously via
    // the 'error' event; without a listener that event would be thrown as an
    // uncaught exception and take the whole server down.
    child.on('error', (err: Error) => {
      log.error({ agentId, command, cwd, err: err.message }, 'detached process emitted an error');
    });
    child.unref();

    const pid = child.pid;
    if (pid === undefined) {
      // No PID means the process never started; the 'error' listener above logs the cause.
      throw new Error(`Failed to spawn agent process '${command}': no PID was assigned`);
    }

    log.info({
      agentId,
      pid,
      command,
      args: args.join(' '),
      cwd,
      spawnSuccess: true
    }, 'spawned detached process successfully');

    const parser = getStreamParser(providerName);
    const tailer = new FileTailer({
      filePath: outputFilePath,
      agentId,
      parser,
      onEvent: onEvent ?? (() => {}),
      startFromBeginning: true,
      onRawContent,
    });

    // The tailer is started in the background; a start failure is logged, not thrown.
    tailer.start().catch((err) => {
      log.warn({ agentId, err: err instanceof Error ? err.message : String(err) }, 'failed to start tailer');
    });

    return { pid, outputFilePath, tailer };
  }

  /**
   * Poll for process completion by checking if PID is still alive.
   * When the process exits, calls onComplete callback.
   * Returns a cancel handle to stop polling (e.g. on agent cleanup or re-resume).
   *
   * Optionally checks signal.json after a grace period to detect hung processes
   * that completed work but failed to exit. If a valid signal is found while the
   * process is still alive, SIGTERM is sent and normal completion proceeds.
   *
   * Errors thrown by `onComplete` are logged and end the polling loop; they
   * never surface as unhandled promise rejections, and `onComplete` is invoked
   * at most once. After `cancel()` no further signals are sent and
   * `onComplete` is not called.
   *
   * @param agentId - Agent identifier, used for logging only
   * @param pid - PID of the process to watch
   * @param onComplete - Called when the process is no longer alive
   * @param getTailer - Function to get the current tailer for final flush
   * @param checkEarlyCompletion - Optional callback that returns true if signal.json indicates completion
   * @returns Handle whose `cancel()` stops polling and clears the pending timer
   */
  pollForCompletion(
    agentId: string,
    pid: number,
    onComplete: () => Promise<void>,
    getTailer: () => FileTailer | undefined,
    checkEarlyCompletion?: () => Promise<boolean>,
  ): { cancel: () => void } {
    const GRACE_PERIOD_MS = 60_000;
    const SIGNAL_CHECK_INTERVAL_MS = 30_000;
    const POLL_INTERVAL_MS = 1000;
    const TAILER_FLUSH_DELAY_MS = 500;
    const SIGTERM_SETTLE_MS = 2000;

    const startTime = Date.now();
    let cancelled = false;
    let lastSignalCheck = 0;
    let pendingTimer: ReturnType<typeof setTimeout> | undefined;

    const sleep = (ms: number): Promise<void> =>
      new Promise<void>((resolve) => setTimeout(resolve, ms));
    const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

    // Give the tailer a moment to pick up trailing output, stop it, then notify the caller.
    const finalize = async (): Promise<void> => {
      const tailer = getTailer();
      if (tailer) {
        await sleep(TAILER_FLUSH_DELAY_MS);
        try {
          await tailer.stop();
        } catch (err) {
          // A failed flush must not prevent completion handling.
          log.warn({ agentId, pid, err: describe(err) }, 'failed to stop tailer during completion');
        }
      }
      if (!cancelled) await onComplete();
    };

    const check = async (): Promise<void> => {
      if (cancelled) return;

      if (!isPidAlive(pid)) {
        await finalize();
        return;
      }

      // Defensive signal check: after grace period, periodically check signal.json
      if (checkEarlyCompletion) {
        const now = Date.now();
        const elapsed = now - startTime;
        if (elapsed >= GRACE_PERIOD_MS && now - lastSignalCheck >= SIGNAL_CHECK_INTERVAL_MS) {
          lastSignalCheck = now;

          // Only the signal check itself is best-effort; failures in completion
          // handling below must not be mistaken for a failed check and retried.
          let hasSignal = false;
          try {
            hasSignal = await checkEarlyCompletion();
          } catch (err) {
            log.debug({ agentId, err: describe(err) }, 'early completion check failed');
          }
          if (cancelled) return;

          if (hasSignal) {
            log.warn({ agentId, pid, elapsedMs: elapsed }, 'signal.json found but process still alive — sending SIGTERM');
            try { process.kill(pid, 'SIGTERM'); } catch { /* already dead */ }
            await sleep(SIGTERM_SETTLE_MS);
            if (cancelled) return;
            if (isPidAlive(pid)) {
              log.warn({ agentId, pid }, 'process still alive after SIGTERM — proceeding with completion handling');
            }
            await finalize();
            return;
          }
        }
      }

      if (!cancelled) pendingTimer = setTimeout(runCheck, POLL_INTERVAL_MS);
    };

    // Every invocation goes through here so a rejection is always observed.
    const runCheck = (): void => {
      check().catch((err) => {
        log.error({ agentId, pid, err: describe(err) }, 'completion polling failed');
      });
    };

    runCheck();

    return {
      cancel: () => {
        cancelled = true;
        if (pendingTimer !== undefined) {
          clearTimeout(pendingTimer);
          pendingTimer = undefined;
        }
      },
    };
  }

  /**
   * Wait for a process to complete with Promise-based API.
   * Returns when the process is no longer alive.
   *
   * @param pid - PID of the process to wait for
   * @param timeoutMs - Maximum time to wait, in milliseconds (default 300000)
   * @returns `{ exitCode: null }` — the exit code of a detached process is not available here
   * @throws Error (rejection) when the process is still alive after `timeoutMs`
   */
  async waitForProcessCompletion(pid: number, timeoutMs: number = 300000): Promise<{ exitCode: number | null }> {
    return new Promise((resolve, reject) => {
      const startTime = Date.now();

      const check = () => {
        if (!isPidAlive(pid)) {
          // Process has exited, try to get exit code
          // Note: Getting exact exit code from detached process is limited
          resolve({ exitCode: null });
          return;
        }

        if (Date.now() - startTime > timeoutMs) {
          reject(new Error(`Process ${pid} did not complete within ${timeoutMs}ms`));
          return;
        }

        // Still running and within budget — look again in one second.
        setTimeout(check, 1000);
      };

      check();
    });
  }

  /**
   * Get the exit code of a completed process.
   * Limited implementation since we use detached processes.
   *
   * @returns Always null in the current implementation
   */
  async getExitCode(pid: number): Promise<number | null> {
    // For detached processes, we can't easily get the exit code
    // This would need to be enhanced with process tracking
    return null;
  }
}