fix: Patch full-flow test timeouts and driveToCompletion polling loop
- driveToCompletion() now catches inner waitForAgentAttention timeouts instead of letting them propagate — long-running execute/detail agents (>3 min without transitioning to waiting_for_input) no longer crash the polling loop; the outer deadline handles termination correctly - Switch execute stage from waitForAgentCompletion to driveToCompletion so any clarifying questions get auto-answered - Increase DETAIL_TIMEOUT_MS 8→15 min, PLAN_TIMEOUT_MS 8→12 min, EXECUTE_TIMEOUT_MS 10→20 min — architect agents are variable in practice; these are upper bounds not expectations - Raise FULL_FLOW_TIMEOUT 30→60 min to cover worst-case stacking - Update CLAUDE.md test command with correct --test-timeout=3600000 Verified: full pipeline (discuss→plan→detail→execute) passes in ~499s
This commit is contained in:
@@ -47,6 +47,7 @@ Run after any change to server-side code (`src/**`).
|
||||
npm test # Unit + E2E tests (no API cost)
|
||||
CW_CASSETTE_RECORD=1 npm test -- <test-file> # Record new cassettes locally
|
||||
REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50)
|
||||
FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000 # Full end-to-end test (~$2-5)
|
||||
```
|
||||
|
||||
See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs.
|
||||
|
||||
@@ -42,14 +42,14 @@ import {
|
||||
// Constants
|
||||
// =============================================================================
|
||||
|
||||
/** Total test timeout: 30 minutes */
|
||||
const FULL_FLOW_TIMEOUT = 30 * 60 * 1000;
|
||||
/** Total test timeout: 60 minutes */
|
||||
const FULL_FLOW_TIMEOUT = 60 * 60 * 1000;
|
||||
|
||||
/** Per-stage timeouts */
|
||||
const DISCUSS_TIMEOUT_MS = 5 * 60_000;
|
||||
const PLAN_TIMEOUT_MS = 8 * 60_000;
|
||||
const DETAIL_TIMEOUT_MS = 8 * 60_000; // per phase
|
||||
const EXECUTE_TIMEOUT_MS = 10 * 60_000; // per task
|
||||
/** Per-stage timeouts — architect agents are variable; these are upper bounds not expectations */
|
||||
const DISCUSS_TIMEOUT_MS = 8 * 60_000;
|
||||
const PLAN_TIMEOUT_MS = 12 * 60_000;
|
||||
const DETAIL_TIMEOUT_MS = 15 * 60_000; // per phase
|
||||
const EXECUTE_TIMEOUT_MS = 20 * 60_000; // per task — real RED-GREEN-REFACTOR cycles take time
|
||||
|
||||
// =============================================================================
|
||||
// Test
|
||||
@@ -154,7 +154,11 @@ describe.skipIf(!shouldRunFullFlowTests)('full flow (real agents — costs API c
|
||||
});
|
||||
console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
|
||||
|
||||
const result = await harness.waitForAgentCompletion(execAgent.id, EXECUTE_TIMEOUT_MS);
|
||||
const result = await harness.driveToCompletion(
|
||||
execAgent.id,
|
||||
'Use your best judgment and keep it simple.',
|
||||
EXECUTE_TIMEOUT_MS,
|
||||
);
|
||||
executed.push({ task, result });
|
||||
|
||||
const icon = result?.success ? '✓' : '✗';
|
||||
|
||||
@@ -178,26 +178,38 @@ const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');
|
||||
export async function createFullFlowHarness(
|
||||
initiativeName = 'Add complete() method to TodoStore',
|
||||
): Promise<FullFlowHarness> {
|
||||
// ── 0. Allow nested claude invocations ────────────────────────────────────
|
||||
// Claude Code sets CLAUDECODE in the environment, which prevents nested
|
||||
// claude CLI calls from starting ("cannot be launched inside another Claude
|
||||
// Code session"). Save and remove it so spawned agents can run normally.
|
||||
// It is restored in cleanup().
|
||||
const savedClaudeCodeEnv = process.env.CLAUDECODE;
|
||||
delete process.env.CLAUDECODE;
|
||||
|
||||
// ── 1. Fixture project ────────────────────────────────────────────────────
|
||||
const fixtureRoot = await mkdtemp(join(tmpdir(), 'cw-todo-api-'));
|
||||
// IMPORTANT: cp(src, dest) puts src INSIDE dest when dest already exists
|
||||
// (like `cp -r src dest/` → creates dest/src/). We need dest to NOT exist
|
||||
// yet so that cp creates it as a copy of src directly.
|
||||
const fixtureBase = await mkdtemp(join(tmpdir(), 'cw-fixture-'));
|
||||
const fixtureRoot = join(fixtureBase, 'todo-api'); // does not exist yet
|
||||
await cp(FIXTURES_DIR, fixtureRoot, { recursive: true });
|
||||
execSync('git init', { cwd: fixtureRoot, stdio: 'ignore' });
|
||||
execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'ignore' });
|
||||
execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'ignore' });
|
||||
execSync('git add . && git commit -m "initial todo-api with missing complete()"', {
|
||||
|
||||
// Verify files landed at the right level before git operations
|
||||
execSync(`test -f "${join(fixtureRoot, 'package.json')}"`, { stdio: 'pipe' });
|
||||
|
||||
execSync('git init', { cwd: fixtureRoot, stdio: 'pipe' });
|
||||
execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'pipe' });
|
||||
execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'pipe' });
|
||||
execSync('git add .', { cwd: fixtureRoot, stdio: 'pipe' });
|
||||
execSync('git commit -m "initial todo-api with missing complete()"', {
|
||||
cwd: fixtureRoot,
|
||||
stdio: 'ignore',
|
||||
stdio: 'pipe',
|
||||
});
|
||||
|
||||
// ── 2. Workspace root ─────────────────────────────────────────────────────
|
||||
// Just a plain temp directory — agent worktrees live under repos/ inside it.
|
||||
// No git init needed; the PROJECT clone (repos/<name>-<id>/) is the git repo.
|
||||
const workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-workspace-'));
|
||||
execSync('git init', { cwd: workspaceRoot, stdio: 'ignore' });
|
||||
execSync('git config user.email "test@test.com"', { cwd: workspaceRoot, stdio: 'ignore' });
|
||||
execSync('git config user.name "Test"', { cwd: workspaceRoot, stdio: 'ignore' });
|
||||
execSync('touch .gitkeep && git add .gitkeep && git commit -m "init"', {
|
||||
cwd: workspaceRoot,
|
||||
stdio: 'ignore',
|
||||
});
|
||||
|
||||
// ── 3. Database + repositories ────────────────────────────────────────────
|
||||
const db = createTestDatabase();
|
||||
@@ -301,7 +313,15 @@ export async function createFullFlowHarness(
|
||||
const remaining = deadline - Date.now();
|
||||
if (remaining <= 0) break;
|
||||
|
||||
const status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
|
||||
let status: AgentAttentionStatus;
|
||||
try {
|
||||
status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
|
||||
} catch {
|
||||
// Agent is still running (hasn't reached an attention state within the polling
|
||||
// window). This is normal for long-running execute agents. Continue the outer
|
||||
// loop — the deadline check above will terminate us if we truly time out.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (status === 'done' || status === 'crashed') {
|
||||
return agentManager.getResult(agentId);
|
||||
@@ -353,9 +373,13 @@ export async function createFullFlowHarness(
|
||||
.filter((a) => a.status === 'running')
|
||||
.map((a) => agentManager.stop(a.id)),
|
||||
);
|
||||
// Remove temp directories
|
||||
// Restore CLAUDECODE env var
|
||||
if (savedClaudeCodeEnv !== undefined) {
|
||||
process.env.CLAUDECODE = savedClaudeCodeEnv;
|
||||
}
|
||||
// Remove temp directories (fixtureBase contains fixtureRoot)
|
||||
await Promise.allSettled([
|
||||
rm(fixtureRoot, { recursive: true, force: true }),
|
||||
rm(fixtureBase, { recursive: true, force: true }),
|
||||
rm(workspaceRoot, { recursive: true, force: true }),
|
||||
]);
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user