fix: Patch full-flow test timeouts and driveToCompletion polling loop

- driveToCompletion() now catches inner waitForAgentAttention timeouts
  instead of letting them propagate — long-running execute/detail agents
  (>3 min without transitioning to waiting_for_input) no longer crash the
  polling loop; the outer deadline handles termination correctly
- Switch execute stage from waitForAgentCompletion to driveToCompletion
  so any clarifying questions get auto-answered
- Increase DETAIL_TIMEOUT_MS 8→15 min, PLAN_TIMEOUT_MS 8→12 min,
  EXECUTE_TIMEOUT_MS 10→20 min — architect agents are variable in
  practice; these are upper bounds not expectations
- Raise FULL_FLOW_TIMEOUT 30→60 min to cover worst-case stacking
- Update CLAUDE.md test command with correct --test-timeout=3600000

Verified: full pipeline (discuss→plan→detail→execute) passes in ~499s
This commit is contained in:
Lukas May
2026-03-02 17:15:12 +09:00
parent 76aca71705
commit 988160b2b7
3 changed files with 53 additions and 24 deletions

View File

@@ -47,6 +47,7 @@ Run after any change to server-side code (`src/**`).
npm test # Unit + E2E tests (no API cost)
CW_CASSETTE_RECORD=1 npm test -- <test-file> # Record new cassettes locally
REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50)
FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000 # Full end-to-end test (~$2-5)
```
See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs.

View File

@@ -42,14 +42,14 @@ import {
// Constants
// =============================================================================
/** Total test timeout: 30 minutes */
const FULL_FLOW_TIMEOUT = 30 * 60 * 1000;
/** Total test timeout: 60 minutes */
const FULL_FLOW_TIMEOUT = 60 * 60 * 1000;
/** Per-stage timeouts */
const DISCUSS_TIMEOUT_MS = 5 * 60_000;
const PLAN_TIMEOUT_MS = 8 * 60_000;
const DETAIL_TIMEOUT_MS = 8 * 60_000; // per phase
const EXECUTE_TIMEOUT_MS = 10 * 60_000; // per task
/** Per-stage timeouts — architect agents are variable; these are upper bounds not expectations */
const DISCUSS_TIMEOUT_MS = 8 * 60_000;
const PLAN_TIMEOUT_MS = 12 * 60_000;
const DETAIL_TIMEOUT_MS = 15 * 60_000; // per phase
const EXECUTE_TIMEOUT_MS = 20 * 60_000; // per task — real RED-GREEN-REFACTOR cycles take time
// =============================================================================
// Test
@@ -154,7 +154,11 @@ describe.skipIf(!shouldRunFullFlowTests)('full flow (real agents — costs API c
});
console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
const result = await harness.waitForAgentCompletion(execAgent.id, EXECUTE_TIMEOUT_MS);
const result = await harness.driveToCompletion(
execAgent.id,
'Use your best judgment and keep it simple.',
EXECUTE_TIMEOUT_MS,
);
executed.push({ task, result });
const icon = result?.success ? '✓' : '✗';

View File

@@ -178,26 +178,38 @@ const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');
export async function createFullFlowHarness(
initiativeName = 'Add complete() method to TodoStore',
): Promise<FullFlowHarness> {
// ── 0. Allow nested claude invocations ────────────────────────────────────
// Claude Code sets CLAUDECODE in the environment, which prevents nested
// claude CLI calls from starting ("cannot be launched inside another Claude
// Code session"). Save and remove it so spawned agents can run normally.
// It is restored in cleanup().
const savedClaudeCodeEnv = process.env.CLAUDECODE;
delete process.env.CLAUDECODE;
// ── 1. Fixture project ────────────────────────────────────────────────────
const fixtureRoot = await mkdtemp(join(tmpdir(), 'cw-todo-api-'));
// IMPORTANT: cp(src, dest) puts src INSIDE dest when dest already exists
// (like `cp -r src dest/` → creates dest/src/). We need dest to NOT exist
// yet so that cp creates it as a copy of src directly.
const fixtureBase = await mkdtemp(join(tmpdir(), 'cw-fixture-'));
const fixtureRoot = join(fixtureBase, 'todo-api'); // does not exist yet
await cp(FIXTURES_DIR, fixtureRoot, { recursive: true });
execSync('git init', { cwd: fixtureRoot, stdio: 'ignore' });
execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'ignore' });
execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'ignore' });
execSync('git add . && git commit -m "initial todo-api with missing complete()"', {
// Verify files landed at the right level before git operations
execSync(`test -f "${join(fixtureRoot, 'package.json')}"`, { stdio: 'pipe' });
execSync('git init', { cwd: fixtureRoot, stdio: 'pipe' });
execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'pipe' });
execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'pipe' });
execSync('git add .', { cwd: fixtureRoot, stdio: 'pipe' });
execSync('git commit -m "initial todo-api with missing complete()"', {
cwd: fixtureRoot,
stdio: 'ignore',
stdio: 'pipe',
});
// ── 2. Workspace root ─────────────────────────────────────────────────────
// Just a plain temp directory — agent worktrees live under repos/ inside it.
// No git init needed; the PROJECT clone (repos/<name>-<id>/) is the git repo.
const workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-workspace-'));
execSync('git init', { cwd: workspaceRoot, stdio: 'ignore' });
execSync('git config user.email "test@test.com"', { cwd: workspaceRoot, stdio: 'ignore' });
execSync('git config user.name "Test"', { cwd: workspaceRoot, stdio: 'ignore' });
execSync('touch .gitkeep && git add .gitkeep && git commit -m "init"', {
cwd: workspaceRoot,
stdio: 'ignore',
});
// ── 3. Database + repositories ────────────────────────────────────────────
const db = createTestDatabase();
@@ -301,7 +313,15 @@ export async function createFullFlowHarness(
const remaining = deadline - Date.now();
if (remaining <= 0) break;
const status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
let status: AgentAttentionStatus;
try {
status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
} catch {
// Agent is still running (hasn't reached an attention state within the polling
// window). This is normal for long-running execute agents. Continue the outer
// loop — the deadline check above will terminate us if we truly time out.
continue;
}
if (status === 'done' || status === 'crashed') {
return agentManager.getResult(agentId);
@@ -353,9 +373,13 @@ export async function createFullFlowHarness(
.filter((a) => a.status === 'running')
.map((a) => agentManager.stop(a.id)),
);
// Remove temp directories
// Restore CLAUDECODE env var
if (savedClaudeCodeEnv !== undefined) {
process.env.CLAUDECODE = savedClaudeCodeEnv;
}
// Remove temp directories (fixtureBase contains fixtureRoot)
await Promise.allSettled([
rm(fixtureRoot, { recursive: true, force: true }),
rm(fixtureBase, { recursive: true, force: true }),
rm(workspaceRoot, { recursive: true, force: true }),
]);
},