From 988160b2b724604904c07877e2bdb7b9690a5ead Mon Sep 17 00:00:00 2001
From: Lukas May <lukas.may@carealytix.com>
Date: Mon, 2 Mar 2026 17:15:12 +0900
Subject: [PATCH] fix: Patch full-flow test timeouts and driveToCompletion
 polling loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- driveToCompletion() now catches inner waitForAgentAttention timeouts
  instead of letting them propagate — long-running execute/detail agents
  (>3 min without transitioning to waiting_for_input) no longer crash the
  polling loop; the outer deadline handles termination correctly
- Switch execute stage from waitForAgentCompletion to driveToCompletion
  so any clarifying questions get auto-answered
- Increase DISCUSS_TIMEOUT_MS 5→8 min, PLAN_TIMEOUT_MS 8→12 min,
  DETAIL_TIMEOUT_MS 8→15 min, EXECUTE_TIMEOUT_MS 10→20 min — architect
  agents are variable in practice; these are upper bounds, not
  expectations
- Raise FULL_FLOW_TIMEOUT 30→60 min to cover worst-case stacking
- Add the full-flow test command to CLAUDE.md with --test-timeout=3600000
  (60 min, matching FULL_FLOW_TIMEOUT)

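Harness setup/cleanup changes also included in this patch:
- Remove CLAUDECODE from process.env while the harness is alive so
  spawned agents can start a nested claude CLI; cleanup() restores it
- Copy the fixture into a not-yet-existing todo-api/ subdirectory of the
  temp dir, and check that package.json landed at its root before
  running git
- Drop the git init of the workspace root; it is now a plain temp
  directory

Verified: full pipeline (discuss→plan→detail→execute) passes in ~499s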
---
 CLAUDE.md                                     |  1 +
 .../integration/full-flow/full-flow.test.ts   | 20 ++++---
 src/test/integration/full-flow/harness.ts     | 56 +++++++++++++------
 3 files changed, 53 insertions(+), 24 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index ee6fb33..6e8dde4 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -47,6 +47,7 @@ Run after any change to server-side code (`src/**`).
 npm test                                                                         # Unit + E2E tests (no API cost)
 CW_CASSETTE_RECORD=1 npm test -- <test-file>                                    # Record new cassettes locally
 REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000  # Real provider tests (~$0.50)
+FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000        # Full end-to-end test (~$2-5)
 ```
 
 See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs.
diff --git a/src/test/integration/full-flow/full-flow.test.ts b/src/test/integration/full-flow/full-flow.test.ts
index 6516bfa..006fd36 100644
--- a/src/test/integration/full-flow/full-flow.test.ts
+++ b/src/test/integration/full-flow/full-flow.test.ts
@@ -42,14 +42,14 @@ import {
 // Constants
 // =============================================================================
 
-/** Total test timeout: 30 minutes */
-const FULL_FLOW_TIMEOUT = 30 * 60 * 1000;
+/** Total test timeout: 60 minutes */
+const FULL_FLOW_TIMEOUT = 60 * 60 * 1000;
 
-/** Per-stage timeouts */
-const DISCUSS_TIMEOUT_MS = 5 * 60_000;
-const PLAN_TIMEOUT_MS = 8 * 60_000;
-const DETAIL_TIMEOUT_MS = 8 * 60_000; // per phase
-const EXECUTE_TIMEOUT_MS = 10 * 60_000; // per task
+/** Per-stage timeouts — architect agents are variable; these are upper bounds not expectations */
+const DISCUSS_TIMEOUT_MS = 8 * 60_000;
+const PLAN_TIMEOUT_MS = 12 * 60_000;
+const DETAIL_TIMEOUT_MS = 15 * 60_000; // per phase
+const EXECUTE_TIMEOUT_MS = 20 * 60_000; // per task — real RED-GREEN-REFACTOR cycles take time
 
 // =============================================================================
 // Test
@@ -154,7 +154,11 @@ describe.skipIf(!shouldRunFullFlowTests)('full flow (real agents — costs API c
         });
         console.log(`    Agent: ${execAgent.name} (${execAgent.id})`);
 
-        const result = await harness.waitForAgentCompletion(execAgent.id, EXECUTE_TIMEOUT_MS);
+        const result = await harness.driveToCompletion(
+          execAgent.id,
+          'Use your best judgment and keep it simple.',
+          EXECUTE_TIMEOUT_MS,
+        );
         executed.push({ task, result });
 
         const icon = result?.success ? '✓' : '✗';
diff --git a/src/test/integration/full-flow/harness.ts b/src/test/integration/full-flow/harness.ts
index 4ff2331..56c31c1 100644
--- a/src/test/integration/full-flow/harness.ts
+++ b/src/test/integration/full-flow/harness.ts
@@ -178,26 +178,38 @@ const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');
 export async function createFullFlowHarness(
   initiativeName = 'Add complete() method to TodoStore',
 ): Promise<FullFlowHarness> {
+  // ── 0. Allow nested claude invocations ────────────────────────────────────
+  // Claude Code sets CLAUDECODE in the environment, which prevents nested
+  // claude CLI calls from starting ("cannot be launched inside another Claude
+  // Code session").  Save and remove it so spawned agents can run normally.
+  // It is restored in cleanup().
+  const savedClaudeCodeEnv = process.env.CLAUDECODE;
+  delete process.env.CLAUDECODE;
+
   // ── 1. Fixture project ────────────────────────────────────────────────────
-  const fixtureRoot = await mkdtemp(join(tmpdir(), 'cw-todo-api-'));
+  // IMPORTANT: cp(src, dest) puts src INSIDE dest when dest already exists
+  // (like `cp -r src dest/` → creates dest/src/).  We need dest to NOT exist
+  // yet so that cp creates it as a copy of src directly.
+  const fixtureBase = await mkdtemp(join(tmpdir(), 'cw-fixture-'));
+  const fixtureRoot = join(fixtureBase, 'todo-api'); // does not exist yet
   await cp(FIXTURES_DIR, fixtureRoot, { recursive: true });
-  execSync('git init', { cwd: fixtureRoot, stdio: 'ignore' });
-  execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'ignore' });
-  execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'ignore' });
-  execSync('git add . && git commit -m "initial todo-api with missing complete()"', {
+
+  // Verify files landed at the right level before git operations
+  execSync(`test -f "${join(fixtureRoot, 'package.json')}"`, { stdio: 'pipe' });
+
+  execSync('git init', { cwd: fixtureRoot, stdio: 'pipe' });
+  execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'pipe' });
+  execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'pipe' });
+  execSync('git add .', { cwd: fixtureRoot, stdio: 'pipe' });
+  execSync('git commit -m "initial todo-api with missing complete()"', {
     cwd: fixtureRoot,
-    stdio: 'ignore',
+    stdio: 'pipe',
   });
 
   // ── 2. Workspace root ─────────────────────────────────────────────────────
+  // Just a plain temp directory — agent worktrees live under repos/ inside it.
+  // No git init needed; the PROJECT clone (repos/<name>-<id>/) is the git repo.
   const workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-workspace-'));
-  execSync('git init', { cwd: workspaceRoot, stdio: 'ignore' });
-  execSync('git config user.email "test@test.com"', { cwd: workspaceRoot, stdio: 'ignore' });
-  execSync('git config user.name "Test"', { cwd: workspaceRoot, stdio: 'ignore' });
-  execSync('touch .gitkeep && git add .gitkeep && git commit -m "init"', {
-    cwd: workspaceRoot,
-    stdio: 'ignore',
-  });
 
   // ── 3. Database + repositories ────────────────────────────────────────────
   const db = createTestDatabase();
@@ -301,7 +313,15 @@ export async function createFullFlowHarness(
       const remaining = deadline - Date.now();
       if (remaining <= 0) break;
 
-      const status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
+      let status: AgentAttentionStatus;
+      try {
+        status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
+      } catch {
+        // Agent is still running (hasn't reached an attention state within the polling
+        // window). This is normal for long-running execute agents. Continue the outer
+        // loop — the deadline check above will terminate us if we truly time out.
+        continue;
+      }
 
       if (status === 'done' || status === 'crashed') {
         return agentManager.getResult(agentId);
@@ -353,9 +373,13 @@ export async function createFullFlowHarness(
           .filter((a) => a.status === 'running')
           .map((a) => agentManager.stop(a.id)),
       );
-      // Remove temp directories
+      // Restore CLAUDECODE env var
+      if (savedClaudeCodeEnv !== undefined) {
+        process.env.CLAUDECODE = savedClaudeCodeEnv;
+      }
+      // Remove temp directories (fixtureBase contains fixtureRoot)
       await Promise.allSettled([
-        rm(fixtureRoot, { recursive: true, force: true }),
+        rm(fixtureBase, { recursive: true, force: true }),
         rm(workspaceRoot, { recursive: true, force: true }),
       ]);
     },