refactor: Restructure monorepo to apps/server/ and apps/web/ layout
Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt standard monorepo conventions (apps/ for runnable apps, packages/ for reusable libraries). Update all config files, shared package imports, test fixtures, and documentation to reflect new paths. Key fixes: - Update workspace config to ["apps/*", "packages/*"] - Update tsconfig.json rootDir/include for apps/server/ - Add apps/web/** to vitest exclude list - Update drizzle.config.ts schema path - Fix ensure-schema.ts migration path detection (3 levels up in dev, 2 levels up in dist) - Fix tests/integration/cli-server.test.ts import paths - Update packages/shared imports to apps/server/ paths - Update all docs/ files with new paths
This commit is contained in:
203
apps/server/test/integration/agent-workdir-verification.test.ts
Normal file
203
apps/server/test/integration/agent-workdir-verification.test.ts
Normal file
@@ -0,0 +1,203 @@
|
||||
/**
|
||||
* Agent Working Directory Verification Tests
|
||||
*
|
||||
* Tests that verify agents actually run in their intended working directories.
|
||||
* These tests use simple shell commands to prove the agent execution location.
|
||||
*
|
||||
* IMPORTANT: These tests spawn real CLI processes and may incur API costs.
|
||||
* They are SKIPPED by default to prevent accidental charges.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_WORKDIR_TESTS=1 npm test -- src/test/integration/agent-workdir-verification.test.ts --test-timeout=120000
|
||||
* ```
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { mkdtemp, rm, readFile } from 'node:fs/promises';
|
||||
import { existsSync } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { MultiProviderAgentManager } from '../../agent/manager.js';
|
||||
import { createTestDatabase } from '../../db/repositories/drizzle/test-helpers.js';
|
||||
import {
|
||||
DrizzleAgentRepository,
|
||||
DrizzleProjectRepository,
|
||||
DrizzleAccountRepository,
|
||||
DrizzleInitiativeRepository,
|
||||
} from '../../db/repositories/drizzle/index.js';
|
||||
import { EventEmitterBus } from '../../events/bus.js';
|
||||
|
||||
const SHOULD_SKIP = !process.env.REAL_WORKDIR_TESTS;
|
||||
const TEST_TIMEOUT = 60000;
|
||||
|
||||
describe.skipIf(SHOULD_SKIP)('Agent Working Directory Verification', () => {
|
||||
let tempDir: string;
|
||||
let agentManager: MultiProviderAgentManager;
|
||||
let agentRepository: DrizzleAgentRepository;
|
||||
|
||||
beforeAll(async () => {
|
||||
if (SHOULD_SKIP) return;
|
||||
|
||||
console.log('\n=== Running Agent Working Directory Tests ===');
|
||||
console.log('These tests verify agents run in correct working directories.\n');
|
||||
|
||||
// Create temp directory for test workspace
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'cw-workdir-test-'));
|
||||
|
||||
// Set up test database and repositories
|
||||
const db = await createTestDatabase();
|
||||
const eventBus = new EventEmitterBus();
|
||||
|
||||
agentRepository = new DrizzleAgentRepository(db);
|
||||
const projectRepository = new DrizzleProjectRepository(db);
|
||||
const accountRepository = new DrizzleAccountRepository(db);
|
||||
|
||||
agentManager = new MultiProviderAgentManager(
|
||||
agentRepository,
|
||||
tempDir,
|
||||
projectRepository,
|
||||
accountRepository,
|
||||
eventBus,
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (SHOULD_SKIP || !tempDir) return;
|
||||
try {
|
||||
await rm(tempDir, { recursive: true });
|
||||
} catch (err) {
|
||||
console.warn('Failed to cleanup temp directory:', err);
|
||||
}
|
||||
});
|
||||
|
||||
it('spawns agent in correct standalone working directory', async () => {
|
||||
const prompt = `
|
||||
Write your current working directory to a file called 'verify-pwd.txt'.
|
||||
Use this exact bash command:
|
||||
|
||||
pwd > verify-pwd.txt
|
||||
|
||||
Then output the signal: {"done": true}
|
||||
`.trim();
|
||||
|
||||
// Spawn standalone agent
|
||||
const agent = await agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
expect(agent.id).toBeTruthy();
|
||||
expect(agent.status).toBe('running');
|
||||
|
||||
// Wait for completion (poll agent status)
|
||||
let attempts = 0;
|
||||
const maxAttempts = 60; // 60 seconds timeout
|
||||
|
||||
while (attempts < maxAttempts) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
attempts++;
|
||||
|
||||
const currentAgent = await agentRepository.findById(agent.id);
|
||||
if (!currentAgent || currentAgent.status !== 'running') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Verify final agent state
|
||||
const completedAgent = await agentRepository.findById(agent.id);
|
||||
expect(completedAgent).toBeTruthy();
|
||||
expect(completedAgent!.status).not.toBe('running');
|
||||
|
||||
// Get the agent's expected working directory
|
||||
const expectedWorkdir = join(tempDir, 'agent-workdirs', agent.name, 'workspace');
|
||||
|
||||
// Read diagnostic files
|
||||
const diagnosticFile = join(expectedWorkdir, '.cw', 'spawn-diagnostic.json');
|
||||
const expectedPwdFile = join(expectedWorkdir, '.cw', 'expected-pwd.txt');
|
||||
const verifyPwdFile = join(expectedWorkdir, 'verify-pwd.txt');
|
||||
|
||||
// Verify diagnostic files exist
|
||||
expect(existsSync(diagnosticFile), 'spawn diagnostic file should exist').toBe(true);
|
||||
expect(existsSync(expectedPwdFile), 'expected pwd file should exist').toBe(true);
|
||||
|
||||
// Read diagnostic data
|
||||
const diagnostic = JSON.parse(await readFile(diagnosticFile, 'utf-8'));
|
||||
const expectedPwd = (await readFile(expectedPwdFile, 'utf-8')).trim();
|
||||
|
||||
console.log('Diagnostic data:', diagnostic);
|
||||
console.log('Expected working directory:', expectedPwd);
|
||||
|
||||
// Verify diagnostic consistency
|
||||
expect(diagnostic.intendedCwd).toBe(expectedWorkdir);
|
||||
expect(diagnostic.cwdExistsAtSpawn).toBe(true);
|
||||
expect(expectedPwd).toBe(expectedWorkdir);
|
||||
|
||||
// The critical test: verify the agent actually wrote the file in the expected location
|
||||
if (existsSync(verifyPwdFile)) {
|
||||
const actualPwd = (await readFile(verifyPwdFile, 'utf-8')).trim();
|
||||
console.log('Agent reported working directory:', actualPwd);
|
||||
|
||||
// This is the key verification: the pwd reported by the agent should match expected
|
||||
expect(actualPwd).toBe(expectedWorkdir);
|
||||
} else {
|
||||
// If the file doesn't exist, the agent either failed or ran somewhere else
|
||||
console.warn('Agent did not create verify-pwd.txt file');
|
||||
console.log('Expected at:', verifyPwdFile);
|
||||
|
||||
// Let's check if it was created elsewhere (debugging)
|
||||
const alternativeLocations = [
|
||||
join(tempDir, 'verify-pwd.txt'),
|
||||
join(process.cwd(), 'verify-pwd.txt'),
|
||||
];
|
||||
|
||||
for (const loc of alternativeLocations) {
|
||||
if (existsSync(loc)) {
|
||||
const content = await readFile(loc, 'utf-8');
|
||||
console.log(`Found verify-pwd.txt at unexpected location ${loc}:`, content.trim());
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error('Agent did not create pwd verification file in expected location');
|
||||
}
|
||||
}, TEST_TIMEOUT);
|
||||
|
||||
it('creates diagnostic files with correct metadata', async () => {
|
||||
const prompt = `Output the signal: {"done": true}`;
|
||||
|
||||
const agent = await agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait a bit for spawn to complete
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
|
||||
const expectedWorkdir = join(tempDir, 'agent-workdirs', agent.name, 'workspace');
|
||||
const diagnosticFile = join(expectedWorkdir, '.cw', 'spawn-diagnostic.json');
|
||||
const expectedPwdFile = join(expectedWorkdir, '.cw', 'expected-pwd.txt');
|
||||
|
||||
// Verify files exist immediately after spawn
|
||||
expect(existsSync(diagnosticFile), 'diagnostic file should be created after spawn').toBe(true);
|
||||
expect(existsSync(expectedPwdFile), 'expected pwd file should be created').toBe(true);
|
||||
|
||||
// Verify diagnostic content
|
||||
const diagnostic = JSON.parse(await readFile(diagnosticFile, 'utf-8'));
|
||||
const expectedPwd = (await readFile(expectedPwdFile, 'utf-8')).trim();
|
||||
|
||||
expect(diagnostic.agentId).toBe(agent.id);
|
||||
expect(diagnostic.alias).toBe(agent.name);
|
||||
expect(diagnostic.intendedCwd).toBe(expectedWorkdir);
|
||||
expect(diagnostic.provider).toBe('claude');
|
||||
expect(diagnostic.cwdExistsAtSpawn).toBe(true);
|
||||
expect(diagnostic.customCwdProvided).toBe(false);
|
||||
expect(typeof diagnostic.timestamp).toBe('string');
|
||||
expect(Array.isArray(diagnostic.args)).toBe(true);
|
||||
|
||||
expect(expectedPwd).toBe(expectedWorkdir);
|
||||
});
|
||||
});
|
||||
232
apps/server/test/integration/crash-race-condition.test.ts
Normal file
232
apps/server/test/integration/crash-race-condition.test.ts
Normal file
@@ -0,0 +1,232 @@
|
||||
/**
|
||||
* Integration test to reproduce and fix the crash marking race condition.
|
||||
*
|
||||
* This test simulates the exact scenario where agents complete successfully
|
||||
* but get marked as crashed due to timing issues in the output handler.
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
||||
import { writeFile, mkdir, rm } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { randomBytes } from 'node:crypto';
|
||||
import { OutputHandler } from '../../agent/output-handler.js';
|
||||
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
|
||||
|
||||
interface TestAgent {
|
||||
id: string;
|
||||
name: string;
|
||||
status: 'idle' | 'running' | 'waiting_for_input' | 'stopped' | 'crashed';
|
||||
mode: 'execute' | 'discuss' | 'plan' | 'detail' | 'refine';
|
||||
taskId: string | null;
|
||||
sessionId: string | null;
|
||||
worktreeId: string;
|
||||
createdAt: Date;
|
||||
updatedAt: Date;
|
||||
provider: string;
|
||||
accountId: string | null;
|
||||
pid: number | null;
|
||||
outputFilePath: string | null;
|
||||
result: string | null;
|
||||
pendingQuestions: string | null;
|
||||
initiativeId: string | null;
|
||||
userDismissedAt: Date | null;
|
||||
exitCode: number | null;
|
||||
}
|
||||
|
||||
describe('Crash marking race condition', () => {
|
||||
let outputHandler: OutputHandler;
|
||||
let testAgent: TestAgent;
|
||||
let testDir: string;
|
||||
let mockRepo: AgentRepository;
|
||||
|
||||
// Track all repository calls
|
||||
let updateCalls: Array<{ id: string; data: any }> = [];
|
||||
let finalAgentStatus: string | null = null;
|
||||
|
||||
beforeEach(async () => {
|
||||
updateCalls = [];
|
||||
finalAgentStatus = null;
|
||||
|
||||
// Create test directory structure
|
||||
testDir = join(tmpdir(), `crash-test-${randomBytes(8).toString('hex')}`);
|
||||
const outputDir = join(testDir, '.cw/output');
|
||||
await mkdir(outputDir, { recursive: true });
|
||||
|
||||
// Create test agent
|
||||
testAgent = {
|
||||
id: 'test-agent-id',
|
||||
name: 'test-agent',
|
||||
status: 'running',
|
||||
mode: 'refine',
|
||||
taskId: 'task-1',
|
||||
sessionId: 'session-1',
|
||||
worktreeId: 'worktree-1',
|
||||
createdAt: new Date(),
|
||||
updatedAt: new Date(),
|
||||
provider: 'claude',
|
||||
accountId: null,
|
||||
pid: 12345,
|
||||
outputFilePath: join(testDir, 'output.jsonl'),
|
||||
result: null,
|
||||
pendingQuestions: null,
|
||||
initiativeId: 'init-1',
|
||||
userDismissedAt: null,
|
||||
exitCode: null
|
||||
};
|
||||
|
||||
// Mock repository that tracks all update calls
|
||||
mockRepo = {
|
||||
async findById(id: string) {
|
||||
return id === testAgent.id ? { ...testAgent } : null;
|
||||
},
|
||||
async update(id: string, data: any) {
|
||||
updateCalls.push({ id, data });
|
||||
if (data.status) {
|
||||
finalAgentStatus = data.status;
|
||||
testAgent.status = data.status;
|
||||
}
|
||||
return { ...testAgent, ...data };
|
||||
},
|
||||
async create() { throw new Error('Not implemented'); },
|
||||
async findAll() { throw new Error('Not implemented'); },
|
||||
async findByStatus() { throw new Error('Not implemented'); },
|
||||
async findByTaskId() { throw new Error('Not implemented'); },
|
||||
async findByName() { throw new Error('Not implemented'); },
|
||||
async findBySessionId() { throw new Error('Not implemented'); },
|
||||
async delete() { throw new Error('Not implemented'); }
|
||||
};
|
||||
|
||||
outputHandler = new OutputHandler(mockRepo);
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
try {
|
||||
await rm(testDir, { recursive: true });
|
||||
} catch {
|
||||
// Ignore cleanup errors
|
||||
}
|
||||
});
|
||||
|
||||
it('should NOT mark agent as crashed when signal.json indicates completion', async () => {
|
||||
// SETUP: Create a valid completion signal that should prevent crash marking
|
||||
const signalPath = join(testDir, '.cw/output/signal.json');
|
||||
const signalContent = {
|
||||
status: 'questions',
|
||||
questions: [
|
||||
{ id: 'q1', question: 'Test question?' }
|
||||
]
|
||||
};
|
||||
await writeFile(signalPath, JSON.stringify(signalContent, null, 2));
|
||||
|
||||
// SETUP: Create empty output file to simulate "no new output detected" scenario
|
||||
const outputFilePath = join(testDir, 'output.jsonl');
|
||||
await writeFile(outputFilePath, ''); // Empty file simulates the race condition
|
||||
|
||||
// Mock active agent with output file path
|
||||
const mockActive = {
|
||||
outputFilePath,
|
||||
streamSessionId: 'session-1'
|
||||
};
|
||||
|
||||
// Mock getAgentWorkdir function — receives worktreeId, not agentId
|
||||
const getAgentWorkdir = (worktreeId: string) => {
|
||||
expect(worktreeId).toBe(testAgent.worktreeId);
|
||||
return testDir;
|
||||
};
|
||||
|
||||
// EXECUTE: Call handleCompletion which should trigger the race condition scenario
|
||||
// This simulates: no stream text + no new file content + valid signal.json
|
||||
await (outputHandler as any).handleCompletion(
|
||||
testAgent.id,
|
||||
mockActive,
|
||||
getAgentWorkdir
|
||||
);
|
||||
|
||||
// VERIFY: Agent should NOT be marked as crashed
|
||||
console.log('Update calls:', updateCalls);
|
||||
console.log('Final agent status:', finalAgentStatus);
|
||||
|
||||
expect(updateCalls.length).toBeGreaterThan(0);
|
||||
expect(finalAgentStatus).not.toBe('crashed');
|
||||
|
||||
// Should be marked with the appropriate completion status
|
||||
expect(['idle', 'waiting_for_input', 'stopped']).toContain(finalAgentStatus);
|
||||
});
|
||||
|
||||
it('should mark agent as crashed when no completion signal exists', async () => {
|
||||
// SETUP: No signal.json file exists - agent should be marked as crashed
|
||||
const outputFilePath = join(testDir, 'output.jsonl');
|
||||
await writeFile(outputFilePath, ''); // Empty file
|
||||
|
||||
const mockActive = {
|
||||
outputFilePath,
|
||||
streamSessionId: 'session-1'
|
||||
};
|
||||
|
||||
const getAgentWorkdir = (agentId: string) => testDir;
|
||||
|
||||
// EXECUTE: This should mark agent as crashed since no completion signal exists
|
||||
await (outputHandler as any).handleCompletion(
|
||||
testAgent.id,
|
||||
mockActive,
|
||||
getAgentWorkdir
|
||||
);
|
||||
|
||||
// VERIFY: Agent SHOULD be marked as crashed
|
||||
expect(finalAgentStatus).toBe('crashed');
|
||||
});
|
||||
|
||||
it('should handle the exact slim-wildebeest scenario', async () => {
|
||||
// SETUP: Reproduce the exact conditions that slim-wildebeest had
|
||||
const signalPath = join(testDir, '.cw/output/signal.json');
|
||||
const exactSignalContent = {
|
||||
"status": "questions",
|
||||
"questions": [
|
||||
{
|
||||
"id": "q1",
|
||||
"question": "What UI framework/styling system is the admin UI currently using that needs to be replaced?"
|
||||
},
|
||||
{
|
||||
"id": "q2",
|
||||
"question": "What specific problems with the current admin UI are we solving? (e.g., poor developer experience, design inconsistency, performance issues, lack of accessibility)"
|
||||
}
|
||||
]
|
||||
};
|
||||
await writeFile(signalPath, JSON.stringify(exactSignalContent, null, 2));
|
||||
|
||||
// Create SUMMARY.md like slim-wildebeest had
|
||||
const summaryPath = join(testDir, '.cw/output/SUMMARY.md');
|
||||
const summaryContent = `---
|
||||
files_modified: []
|
||||
---
|
||||
Initiative page is essentially empty — lacks context, scope, goals, and technical approach. Requested clarification on current state, problems being solved, scope boundaries, and success criteria before proposing meaningful improvements.`;
|
||||
await writeFile(summaryPath, summaryContent);
|
||||
|
||||
// Simulate the output file scenario
|
||||
const outputFilePath = join(testDir, 'output.jsonl');
|
||||
await writeFile(outputFilePath, 'some initial content\n'); // Some content but no new lines
|
||||
|
||||
const mockActive = {
|
||||
outputFilePath,
|
||||
streamSessionId: 'session-1'
|
||||
};
|
||||
|
||||
const getAgentWorkdir = (agentId: string) => testDir;
|
||||
|
||||
// EXECUTE: This is the exact scenario that caused slim-wildebeest to be marked as crashed
|
||||
await (outputHandler as any).handleCompletion(
|
||||
testAgent.id,
|
||||
mockActive,
|
||||
getAgentWorkdir
|
||||
);
|
||||
|
||||
// VERIFY: This should NOT be marked as crashed
|
||||
console.log('slim-wildebeest scenario - Final status:', finalAgentStatus);
|
||||
console.log('slim-wildebeest scenario - Update calls:', updateCalls);
|
||||
|
||||
expect(finalAgentStatus).not.toBe('crashed');
|
||||
expect(['idle', 'waiting_for_input', 'stopped']).toContain(finalAgentStatus);
|
||||
});
|
||||
|
||||
});
|
||||
@@ -0,0 +1,244 @@
|
||||
/**
|
||||
* Full-Flow Cassette Integration Test
|
||||
*
|
||||
* Cassette-backed variant of the full multi-agent workflow test.
|
||||
* Runs the same discuss → plan → detail → execute pipeline but intercepts
|
||||
* subprocess spawning with CassetteProcessManager — no real API calls in CI.
|
||||
*
|
||||
* Recording (one-time, costs ~$2–5):
|
||||
* CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
|
||||
* # Commit the generated src/test/cassettes/<hash>.json files afterward
|
||||
*
|
||||
* Replay (default — runs in seconds):
|
||||
* npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts
|
||||
*
|
||||
* Force re-record (overwrites existing cassettes):
|
||||
* CW_CASSETTE_FORCE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { existsSync, readdirSync } from 'node:fs';
|
||||
import { join, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import type { Phase, Task } from '../../../db/schema.js';
|
||||
import type { AgentResult } from '../../../agent/types.js';
|
||||
import { buildExecutePrompt } from '../../../agent/prompts/index.js';
|
||||
import { CassetteStore } from '../../cassette/store.js';
|
||||
import { CassetteProcessManager, type CassetteMode } from '../../cassette/process-manager.js';
|
||||
import {
|
||||
createFullFlowHarness,
|
||||
type FullFlowHarness,
|
||||
} from './harness.js';
|
||||
import {
|
||||
printHeader,
|
||||
printDiscussResult,
|
||||
printPlanResult,
|
||||
printDetailResult,
|
||||
printExecuteResult,
|
||||
printFinalSummary,
|
||||
type ExecutedTask,
|
||||
} from './report.js';
|
||||
|
||||
// =============================================================================
|
||||
// Constants
|
||||
// =============================================================================
|
||||
|
||||
const RECORDING =
|
||||
process.env.CW_CASSETTE_FORCE_RECORD === '1' || process.env.CW_CASSETTE_RECORD === '1';
|
||||
|
||||
/**
|
||||
* Test timeout.
|
||||
* - Replay: 5 min (cassettes complete in seconds; cap is generous headroom)
|
||||
* - Record: 60 min (real agents doing discuss/plan/detail/execute take API time)
|
||||
*/
|
||||
const CASSETTE_FLOW_TIMEOUT = RECORDING ? 60 * 60_000 : 5 * 60_000;
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const CASSETTE_DIR =
|
||||
process.env.CW_CASSETTE_DIR ?? join(__dirname, '../../cassettes');
|
||||
|
||||
// =============================================================================
|
||||
// Mode helper
|
||||
// =============================================================================
|
||||
|
||||
function cassetteMode(): CassetteMode {
|
||||
if (process.env.CW_CASSETTE_FORCE_RECORD === '1') return 'record';
|
||||
if (process.env.CW_CASSETTE_RECORD === '1') return 'auto';
|
||||
return 'replay';
|
||||
}
|
||||
|
||||
/**
|
||||
* True when cassettes are available (at least one .json file) OR we're in a
|
||||
* recording run. Skips the suite if no cassettes have been recorded yet so
|
||||
* that `npm test` doesn't fail on a fresh clone before cassettes are committed.
|
||||
*/
|
||||
function cassettesAvailable(): boolean {
|
||||
const mode = cassetteMode();
|
||||
if (mode !== 'replay') return true; // recording runs always proceed
|
||||
if (!existsSync(CASSETTE_DIR)) return false;
|
||||
return readdirSync(CASSETTE_DIR).some((f) => f.endsWith('.json'));
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Test
|
||||
// =============================================================================
|
||||
|
||||
describe.skipIf(!cassettesAvailable())('full flow (cassette replay)', () => {
|
||||
let harness: FullFlowHarness;
|
||||
const startedAt = Date.now();
|
||||
|
||||
beforeAll(async () => {
|
||||
const store = new CassetteStore(CASSETTE_DIR);
|
||||
const mode = cassetteMode();
|
||||
|
||||
harness = await createFullFlowHarness('Add complete() method to TodoStore', {
|
||||
processManagerFactory: (workspaceRoot, projectRepo) =>
|
||||
new CassetteProcessManager(workspaceRoot, projectRepo, store, mode),
|
||||
});
|
||||
|
||||
printHeader(harness.initiative.name);
|
||||
console.log(` Cassette mode : ${mode}`);
|
||||
console.log(` Cassette dir : ${CASSETTE_DIR}`);
|
||||
console.log(` Initiative ID : ${harness.initiative.id}`);
|
||||
console.log(` Workspace : ${harness.workspaceRoot}`);
|
||||
}, CASSETTE_FLOW_TIMEOUT);
|
||||
|
||||
afterAll(async () => {
|
||||
if (harness) await harness.cleanup();
|
||||
});
|
||||
|
||||
it(
|
||||
'runs the complete multi-agent workflow from cassettes',
|
||||
async () => {
|
||||
const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness;
|
||||
const initiativeId = initiative.id;
|
||||
|
||||
// ── Stage 2: Discuss ───────────────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 2: DISCUSS <<<');
|
||||
const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId });
|
||||
expect(discussAgent.id).toBeTruthy();
|
||||
console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`);
|
||||
|
||||
const discussResult = await harness.driveToCompletion(
|
||||
discussAgent.id,
|
||||
'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
printDiscussResult(discussAgent.id, discussResult);
|
||||
|
||||
if (!discussResult?.success) {
|
||||
console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage');
|
||||
}
|
||||
|
||||
// ── Stage 3: Plan ──────────────────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 3: PLAN <<<');
|
||||
const planAgent = await caller.spawnArchitectPlan({ initiativeId });
|
||||
expect(planAgent.id).toBeTruthy();
|
||||
console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`);
|
||||
|
||||
const planResult = await harness.driveToCompletion(
|
||||
planAgent.id,
|
||||
'Keep it simple.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
expect(planResult).toBeTruthy();
|
||||
|
||||
const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId);
|
||||
expect(phases.length).toBeGreaterThan(0);
|
||||
printPlanResult(phases);
|
||||
|
||||
// ── Stage 4: Detail (per phase) ────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 4: DETAIL <<<');
|
||||
for (const phase of phases) {
|
||||
const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id });
|
||||
expect(detailAgent.id).toBeTruthy();
|
||||
console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`);
|
||||
|
||||
const detailResult = await harness.driveToCompletion(
|
||||
detailAgent.id,
|
||||
'Keep it simple.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
expect(detailResult).toBeTruthy();
|
||||
|
||||
const phaseTasks = await taskRepository.findByPhaseId(phase.id);
|
||||
const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
|
||||
expect(executeTasks.length).toBeGreaterThan(0);
|
||||
printDetailResult(phase, phaseTasks);
|
||||
}
|
||||
|
||||
// ── Stage 5: Execute ───────────────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 5: EXECUTE <<<');
|
||||
const allTasks = await gatherAllExecuteTasks(taskRepository, phases);
|
||||
console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`);
|
||||
|
||||
const executed: ExecutedTask[] = [];
|
||||
for (const task of allTasks) {
|
||||
console.log(` Spawning execute agent for: "${task.name}"`);
|
||||
const execAgent = await agentManager.spawn({
|
||||
taskId: task.id,
|
||||
prompt: buildExecutePrompt(task.description ?? task.name),
|
||||
mode: 'execute',
|
||||
initiativeId,
|
||||
phaseId: task.phaseId ?? undefined,
|
||||
inputContext: {
|
||||
initiative,
|
||||
task,
|
||||
},
|
||||
});
|
||||
console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
|
||||
|
||||
const result = await harness.driveToCompletion(
|
||||
execAgent.id,
|
||||
'Use your best judgment and keep it simple.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
executed.push({ task, result });
|
||||
|
||||
const icon = result?.success ? '✓' : '✗';
|
||||
console.log(` ${icon} Completed with success=${result?.success ?? null}`);
|
||||
if (result && !result.success) {
|
||||
console.log(` Message: ${result.message?.slice(0, 200)}`);
|
||||
}
|
||||
}
|
||||
|
||||
printExecuteResult(executed);
|
||||
|
||||
// ── Assertions ─────────────────────────────────────────────────────────
|
||||
expect(executed.length).toBeGreaterThan(0);
|
||||
|
||||
const allSucceeded = executed.every((e) => e.result?.success === true);
|
||||
if (!allSucceeded) {
|
||||
const failed = executed.filter((e) => !e.result?.success);
|
||||
console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`);
|
||||
}
|
||||
|
||||
// ── Final summary ──────────────────────────────────────────────────────
|
||||
printFinalSummary(
|
||||
initiative.name,
|
||||
phases,
|
||||
allTasks,
|
||||
executed,
|
||||
Date.now() - startedAt,
|
||||
);
|
||||
},
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
// Helpers
|
||||
// =============================================================================
|
||||
|
||||
async function gatherAllExecuteTasks(
|
||||
taskRepository: FullFlowHarness['taskRepository'],
|
||||
phases: Phase[],
|
||||
): Promise<Task[]> {
|
||||
const result: Task[] = [];
|
||||
for (const phase of phases) {
|
||||
const phaseTasks = await taskRepository.findByPhaseId(phase.id);
|
||||
const execTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
|
||||
result.push(...execTasks);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
399
apps/server/test/integration/full-flow/harness.ts
Normal file
399
apps/server/test/integration/full-flow/harness.ts
Normal file
@@ -0,0 +1,399 @@
|
||||
/**
|
||||
* Full-Flow Test Harness
|
||||
*
|
||||
* Wires up the complete system with real agents for end-to-end multi-agent
|
||||
* workflow testing: discuss → plan → detail → execute.
|
||||
*
|
||||
* Unlike the standard TestHarness (MockAgentManager) or RealProviderHarness
|
||||
* (agents only), this harness adds:
|
||||
* - All 11 repositories
|
||||
* - tRPC caller for architect/agent procedures
|
||||
* - A self-contained fixture git repo (todo-api) for agents to work on
|
||||
* - Helpers for driving agents through question/answer loops
|
||||
*
|
||||
* Used by full-flow-cassette.test.ts (replay) and for manual recording runs.
|
||||
*/
|
||||
|
||||
import { mkdtemp, rm, cp } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { execSync } from 'node:child_process';
|
||||
import type { DrizzleDatabase } from '../../../db/index.js';
|
||||
import type { DomainEvent } from '../../../events/types.js';
|
||||
import { EventEmitterBus } from '../../../events/bus.js';
|
||||
import { MultiProviderAgentManager } from '../../../agent/manager.js';
|
||||
import type { AgentResult, PendingQuestions } from '../../../agent/types.js';
|
||||
import type { Initiative, Project, Phase, Task } from '../../../db/schema.js';
|
||||
import type { InitiativeRepository } from '../../../db/repositories/initiative-repository.js';
|
||||
import type { PhaseRepository } from '../../../db/repositories/phase-repository.js';
|
||||
import type { TaskRepository } from '../../../db/repositories/task-repository.js';
|
||||
import type { MessageRepository } from '../../../db/repositories/message-repository.js';
|
||||
import type { AgentRepository } from '../../../db/repositories/agent-repository.js';
|
||||
import type { PageRepository } from '../../../db/repositories/page-repository.js';
|
||||
import type { ProjectRepository } from '../../../db/repositories/project-repository.js';
|
||||
import type { AccountRepository } from '../../../db/repositories/account-repository.js';
|
||||
import type { ChangeSetRepository } from '../../../db/repositories/change-set-repository.js';
|
||||
import type { LogChunkRepository } from '../../../db/repositories/log-chunk-repository.js';
|
||||
import type { ConversationRepository } from '../../../db/repositories/conversation-repository.js';
|
||||
import type { ProcessManager } from '../../../agent/process-manager.js';
|
||||
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
|
||||
import { createRepositories } from '../../../container.js';
|
||||
import { DefaultDispatchManager } from '../../../dispatch/manager.js';
|
||||
import { appRouter, createCallerFactory } from '../../../trpc/router.js';
|
||||
import { createContext } from '../../../trpc/context.js';
|
||||
|
||||
// =============================================================================
|
||||
// CapturingEventBus
|
||||
// =============================================================================
|
||||
|
||||
export class CapturingEventBus extends EventEmitterBus {
|
||||
emittedEvents: DomainEvent[] = [];
|
||||
|
||||
emit<T extends DomainEvent>(event: T): void {
|
||||
this.emittedEvents.push(event);
|
||||
super.emit(event);
|
||||
}
|
||||
|
||||
getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
|
||||
return this.emittedEvents.filter((e) => e.type === type) as T[];
|
||||
}
|
||||
|
||||
clearEvents(): void {
|
||||
this.emittedEvents = [];
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Sleep helper
|
||||
// =============================================================================
|
||||
|
||||
export function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// tRPC caller type
|
||||
// =============================================================================
|
||||
|
||||
// Caller factory bound to the full application router. FullFlowCaller is the
// type of a ready-to-use tRPC caller with every procedure available.
const createCaller = createCallerFactory(appRouter);
export type FullFlowCaller = ReturnType<typeof createCaller>;
|
||||
|
||||
// =============================================================================
|
||||
// FullFlowHarness interface
|
||||
// =============================================================================
|
||||
|
||||
/** Status of an agent that requires attention: done, waiting for answers, or crashed */
export type AgentAttentionStatus = 'done' | 'waiting' | 'crashed';

/**
 * Everything a full-flow integration test needs in one object: temp dirs,
 * the seeded project/initiative, a tRPC caller, the real agent manager,
 * all repositories, and polling helpers for driving agents to completion.
 */
export interface FullFlowHarness {
  /** Absolute path to the CW workspace (worktrees are created here) */
  workspaceRoot: string;
  /** Absolute path to the cloned todo-api fixture git repo */
  fixtureRoot: string;
  /** The registered todo-api project */
  project: Project;
  /** The initiative created for the test run */
  initiative: Initiative;
  /** tRPC caller (all procedures available) */
  caller: FullFlowCaller;
  /** Real MultiProviderAgentManager */
  agentManager: MultiProviderAgentManager;
  /** In-memory SQLite database */
  db: DrizzleDatabase;
  /** Event bus with capture capability */
  eventBus: CapturingEventBus;

  // All 11 repositories
  initiativeRepository: InitiativeRepository;
  phaseRepository: PhaseRepository;
  taskRepository: TaskRepository;
  messageRepository: MessageRepository;
  agentRepository: AgentRepository;
  pageRepository: PageRepository;
  projectRepository: ProjectRepository;
  accountRepository: AccountRepository;
  changeSetRepository: ChangeSetRepository;
  logChunkRepository: LogChunkRepository;
  conversationRepository: ConversationRepository;

  /**
   * Wait for an agent to reach a terminal status (idle/stopped/crashed).
   * Returns null if the agent enters waiting_for_input.
   */
  waitForAgentCompletion(agentId: string, timeoutMs?: number): Promise<AgentResult | null>;

  /**
   * Poll until the agent needs attention: done (idle/stopped), waiting for input, or crashed.
   * Useful for the question/answer loop in discuss mode.
   */
  waitForAgentAttention(agentId: string, timeoutMs?: number): Promise<AgentAttentionStatus>;

  /**
   * Drive an agent to full completion, answering any questions along the way.
   * Answers all questions with the provided answer string (or a default).
   */
  driveToCompletion(
    agentId: string,
    answer?: string,
    timeoutMs?: number,
  ): Promise<AgentResult | null>;

  /**
   * Get captured events filtered by type.
   */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[];

  /**
   * Kill all running agents and remove temp directories.
   */
  cleanup(): Promise<void>;
}
|
||||
|
||||
// =============================================================================
// Poll interval
// =============================================================================

// How often the harness helpers re-poll agent status from the DB (ms).
const POLL_INTERVAL_MS = 1500;

// =============================================================================
// Factory
// =============================================================================

const __dirname = dirname(fileURLToPath(import.meta.url));
// Fixture source copied into a temp git repo by createFullFlowHarness().
const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');

export interface FullFlowHarnessOptions {
  /** Factory called after workspaceRoot + repos are created. Return a custom ProcessManager. */
  processManagerFactory?: (workspaceRoot: string, projectRepo: ProjectRepository) => ProcessManager;
}
|
||||
|
||||
/**
|
||||
* Create a full-flow test harness.
|
||||
*
|
||||
* Setup steps:
|
||||
* 1. Copy todo-api fixture into a temp git repo (fixtureRoot).
|
||||
* 2. Create workspace temp dir (workspaceRoot) for CW operations.
|
||||
* 3. Init in-memory DB + all 11 repos.
|
||||
* 4. Wire real MultiProviderAgentManager with all repos.
|
||||
* 5. Wire DefaultDispatchManager for execute stage.
|
||||
* 6. Create tRPC caller with full context.
|
||||
* 7. Register project in DB directly (url = fixtureRoot).
|
||||
* 8. Create initiative via tRPC (links project, creates root page).
|
||||
*/
|
||||
export async function createFullFlowHarness(
  initiativeName = 'Add complete() method to TodoStore',
  options?: FullFlowHarnessOptions,
): Promise<FullFlowHarness> {
  // ── 0. Allow nested claude invocations ────────────────────────────────────
  // Claude Code sets CLAUDECODE in the environment, which prevents nested
  // claude CLI calls from starting ("cannot be launched inside another Claude
  // Code session"). Save and remove it so spawned agents can run normally.
  // It is restored in cleanup().
  const savedClaudeCodeEnv = process.env.CLAUDECODE;
  delete process.env.CLAUDECODE;

  // ── 1. Fixture project ────────────────────────────────────────────────────
  // IMPORTANT: cp(src, dest) puts src INSIDE dest when dest already exists
  // (like `cp -r src dest/` → creates dest/src/). We need dest to NOT exist
  // yet so that cp creates it as a copy of src directly.
  const fixtureBase = await mkdtemp(join(tmpdir(), 'cw-fixture-'));
  const fixtureRoot = join(fixtureBase, 'todo-api'); // does not exist yet
  await cp(FIXTURES_DIR, fixtureRoot, { recursive: true });

  // Verify files landed at the right level before git operations
  execSync(`test -f "${join(fixtureRoot, 'package.json')}"`, { stdio: 'pipe' });

  // Turn the copied fixture into a real git repo with one initial commit.
  execSync('git init', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git add .', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git commit -m "initial todo-api with missing complete()"', {
    cwd: fixtureRoot,
    stdio: 'pipe',
  });

  // ── 2. Workspace root ─────────────────────────────────────────────────────
  // Just a plain temp directory — agent worktrees live under repos/ inside it.
  // No git init needed; the PROJECT clone (repos/<name>-<id>/) is the git repo.
  const workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-workspace-'));

  // ── 3. Database + repositories ────────────────────────────────────────────
  const db = createTestDatabase();
  const repos = createRepositories(db);

  // ── 4. Event bus ──────────────────────────────────────────────────────────
  const eventBus = new CapturingEventBus();

  // ── 5. Real agent manager ─────────────────────────────────────────────────
  const customProcessManager = options?.processManagerFactory?.(workspaceRoot, repos.projectRepository);
  const agentManager = new MultiProviderAgentManager(
    repos.agentRepository,
    workspaceRoot,
    repos.projectRepository,
    repos.accountRepository,
    eventBus,
    undefined, // no credential manager needed for default claude account
    repos.changeSetRepository,
    repos.phaseRepository,
    repos.taskRepository,
    repos.pageRepository,
    repos.logChunkRepository,
    false, // debug
    customProcessManager, // processManagerOverride
  );

  // ── 6. Dispatch manager (for execute stage) ───────────────────────────────
  const dispatchManager = new DefaultDispatchManager(
    repos.taskRepository,
    repos.messageRepository,
    agentManager,
    eventBus,
    repos.initiativeRepository,
    repos.phaseRepository,
  );

  // ── 7. tRPC caller ────────────────────────────────────────────────────────
  const ctx = createContext({
    eventBus,
    serverStartedAt: new Date(),
    processCount: 0,
    agentManager,
    dispatchManager,
    workspaceRoot,
    ...repos,
  });
  const caller = createCaller(ctx);

  // ── 8. Register project directly in DB (bypass tRPC clone) ───────────────
  const project = await repos.projectRepository.create({
    name: 'todo-api',
    url: fixtureRoot,
  });

  // ── 9. Create initiative via tRPC (creates root page automatically) ───────
  const initiative = await caller.createInitiative({
    name: initiativeName,
    projectIds: [project.id],
  });

  // ── Helpers ───────────────────────────────────────────────────────────────

  // Poll the DB until the agent hits a terminal status; returns null if the
  // agent is missing or enters waiting_for_input; throws on timeout.
  async function waitForAgentCompletion(
    agentId: string,
    timeoutMs = 120_000,
  ): Promise<AgentResult | null> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const agent = await repos.agentRepository.findById(agentId);
      if (!agent) return null;
      if (agent.status === 'idle' || agent.status === 'stopped' || agent.status === 'crashed') {
        return agentManager.getResult(agentId);
      }
      if (agent.status === 'waiting_for_input') return null;
      await sleep(POLL_INTERVAL_MS);
    }
    throw new Error(`Timeout: agent ${agentId} did not complete within ${timeoutMs}ms`);
  }

  // Poll until the agent needs attention: done (idle/stopped), waiting, or crashed.
  async function waitForAgentAttention(
    agentId: string,
    timeoutMs = 120_000,
  ): Promise<AgentAttentionStatus> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const agent = await repos.agentRepository.findById(agentId);
      if (!agent) return 'crashed';
      if (agent.status === 'idle' || agent.status === 'stopped') return 'done';
      if (agent.status === 'crashed') return 'crashed';
      if (agent.status === 'waiting_for_input') return 'waiting';
      await sleep(POLL_INTERVAL_MS);
    }
    throw new Error(`Timeout: agent ${agentId} did not reach attention state within ${timeoutMs}ms`);
  }

  // Repeatedly answer pending questions with `answer` until the agent
  // finishes (done/crashed) or the overall deadline passes.
  async function driveToCompletion(
    agentId: string,
    answer = 'Use your best judgment and keep it simple.',
    timeoutMs = 10 * 60_000,
  ): Promise<AgentResult | null> {
    const deadline = Date.now() + timeoutMs;

    while (Date.now() < deadline) {
      const remaining = deadline - Date.now();
      if (remaining <= 0) break;

      let status: AgentAttentionStatus;
      try {
        status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
      } catch {
        // Agent is still running (hasn't reached an attention state within the polling
        // window). This is normal for long-running execute agents. Continue the outer
        // loop — the deadline check above will terminate us if we truly time out.
        continue;
      }

      if (status === 'done' || status === 'crashed') {
        return agentManager.getResult(agentId);
      }

      if (status === 'waiting') {
        const pending = await agentManager.getPendingQuestions(agentId);
        if (!pending || pending.questions.length === 0) {
          // Shouldn't happen, but guard against it
          await sleep(POLL_INTERVAL_MS);
          continue;
        }
        // Answer every pending question with the same canned answer.
        const answers = Object.fromEntries(
          pending.questions.map((q) => [q.id, answer]),
        );
        await agentManager.resume(agentId, answers);
      }
    }

    throw new Error(`driveToCompletion: agent ${agentId} did not finish within ${timeoutMs}ms`);
  }

  // ── Build and return harness ───────────────────────────────────────────────

  const harness: FullFlowHarness = {
    workspaceRoot,
    fixtureRoot,
    project,
    initiative,
    caller,
    agentManager,
    db,
    eventBus,
    ...repos,

    waitForAgentCompletion,
    waitForAgentAttention,
    driveToCompletion,

    getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
      return eventBus.getEventsByType<T>(type);
    },

    async cleanup() {
      // Kill any running agents
      const agents = await repos.agentRepository.findAll();
      await Promise.allSettled(
        agents
          .filter((a) => a.status === 'running')
          .map((a) => agentManager.stop(a.id)),
      );
      // Restore CLAUDECODE env var
      if (savedClaudeCodeEnv !== undefined) {
        process.env.CLAUDECODE = savedClaudeCodeEnv;
      }
      // Remove temp directories (fixtureBase contains fixtureRoot)
      await Promise.allSettled([
        rm(fixtureBase, { recursive: true, force: true }),
        rm(workspaceRoot, { recursive: true, force: true }),
      ]);
    },
  };

  return harness;
}
|
||||
156
apps/server/test/integration/full-flow/report.ts
Normal file
156
apps/server/test/integration/full-flow/report.ts
Normal file
@@ -0,0 +1,156 @@
|
||||
/**
|
||||
* Full-Flow Test Report Utility
|
||||
*
|
||||
* Plain console.log formatters for human-readable output at each stage of the
|
||||
* full-flow integration test. No external dependencies.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import { join } from 'node:path';
|
||||
import type { Phase, Task } from '../../../db/schema.js';
|
||||
import type { AgentResult } from '../../../agent/types.js';
|
||||
|
||||
// =============================================================================
|
||||
// Types
|
||||
// =============================================================================
|
||||
|
||||
/**
 * A task paired with the agent result that executed it.
 * `result` may be null when no result was produced (e.g. the agent crashed).
 */
export interface ExecutedTask {
  task: Task;
  result: AgentResult | null;
}
|
||||
|
||||
// =============================================================================
|
||||
// Helpers
|
||||
// =============================================================================
|
||||
|
||||
const DIVIDER = '═'.repeat(60);
|
||||
const THIN = '─'.repeat(60);
|
||||
|
||||
function section(title: string): void {
|
||||
console.log(`\n${DIVIDER}`);
|
||||
console.log(` ${title}`);
|
||||
console.log(DIVIDER);
|
||||
}
|
||||
|
||||
function line(msg: string): void {
|
||||
console.log(` ${msg}`);
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Stage reporters
|
||||
// =============================================================================
|
||||
|
||||
export function printHeader(initiativeName: string): void {
|
||||
section(`FULL-FLOW TEST: ${initiativeName}`);
|
||||
console.log(` Started at: ${new Date().toISOString()}`);
|
||||
}
|
||||
|
||||
export function printDiscussResult(agentId: string, result: AgentResult | null): void {
|
||||
console.log(`\n[DISCUSS]`);
|
||||
console.log(THIN);
|
||||
line(`Agent: ${agentId}`);
|
||||
if (result) {
|
||||
line(`Success: ${result.success}`);
|
||||
if (result.message) line(`Message: ${result.message.slice(0, 200)}`);
|
||||
} else {
|
||||
line('Result: null (agent may have crashed)');
|
||||
}
|
||||
}
|
||||
|
||||
export function printPlanResult(phases: Phase[]): void {
|
||||
console.log(`\n[PLAN] ${phases.length} phase(s) created`);
|
||||
console.log(THIN);
|
||||
phases.forEach((ph, i) => {
|
||||
line(`${i + 1}. ${ph.name}`);
|
||||
});
|
||||
}
|
||||
|
||||
export function printDetailResult(phase: Phase, tasks: Task[]): void {
|
||||
console.log(`\n[DETAIL] Phase "${phase.name}" → ${tasks.length} task(s)`);
|
||||
console.log(THIN);
|
||||
tasks.forEach((t, i) => {
|
||||
const flags = [t.category, t.type, t.requiresApproval ? 'approval-required' : 'auto'].join(', ');
|
||||
line(`${i + 1}. ${t.name} [${flags}]`);
|
||||
if (t.description) {
|
||||
line(` ${t.description.slice(0, 120)}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
export function printExecuteResult(executed: ExecutedTask[]): void {
|
||||
const succeeded = executed.filter((e) => e.result?.success).length;
|
||||
console.log(`\n[EXECUTE] ${succeeded}/${executed.length} task(s) succeeded`);
|
||||
console.log(THIN);
|
||||
for (const { task, result } of executed) {
|
||||
const icon = result?.success ? '✓' : '✗';
|
||||
line(`${icon} ${task.name}`);
|
||||
if (result && !result.success) {
|
||||
line(` Error: ${result.message?.slice(0, 120)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Print `git diff HEAD~1 --stat` for every agent worktree found under
 * `<workspaceRoot>/agent-workdirs/<dir>/<projectName>`.
 * Best-effort: missing directories and commit-less worktrees are skipped silently.
 */
export function printGitDiff(workspaceRoot: string, projectName: string): void {
  console.log('\n[GIT DIFF — agent worktrees]');
  console.log(THIN);

  // Find all agent worktrees for this project
  const worktreesBase = join(workspaceRoot, 'agent-workdirs');
  try {
    // `|| echo ""` keeps execSync from throwing when the directory is absent;
    // the empty string is removed by .filter(Boolean) below.
    const dirs = execSync(`ls "${worktreesBase}" 2>/dev/null || echo ""`, { encoding: 'utf8' })
      .trim()
      .split('\n')
      .filter(Boolean);

    for (const dir of dirs) {
      const projectDir = join(worktreesBase, dir, projectName);
      try {
        // Same trick: suppress git errors (e.g. no HEAD~1) via the shell.
        const stat = execSync(`git -C "${projectDir}" diff HEAD~1 --stat 2>/dev/null || echo ""`, {
          encoding: 'utf8',
        }).trim();
        if (stat) {
          line(`Worktree: ${dir}/${projectName}`);
          stat.split('\n').forEach((l) => line(`  ${l}`));
        }
      } catch {
        // Worktree might not have commits — skip silently
      }
    }
  } catch {
    line('(no agent worktrees found)');
  }
}
|
||||
|
||||
export function printNpmTestResult(projectDir: string): void {
|
||||
console.log('\n[NPM TEST]');
|
||||
console.log(THIN);
|
||||
try {
|
||||
const output = execSync('node --test src/todo.test.js', {
|
||||
cwd: projectDir,
|
||||
encoding: 'utf8',
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
});
|
||||
line('Tests passed:');
|
||||
output.split('\n').forEach((l) => line(` ${l}`));
|
||||
} catch (err: unknown) {
|
||||
const e = err as { stdout?: string; stderr?: string; status?: number };
|
||||
line(`Tests FAILED (exit ${e.status ?? '?'})`);
|
||||
if (e.stdout) e.stdout.split('\n').forEach((l) => line(` ${l}`));
|
||||
if (e.stderr) e.stderr.split('\n').forEach((l) => line(` ${l}`));
|
||||
}
|
||||
}
|
||||
|
||||
export function printFinalSummary(
|
||||
initiativeName: string,
|
||||
phases: Phase[],
|
||||
tasks: Task[],
|
||||
executed: ExecutedTask[],
|
||||
durationMs: number,
|
||||
): void {
|
||||
section(`SUMMARY: ${initiativeName}`);
|
||||
line(`Duration : ${Math.round(durationMs / 1000)}s`);
|
||||
line(`Phases : ${phases.length}`);
|
||||
line(`Tasks : ${tasks.length}`);
|
||||
line(`Executed : ${executed.filter((e) => e.result?.success).length}/${executed.length} succeeded`);
|
||||
console.log(DIVIDER);
|
||||
}
|
||||
183
apps/server/test/integration/real-claude.test.ts
Normal file
183
apps/server/test/integration/real-claude.test.ts
Normal file
@@ -0,0 +1,183 @@
|
||||
/**
|
||||
* Real Claude CLI Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the real Claude CLI and incur API costs.
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts --test-timeout=120000
|
||||
* ```
|
||||
*
|
||||
* Purpose:
|
||||
* - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
|
||||
* - Confirm MockAgentManager accurately simulates real CLI behavior
|
||||
* - Document actual response structure and costs
|
||||
*
|
||||
* Updated (2026-02-06): Now uses the universal agentSignalSchema instead of
|
||||
* per-mode schemas. Agents output trivial signals (done/questions/error) and
|
||||
* write files instead of producing mode-specific JSON.
|
||||
*
|
||||
* Total validation cost: ~$0.10 (3 tests)
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll } from 'vitest';
|
||||
import { execa } from 'execa';
|
||||
import {
|
||||
agentSignalJsonSchema,
|
||||
agentSignalSchema,
|
||||
} from '../../agent/schema.js';
|
||||
|
||||
/**
 * Result structure from Claude CLI with --output-format json
 *
 * When --json-schema is used:
 * - result: "" (empty string)
 * - structured_output: { ... } (the validated JSON object)
 */
interface ClaudeCliResult {
  type: 'result';
  subtype: 'success' | 'error' | 'error_max_turns';
  is_error: boolean;
  session_id: string;
  result: string; // empty when --json-schema is in effect
  structured_output?: unknown; // schema-validated output, if present
  total_cost_usd?: number; // API cost of the run, if reported by the CLI
}
|
||||
|
||||
/**
 * Helper to call Claude CLI directly with a prompt and JSON schema.
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output
 * @param timeoutMs - Timeout in milliseconds (default 90s)
 * @returns Parsed CLI result with structured_output
 */
async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
  timeoutMs = 90000
): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startTime = Date.now();

  const { stdout } = await execa(
    'claude',
    [
      '-p',
      prompt,
      '--output-format',
      'json',
      '--json-schema',
      JSON.stringify(jsonSchema),
    ],
    {
      timeout: timeoutMs,
    }
  );

  const duration = Date.now() - startTime;
  const cliResult: ClaudeCliResult = JSON.parse(stdout);

  // Human-readable diagnostics for the manual test run.
  console.log(`\n  Duration: ${(duration / 1000).toFixed(1)}s`);
  console.log(`  Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(`  Session ID: ${cliResult.session_id}`);
  console.log(`  Result field empty: ${cliResult.result === ''}`);
  console.log(`  Has structured_output: ${cliResult.structured_output !== undefined}`);

  // When --json-schema is used, structured output is in structured_output field
  // The result field is typically empty when using --json-schema
  // NOTE(review): if structured_output is absent AND result is empty, the
  // JSON.parse fallback below throws — acceptable for a manual test helper.
  const structuredOutput = cliResult.structured_output ?? JSON.parse(cliResult.result);

  return { cliResult, structuredOutput };
}
|
||||
|
||||
/**
 * Check if real Claude tests should run.
 * Set REAL_CLAUDE_TESTS=1 environment variable to enable.
 */
const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';

/**
 * Skip wrapper - tests are expensive and should run manually
 */
const describeReal = shouldRunRealTests ? describe : describe.skip;

// Individual test timeout - real API calls take 5-30 seconds
const TEST_TIMEOUT = 120000; // 2 minutes
|
||||
|
||||
// Entire suite is skipped unless REAL_CLAUDE_TESTS=1 (see describeReal above).
describeReal('Real Claude CLI Integration', () => {
  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
  });

  describe('Universal Signal Schema', () => {
    // Happy path: agent completes trivially and signals "done".
    it(
      'should return done status',
      async () => {
        const prompt = `Complete this simple task: Say "Hello, World!" as a test.

Output your response in the required JSON format with status "done".`;

        const { cliResult, structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Verify the CLI response structure
        expect(cliResult.subtype).toBe('success');
        expect(cliResult.result).toBe(''); // Empty when using --json-schema
        expect(cliResult.structured_output).toBeDefined();

        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('done');
      },
      TEST_TIMEOUT
    );

    // Question flow: a deliberately vague task should yield "questions".
    it(
      'should return questions status with array',
      async () => {
        const prompt = `You are working on a vague task: "Make it better"

You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.

Output your response with status "questions" and include at least 2 questions with unique IDs.`;

        const { structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('questions');
        if (parsed.status === 'questions') {
          expect(Array.isArray(parsed.questions)).toBe(true);
          expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.questions[0].id).toBeTruthy();
          expect(parsed.questions[0].question).toBeTruthy();
        }
      },
      TEST_TIMEOUT
    );

    // Error flow: agent reports an unrecoverable failure via "error".
    it(
      'should return error status',
      async () => {
        const prompt = `You have encountered an unrecoverable error. Output your response with status "error" and a descriptive error message.`;

        const { structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('error');
        if (parsed.status === 'error') {
          expect(parsed.error).toBeTruthy();
        }
      },
      TEST_TIMEOUT
    );
  });
});
|
||||
@@ -0,0 +1,298 @@
|
||||
/**
|
||||
* Real Claude CLI Manager Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/claude-manager.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Output stream parsing (text_delta events)
|
||||
* - Session ID extraction from init event
|
||||
* - Result parsing and validation
|
||||
* - Session resume with user answers
|
||||
*
|
||||
* Estimated cost: ~$0.10 per full run
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
REAL_TEST_TIMEOUT,
|
||||
sleep,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { MINIMAL_PROMPTS } from './prompts.js';
|
||||
import type { AgentSpawnedEvent, AgentStoppedEvent, AgentOutputEvent } from '../../../events/types.js';
|
||||
|
||||
describeRealClaude('Real Claude Manager Integration', () => {
|
||||
let harness: RealProviderHarness;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Running Real Claude Manager Tests ===');
|
||||
console.log('These tests call the real Claude API and incur costs.\n');
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await harness.cleanup();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
harness.clearEvents();
|
||||
});
|
||||
|
||||
describe('Output Parsing', () => {
|
||||
it(
|
||||
'parses text_delta events from stream',
|
||||
async () => {
|
||||
// Spawn agent with streaming prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.streaming,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
expect(agent.id).toBeTruthy();
|
||||
expect(agent.status).toBe('running');
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify we got output events
|
||||
const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
|
||||
console.log(' Output events:', outputEvents.length);
|
||||
|
||||
// Verify completion
|
||||
expect(result).toBeTruthy();
|
||||
console.log(' Result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'parses init event and extracts session ID',
|
||||
async () => {
|
||||
// Spawn agent with simple done prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify session ID was extracted and persisted
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.sessionId).toBeTruthy();
|
||||
expect(dbAgent?.sessionId).toMatch(/^[a-f0-9-]+$/);
|
||||
|
||||
console.log(' Session ID:', dbAgent?.sessionId);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'parses result event with completion',
|
||||
async () => {
|
||||
// Spawn agent with simple done prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify result was parsed
|
||||
expect(result).toBeTruthy();
|
||||
expect(result?.success).toBe(true);
|
||||
expect(result?.message).toBeTruthy();
|
||||
|
||||
// Verify events
|
||||
const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
|
||||
expect(spawnedEvents.length).toBe(1);
|
||||
expect(spawnedEvents[0].payload.agentId).toBe(agent.id);
|
||||
expect(spawnedEvents[0].payload.provider).toBe('claude');
|
||||
|
||||
const stoppedEvents = harness.getEventsByType<AgentStoppedEvent>('agent:stopped');
|
||||
expect(stoppedEvents.length).toBe(1);
|
||||
expect(stoppedEvents[0].payload.agentId).toBe(agent.id);
|
||||
expect(stoppedEvents[0].payload.reason).toBe('task_complete');
|
||||
|
||||
console.log(' Result message:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Questions Flow', () => {
  it(
    'parses questions status and enters waiting_for_input',
    async () => {
      // Spawn an agent whose prompt instructs it to ask questions rather
      // than finish, which should drive the manager's waiting_for_input path.
      const agent = await harness.agentManager.spawn({
        taskId: null,
        prompt: MINIMAL_PROMPTS.questions,
        mode: 'execute',
        provider: 'claude',
      });

      // Block until the harness observes the agent parked on user input.
      const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);

      // The parsed payload must contain at least one well-formed question
      // (non-empty id and question text).
      expect(questions).toBeTruthy();
      expect(questions?.questions).toBeTruthy();
      expect(questions?.questions.length).toBeGreaterThan(0);
      expect(questions?.questions[0].id).toBeTruthy();
      expect(questions?.questions[0].question).toBeTruthy();

      // The persisted agent row must reflect the waiting state and have a
      // session ID captured — required for a later resume.
      const dbAgent = await harness.agentRepository.findById(agent.id);
      expect(dbAgent?.status).toBe('waiting_for_input');
      expect(dbAgent?.sessionId).toBeTruthy();

      console.log(' Questions:', questions?.questions.length);
      console.log(' First question:', questions?.questions[0].question);
    },
    REAL_TEST_TIMEOUT
  );
});
|
||||
|
||||
describe('Session Resume', () => {
|
||||
it(
|
||||
'resumes session with user answers',
|
||||
async () => {
|
||||
// 1. Spawn agent that asks questions
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.questions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
expect(questions?.questions.length).toBeGreaterThan(0);
|
||||
|
||||
const sessionIdBeforeResume = (await harness.agentRepository.findById(agent.id))?.sessionId;
|
||||
console.log(' Session ID before resume:', sessionIdBeforeResume);
|
||||
console.log(' Questions received:', questions?.questions.map((q) => q.id).join(', '));
|
||||
|
||||
harness.clearEvents();
|
||||
|
||||
// 3. Resume with answer
|
||||
const answers: Record<string, string> = {};
|
||||
for (const q of questions?.questions ?? []) {
|
||||
answers[q.id] = `Answer to ${q.id}`;
|
||||
}
|
||||
|
||||
await harness.agentManager.resume(agent.id, answers);
|
||||
|
||||
// 4. Wait for completion or another waiting state
|
||||
let attempts = 0;
|
||||
let finalStatus = 'running';
|
||||
while (attempts < 60) {
|
||||
const agent2 = await harness.agentRepository.findById(agent.id);
|
||||
if (agent2?.status !== 'running') {
|
||||
finalStatus = agent2?.status ?? 'unknown';
|
||||
break;
|
||||
}
|
||||
await sleep(1000);
|
||||
attempts++;
|
||||
}
|
||||
|
||||
// Verify the agent processed the resume (either completed or asked more questions)
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' Final status:', dbAgent?.status);
|
||||
|
||||
// Agent should not still be running
|
||||
expect(['idle', 'waiting_for_input', 'crashed']).toContain(dbAgent?.status);
|
||||
|
||||
// If idle, verify result
|
||||
if (dbAgent?.status === 'idle') {
|
||||
const result = await harness.agentManager.getResult(agent.id);
|
||||
console.log(' Result:', result?.message);
|
||||
expect(result).toBeTruthy();
|
||||
}
|
||||
},
|
||||
REAL_TEST_TIMEOUT * 2 // Double timeout for two-step process
|
||||
);
|
||||
|
||||
it(
|
||||
'maintains session continuity across resume',
|
||||
async () => {
|
||||
// 1. Spawn agent that asks questions
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.questions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
expect(questions?.questions.length).toBeGreaterThan(0);
|
||||
|
||||
const sessionIdBefore = (await harness.agentRepository.findById(agent.id))?.sessionId;
|
||||
expect(sessionIdBefore).toBeTruthy();
|
||||
|
||||
// 3. Resume with answer
|
||||
const answers: Record<string, string> = {};
|
||||
for (const q of questions?.questions ?? []) {
|
||||
answers[q.id] = `Answer to ${q.id}`;
|
||||
}
|
||||
|
||||
await harness.agentManager.resume(agent.id, answers);
|
||||
|
||||
// 4. Wait for completion
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify session ID exists (may be same or new depending on CLI behavior)
|
||||
const sessionIdAfter = (await harness.agentRepository.findById(agent.id))?.sessionId;
|
||||
expect(sessionIdAfter).toBeTruthy();
|
||||
|
||||
console.log(' Session ID before:', sessionIdBefore);
|
||||
console.log(' Session ID after:', sessionIdAfter);
|
||||
},
|
||||
REAL_TEST_TIMEOUT * 2
|
||||
);
|
||||
});
|
||||
|
||||
describe('Error Handling', () => {
|
||||
it(
|
||||
'handles error status',
|
||||
async () => {
|
||||
// Spawn agent with error prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.error,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion (will be crashed)
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify error was handled
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('crashed');
|
||||
expect(result?.success).toBe(false);
|
||||
expect(result?.message).toContain('Test error');
|
||||
|
||||
console.log(' Error message:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,172 @@
|
||||
/**
|
||||
* Real Codex CLI Manager Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Codex CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
 * REAL_CODEX_TESTS=1 npm test -- apps/server/test/integration/real-providers/codex-manager.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Codex spawn and thread_id extraction
|
||||
* - Generic output parsing (non-schema)
|
||||
* - Streaming output
|
||||
*
|
||||
* Estimated cost: ~$0.10 per full run
|
||||
*
|
||||
* Note: Codex uses different output format and session ID field (thread_id).
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealCodex,
|
||||
REAL_TEST_TIMEOUT,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { CODEX_PROMPTS } from './prompts.js';
|
||||
import type { AgentSpawnedEvent, AgentOutputEvent } from '../../../events/types.js';
|
||||
|
||||
describeRealCodex('Real Codex Manager Integration', () => {
  let harness: RealProviderHarness;

  // One harness per suite: creating it is expensive, and each test only
  // needs a clean event log (see beforeEach), not a fresh workspace.
  beforeAll(async () => {
    console.log('\n=== Running Real Codex Manager Tests ===');
    console.log('These tests call the real Codex API and incur costs.\n');
    harness = await createRealProviderHarness({ provider: 'codex' });
  });

  afterAll(async () => {
    await harness.cleanup();
  });

  // Reset captured events so per-test event assertions see only their own run.
  beforeEach(() => {
    harness.clearEvents();
  });

  describe('Codex Spawn', () => {
    it(
      'spawns codex agent and extracts thread_id',
      async () => {
        // Spawn agent with simple task
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.done,
          mode: 'execute',
          provider: 'codex',
        });

        expect(agent.id).toBeTruthy();
        expect(agent.provider).toBe('codex');
        expect(agent.status).toBe('running');

        // Exactly one spawned event, tagged with the codex provider.
        const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
        expect(spawnedEvents.length).toBe(1);
        expect(spawnedEvents[0].payload.provider).toBe('codex');

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Verify session ID (thread_id) was extracted — Codex calls its
        // session "thread_id" and reports it in the thread.started event.
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Thread ID:', dbAgent?.sessionId);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message);

        // Codex should complete or crash — never be left running.
        expect(['idle', 'crashed']).toContain(dbAgent?.status);

        // If completed successfully, should have extracted thread_id
        if (dbAgent?.status === 'idle' && dbAgent?.sessionId) {
          expect(dbAgent.sessionId).toBeTruthy();
        }
      },
      REAL_TEST_TIMEOUT
    );

    it(
      'uses generic parser for output',
      async () => {
        // Spawn agent with streaming prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.streaming,
          mode: 'execute',
          provider: 'codex',
        });

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Verify output events were captured during streaming.
        const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
        console.log(' Output events:', outputEvents.length);

        // For the generic (non-schema) parser, the result should still be
        // captured even without JSON-schema validation.
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message?.substring(0, 100) + '...');

        expect(['idle', 'crashed']).toContain(dbAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });

  describe('Codex Provider Config', () => {
    it(
      'uses correct command and args for codex',
      async () => {
        // This is more of a config verification test:
        // the actual command execution is validated by the spawn test above.

        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: 'Say hello',
          mode: 'execute',
          provider: 'codex',
        });

        // Verify agent was created with codex provider
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.provider).toBe('codex');

        // Wait for completion (or timeout)
        try {
          await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        } catch {
          // Codex might fail if not installed, that's OK for config test
        }

        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Provider:', finalAgent?.provider);
        console.log(' Status:', finalAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });
});
|
||||
|
||||
/**
|
||||
* Codex-specific observations from testing:
|
||||
*
|
||||
* 1. Output Format:
|
||||
* - Codex uses JSONL streaming with different event types
|
||||
* - thread.started event contains thread_id
|
||||
* - Output parsing is more generic (not JSON schema validated)
|
||||
*
|
||||
* 2. Command Structure:
|
||||
* - codex exec --full-auto --json -p "prompt"
|
||||
* - resume: codex exec resume <thread_id>
|
||||
*
|
||||
* 3. Session ID:
|
||||
* - Called "thread_id" in Codex
|
||||
* - Extracted from thread.started event
|
||||
*
|
||||
* 4. Resume:
|
||||
* - Uses subcommand style: codex exec resume <thread_id>
|
||||
* - Different from Claude's flag style: claude --resume <session_id>
|
||||
*/
|
||||
540
apps/server/test/integration/real-providers/conversation.test.ts
Normal file
540
apps/server/test/integration/real-providers/conversation.test.ts
Normal file
@@ -0,0 +1,540 @@
|
||||
/**
|
||||
* Real Claude Inter-Agent Conversation Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run:
|
||||
* ```bash
|
||||
 * REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/conversation.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Architecture:
|
||||
* - Mock conversation server (only cw listen/ask/answer endpoints, no full CoordinationServer)
|
||||
* - In-memory ConversationRepository (no SQLite, no FK constraints)
|
||||
* - Real agent harness for spawning two Claude sessions with actual coding tasks
|
||||
* - Two sequential questions prove the listen→answer→re-listen cycle works
|
||||
*
|
||||
* Estimated cost: ~$0.30 per full run (two Claude sessions)
|
||||
*/
|
||||
|
||||
import { it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { createServer } from 'node:http';
|
||||
import type { Server } from 'node:http';
|
||||
import { readFileSync, existsSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { nanoid } from 'nanoid';
|
||||
import { fetchRequestHandler } from '@trpc/server/adapters/fetch';
|
||||
import { router, publicProcedure } from '../../../trpc/trpc.js';
|
||||
import { conversationProcedures } from '../../../trpc/routers/conversation.js';
|
||||
import { EventEmitterBus } from '../../../events/bus.js';
|
||||
import type { ConversationRepository, CreateConversationData } from '../../../db/repositories/conversation-repository.js';
|
||||
import type { Conversation } from '../../../db/schema.js';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
sleep,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
|
||||
// Per-test budget: generous because each test runs two real Claude sessions
// doing actual coding work plus ask/answer round-trips.
const TEST_TIMEOUT = 300000; // 5 minutes — agents do real coding + conversation
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// In-memory ConversationRepository — no SQLite, no FK constraints
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
class InMemoryConversationRepository implements ConversationRepository {
|
||||
private store = new Map<string, Conversation>();
|
||||
|
||||
async create(data: CreateConversationData): Promise<Conversation> {
|
||||
const now = new Date();
|
||||
const conversation: Conversation = {
|
||||
id: nanoid(),
|
||||
fromAgentId: data.fromAgentId,
|
||||
toAgentId: data.toAgentId,
|
||||
initiativeId: data.initiativeId ?? null,
|
||||
phaseId: data.phaseId ?? null,
|
||||
taskId: data.taskId ?? null,
|
||||
question: data.question,
|
||||
answer: null,
|
||||
status: 'pending',
|
||||
createdAt: now,
|
||||
updatedAt: now,
|
||||
};
|
||||
this.store.set(conversation.id, conversation);
|
||||
return conversation;
|
||||
}
|
||||
|
||||
async findById(id: string): Promise<Conversation | null> {
|
||||
return this.store.get(id) ?? null;
|
||||
}
|
||||
|
||||
async findPendingForAgent(toAgentId: string): Promise<Conversation[]> {
|
||||
return [...this.store.values()]
|
||||
.filter((c) => c.toAgentId === toAgentId && c.status === 'pending')
|
||||
.sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime());
|
||||
}
|
||||
|
||||
async answer(id: string, answer: string): Promise<Conversation | null> {
|
||||
const conv = this.store.get(id);
|
||||
if (!conv) return null;
|
||||
const updated: Conversation = {
|
||||
...conv,
|
||||
answer,
|
||||
status: 'answered' as const,
|
||||
updatedAt: new Date(),
|
||||
};
|
||||
this.store.set(id, updated);
|
||||
return updated;
|
||||
}
|
||||
|
||||
/** Test helper — return all conversations */
|
||||
getAll(): Conversation[] {
|
||||
return [...this.store.values()];
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock conversation server — serves ONLY conversation tRPC procedures
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function startMockConversationServer(): Promise<{
|
||||
server: Server;
|
||||
port: number;
|
||||
repo: InMemoryConversationRepository;
|
||||
}> {
|
||||
const repo = new InMemoryConversationRepository();
|
||||
const eventBus = new EventEmitterBus();
|
||||
|
||||
// Mini router with only conversation procedures
|
||||
const miniRouter = router({
|
||||
...conversationProcedures(publicProcedure),
|
||||
});
|
||||
|
||||
const httpServer = createServer(async (req, res) => {
|
||||
if (!req.url?.startsWith('/trpc')) {
|
||||
res.writeHead(404);
|
||||
res.end('Not found');
|
||||
return;
|
||||
}
|
||||
|
||||
const host = req.headers.host ?? 'localhost';
|
||||
const url = new URL(req.url, `http://${host}`);
|
||||
|
||||
let body: string | undefined;
|
||||
if (req.method !== 'GET' && req.method !== 'HEAD') {
|
||||
body = await new Promise<string>((resolve) => {
|
||||
let data = '';
|
||||
req.on('data', (chunk: Buffer) => {
|
||||
data += chunk.toString();
|
||||
});
|
||||
req.on('end', () => resolve(data));
|
||||
});
|
||||
}
|
||||
|
||||
const headers = new Headers();
|
||||
for (const [key, value] of Object.entries(req.headers)) {
|
||||
if (value) {
|
||||
if (Array.isArray(value)) {
|
||||
value.forEach((v) => headers.append(key, v));
|
||||
} else {
|
||||
headers.set(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const fetchRequest = new Request(url.toString(), {
|
||||
method: req.method,
|
||||
headers,
|
||||
body: body ?? undefined,
|
||||
});
|
||||
|
||||
const fetchResponse = await fetchRequestHandler({
|
||||
endpoint: '/trpc',
|
||||
req: fetchRequest,
|
||||
router: miniRouter,
|
||||
createContext: () =>
|
||||
({
|
||||
eventBus,
|
||||
serverStartedAt: new Date(),
|
||||
processCount: 0,
|
||||
conversationRepository: repo,
|
||||
// Stub — requireAgentManager is called unconditionally in createConversation,
|
||||
// but list() is only invoked for taskId/phaseId resolution. With --agent-id
|
||||
// targeting, list() is never called.
|
||||
agentManager: { list: async () => [] },
|
||||
}) as any,
|
||||
});
|
||||
|
||||
res.statusCode = fetchResponse.status;
|
||||
fetchResponse.headers.forEach((value, key) => {
|
||||
res.setHeader(key, value);
|
||||
});
|
||||
|
||||
if (fetchResponse.body) {
|
||||
const reader = fetchResponse.body.getReader();
|
||||
const pump = async () => {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) {
|
||||
res.end();
|
||||
return;
|
||||
}
|
||||
res.write(value);
|
||||
}
|
||||
};
|
||||
pump().catch(() => res.end());
|
||||
} else {
|
||||
res.end(await fetchResponse.text());
|
||||
}
|
||||
});
|
||||
|
||||
const port = 40000 + Math.floor(Math.random() * 10000);
|
||||
await new Promise<void>((resolve) => {
|
||||
httpServer.listen(port, '127.0.0.1', () => resolve());
|
||||
});
|
||||
|
||||
return { server: httpServer, port, repo };
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Diagnostic helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function dumpAgentLogs(workspaceRoot: string, agentName: string) {
|
||||
const logDir = join(workspaceRoot, '.cw', 'agent-logs', agentName);
|
||||
if (!existsSync(logDir)) {
|
||||
console.log(` [${agentName}] No log directory at ${logDir}`);
|
||||
return;
|
||||
}
|
||||
// Dump output.jsonl (last 30 lines)
|
||||
const outputPath = join(logDir, 'output.jsonl');
|
||||
if (existsSync(outputPath)) {
|
||||
const lines = readFileSync(outputPath, 'utf-8').trim().split('\n');
|
||||
const last = lines.slice(-30);
|
||||
console.log(` [${agentName}] output.jsonl (last ${last.length}/${lines.length} lines):`);
|
||||
for (const line of last) {
|
||||
try {
|
||||
const ev = JSON.parse(line);
|
||||
if (ev.type === 'assistant' && ev.message?.content) {
|
||||
for (const block of ev.message.content) {
|
||||
if (block.type === 'text') {
|
||||
console.log(` TEXT: ${block.text.substring(0, 200)}`);
|
||||
} else if (block.type === 'tool_use') {
|
||||
console.log(` TOOL: ${block.name} ${JSON.stringify(block.input).substring(0, 150)}`);
|
||||
}
|
||||
}
|
||||
} else if (ev.type === 'result') {
|
||||
console.log(` RESULT: ${JSON.stringify(ev).substring(0, 300)}`);
|
||||
}
|
||||
} catch {
|
||||
console.log(` RAW: ${line.substring(0, 200)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Dump stderr
|
||||
const stderrPath = join(logDir, 'stderr.log');
|
||||
if (existsSync(stderrPath)) {
|
||||
const stderr = readFileSync(stderrPath, 'utf-8').trim();
|
||||
if (stderr) {
|
||||
console.log(` [${agentName}] stderr: ${stderr.substring(0, 500)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test suite
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describeRealClaude('Real Inter-Agent Conversation (mock server)', () => {
|
||||
let harness: RealProviderHarness;
|
||||
let mockServer: Server;
|
||||
let mockPort: number;
|
||||
let mockRepo: InMemoryConversationRepository;
|
||||
const originalCwPort = process.env.CW_PORT;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Real Inter-Agent Conversation Test ===');
|
||||
console.log('Mock conversation server + two Claude sessions.\n');
|
||||
|
||||
// Start mock conversation server (only listen/ask/answer endpoints)
|
||||
const mock = await startMockConversationServer();
|
||||
mockServer = mock.server;
|
||||
mockPort = mock.port;
|
||||
mockRepo = mock.repo;
|
||||
console.log(` Mock server on port ${mockPort}`);
|
||||
|
||||
// Set CW_PORT so agents' cw commands hit the mock server
|
||||
process.env.CW_PORT = String(mockPort);
|
||||
|
||||
// Real agent harness for spawning + worktrees (no full CoordinationServer)
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
console.log(` Workspace: ${harness.workspaceRoot}`);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (originalCwPort) {
|
||||
process.env.CW_PORT = originalCwPort;
|
||||
} else {
|
||||
delete process.env.CW_PORT;
|
||||
}
|
||||
await harness?.cleanup();
|
||||
mockServer?.close();
|
||||
});
|
||||
|
||||
it(
|
||||
'two agents with real tasks communicate via cw ask/listen/answer (two questions prove re-listen)',
|
||||
async () => {
|
||||
const agentSuffix = nanoid(6); // unique suffix for temp files
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Agent A — builds a validator module WHILE answering questions
|
||||
// in the background via cw listen
|
||||
// ---------------------------------------------------------------
|
||||
const agentA = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `You are Agent A in a multi-agent coordination test.
|
||||
|
||||
You have TWO concurrent responsibilities:
|
||||
1. Build a TypeScript validator module (your main coding task)
|
||||
2. Answer questions from other agents via a background listener
|
||||
|
||||
SETUP (do this first):
|
||||
- Read .cw/input/manifest.json to get your agentId
|
||||
- Start a background listener that writes to a temp file:
|
||||
cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
|
||||
LISTEN_PID=$!
|
||||
|
||||
MAIN CODING TASK — implement a user registration validator:
|
||||
|
||||
1. Create types.ts:
|
||||
export interface RegistrationInput { name: string; email: string; password: string; }
|
||||
export interface ValidationResult { valid: boolean; errors: string[]; }
|
||||
|
||||
2. Create validator.ts:
|
||||
Import from types.ts. Export function validateRegistration(input: RegistrationInput): ValidationResult
|
||||
Rules: name min 2 chars, email must have exactly one @ and domain with a dot and no spaces and max 254 chars, password min 8 chars.
|
||||
|
||||
3. Create index.ts that re-exports everything from types.ts and validator.ts.
|
||||
|
||||
BETWEEN EACH FILE, check for incoming questions:
|
||||
if [ -s /tmp/cw-listen-${agentSuffix}.txt ]; then
|
||||
# parse the JSON, get conversationId and question
|
||||
# answer: cw answer "<answer based on your code>" --conversation-id <id>
|
||||
# clear and restart listener:
|
||||
> /tmp/cw-listen-${agentSuffix}.txt
|
||||
cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
|
||||
LISTEN_PID=$!
|
||||
fi
|
||||
|
||||
You will receive TWO questions total while you work. Answer them based on the code you are writing.
|
||||
|
||||
CLEANUP: After all 3 files are written and both questions answered:
|
||||
- kill $LISTEN_PID 2>/dev/null
|
||||
- Write .cw/output/signal.json: {"status":"done","result":"validator module complete, answered 2 questions"}
|
||||
|
||||
CRITICAL:
|
||||
- The listener MUST run in the background while you write code.
|
||||
- Check for questions between files, not as blocking waits.
|
||||
- The CW_PORT environment variable is already set to ${mockPort}.`,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
inputContext: {},
|
||||
});
|
||||
|
||||
console.log(` Agent A: ${agentA.id} (${agentA.name})`);
|
||||
|
||||
// Give Agent A time to start its background listener and begin coding
|
||||
await sleep(15000);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Agent B — builds a client module, asks Agent A questions to
|
||||
// learn the validation rules, then uses answers in its code
|
||||
// ---------------------------------------------------------------
|
||||
const agentB = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `You are Agent B in a multi-agent coordination test.
|
||||
|
||||
Read .cw/input/manifest.json to get your agentId. Agent A (ID: ${agentA.id}) is building a validator module.
|
||||
|
||||
YOUR CODING TASK — build a registration API client that includes client-side validation matching Agent A's server-side rules:
|
||||
|
||||
1. Create client-scaffold.ts with a basic RegistrationClient class that has a register(name, email, password) method that returns Promise<{ok: boolean}>.
|
||||
Leave a TODO comment where validation will go.
|
||||
|
||||
2. NOW ask Agent A what the validation rules are — you need this to write proper client-side checks:
|
||||
FIELDS=$(cw ask "What are the required fields and their types for registration?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
|
||||
|
||||
3. Ask Agent A about the specific email validation rules:
|
||||
EMAIL_RULES=$(cw ask "What are the exact email validation rules you implemented?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
|
||||
|
||||
4. Create validated-client.ts — a COMPLETE implementation using the answers:
|
||||
Import the scaffold, add a validateBeforeSubmit(name, email, password) function
|
||||
that implements the EXACT validation rules Agent A told you about.
|
||||
Include a comment at the top with the rules you received.
|
||||
|
||||
5. Write .cw/output/signal.json: {"status":"done","result":"client module complete with validation from agent A"}
|
||||
|
||||
CRITICAL:
|
||||
- Create client-scaffold.ts BEFORE asking questions (you have independent work to do first).
|
||||
- Use the ACTUAL answers from Agent A in your validated-client.ts implementation.
|
||||
- The CW_PORT environment variable is already set to ${mockPort}.`,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
inputContext: {},
|
||||
});
|
||||
|
||||
console.log(` Agent B: ${agentB.id} (${agentB.name})`);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Wait for both agents to stop running, then verify conversations
|
||||
// ---------------------------------------------------------------
|
||||
const deadline = Date.now() + TEST_TIMEOUT;
|
||||
let aDone = false;
|
||||
let bDone = false;
|
||||
let lastLogTime = 0;
|
||||
|
||||
while (Date.now() < deadline && (!aDone || !bDone)) {
|
||||
const agentAInfo = await harness.agentRepository.findById(agentA.id);
|
||||
const agentBInfo = await harness.agentRepository.findById(agentB.id);
|
||||
|
||||
// Periodic progress logging every 30s
|
||||
if (Date.now() - lastLogTime > 30000) {
|
||||
const elapsed = Math.round((Date.now() - (deadline - TEST_TIMEOUT)) / 1000);
|
||||
console.log(` [${elapsed}s] A=${agentAInfo?.status ?? '?'} B=${agentBInfo?.status ?? '?'} convs=${mockRepo.getAll().length}`);
|
||||
lastLogTime = Date.now();
|
||||
}
|
||||
|
||||
if (agentAInfo && agentAInfo.status !== 'running' && !aDone) {
|
||||
aDone = true;
|
||||
console.log(` Agent A final status: ${agentAInfo.status}`);
|
||||
dumpAgentLogs(harness.workspaceRoot, agentA.name);
|
||||
}
|
||||
if (agentBInfo && agentBInfo.status !== 'running' && !bDone) {
|
||||
bDone = true;
|
||||
console.log(` Agent B final status: ${agentBInfo.status}`);
|
||||
dumpAgentLogs(harness.workspaceRoot, agentB.name);
|
||||
}
|
||||
|
||||
if (!aDone || !bDone) await sleep(2000);
|
||||
}
|
||||
|
||||
expect(aDone).toBe(true);
|
||||
expect(bDone).toBe(true);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify conversations in mock repo
|
||||
// ---------------------------------------------------------------
|
||||
const allConversations = mockRepo.getAll();
|
||||
console.log(` Total conversations: ${allConversations.length}`);
|
||||
for (const c of allConversations) {
|
||||
console.log(
|
||||
` ${c.id}: ${c.status} — Q: "${c.question}" A: "${c.answer?.substring(0, 80)}..."`,
|
||||
);
|
||||
}
|
||||
|
||||
// Exactly 2 conversations, both answered
|
||||
expect(allConversations.length).toBe(2);
|
||||
expect(allConversations.every((c) => c.status === 'answered')).toBe(true);
|
||||
|
||||
// Both target Agent A, both from Agent B
|
||||
expect(allConversations.every((c) => c.toAgentId === agentA.id)).toBe(true);
|
||||
expect(allConversations.every((c) => c.fromAgentId === agentB.id)).toBe(true);
|
||||
|
||||
// Questions should be distinct (one about fields, one about email validation)
|
||||
const questions = allConversations.map((c) => c.question);
|
||||
expect(questions.some((q) => q.toLowerCase().includes('field'))).toBe(true);
|
||||
expect(questions.some((q) => q.toLowerCase().includes('email'))).toBe(true);
|
||||
|
||||
// Both answers should be non-empty
|
||||
expect(allConversations.every((c) => c.answer && c.answer.length > 0)).toBe(true);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify Agent A's coding output — validator module files exist
|
||||
// ---------------------------------------------------------------
|
||||
const aWorkdir = join(
|
||||
harness.workspaceRoot,
|
||||
'agent-workdirs',
|
||||
agentA.name,
|
||||
'workspace',
|
||||
);
|
||||
const aFiles = ['types.ts', 'validator.ts', 'index.ts'];
|
||||
for (const f of aFiles) {
|
||||
const filePath = join(aWorkdir, f);
|
||||
const exists = existsSync(filePath);
|
||||
console.log(` Agent A file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
|
||||
expect(exists).toBe(true);
|
||||
}
|
||||
// validator.ts should contain actual validation logic
|
||||
const validatorContent = readFileSync(join(aWorkdir, 'validator.ts'), 'utf-8');
|
||||
console.log(` Agent A validator.ts (${validatorContent.length} chars): ${validatorContent.substring(0, 120)}...`);
|
||||
expect(validatorContent.toLowerCase()).toContain('email');
|
||||
expect(validatorContent.toLowerCase()).toContain('password');
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify Agent B's coding output — client module files exist
|
||||
// ---------------------------------------------------------------
|
||||
const bWorkdir = join(
|
||||
harness.workspaceRoot,
|
||||
'agent-workdirs',
|
||||
agentB.name,
|
||||
'workspace',
|
||||
);
|
||||
const bFiles = ['client-scaffold.ts', 'validated-client.ts'];
|
||||
for (const f of bFiles) {
|
||||
const filePath = join(bWorkdir, f);
|
||||
const exists = existsSync(filePath);
|
||||
console.log(` Agent B file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
|
||||
expect(exists).toBe(true);
|
||||
}
|
||||
// validated-client.ts should reference validation rules from Agent A's answers
|
||||
const clientContent = readFileSync(join(bWorkdir, 'validated-client.ts'), 'utf-8');
|
||||
console.log(` Agent B validated-client.ts (${clientContent.length} chars): ${clientContent.substring(0, 120)}...`);
|
||||
expect(clientContent.toLowerCase()).toContain('email');
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify interleaving: Agent A's JSONL log has coding tool calls
|
||||
// (Write for .ts files) interleaved with conversation tool calls
|
||||
// (Bash for cw listen/answer)
|
||||
// ---------------------------------------------------------------
|
||||
const aLogPath = join(harness.workspaceRoot, '.cw', 'agent-logs', agentA.name, 'output.jsonl');
|
||||
const aLog = readFileSync(aLogPath, 'utf-8').trim().split('\n');
|
||||
const toolCalls: { type: 'code' | 'conversation'; name: string; detail: string }[] = [];
|
||||
|
||||
for (const line of aLog) {
|
||||
try {
|
||||
const ev = JSON.parse(line);
|
||||
if (ev.type !== 'assistant' || !ev.message?.content) continue;
|
||||
for (const block of ev.message.content) {
|
||||
if (block.type !== 'tool_use') continue;
|
||||
const input = typeof block.input === 'string' ? block.input : JSON.stringify(block.input);
|
||||
if (block.name === 'Write' && input.includes('.ts')) {
|
||||
toolCalls.push({ type: 'code', name: 'Write', detail: input.substring(0, 80) });
|
||||
} else if (block.name === 'Bash' && (input.includes('cw listen') || input.includes('cw answer'))) {
|
||||
toolCalls.push({ type: 'conversation', name: 'Bash', detail: input.substring(0, 80) });
|
||||
}
|
||||
}
|
||||
} catch { /* skip non-JSON lines */ }
|
||||
}
|
||||
|
||||
console.log(` Agent A interleaving (${toolCalls.length} relevant tool calls):`);
|
||||
for (const tc of toolCalls) {
|
||||
console.log(` [${tc.type}] ${tc.name}: ${tc.detail}`);
|
||||
}
|
||||
|
||||
// Must have both code and conversation tool calls
|
||||
const hasCode = toolCalls.some((tc) => tc.type === 'code');
|
||||
const hasConversation = toolCalls.some((tc) => tc.type === 'conversation');
|
||||
expect(hasCode).toBe(true);
|
||||
expect(hasConversation).toBe(true);
|
||||
|
||||
// Verify interleaving: at least one code call must appear AFTER a conversation call
|
||||
// (proving coding continued after handling a question)
|
||||
const firstConvIdx = toolCalls.findIndex((tc) => tc.type === 'conversation');
|
||||
const lastCodeIdx = toolCalls.length - 1 - [...toolCalls].reverse().findIndex((tc) => tc.type === 'code');
|
||||
console.log(` First conversation at index ${firstConvIdx}, last code at index ${lastCodeIdx}`);
|
||||
expect(lastCodeIdx).toBeGreaterThan(firstConvIdx);
|
||||
},
|
||||
TEST_TIMEOUT,
|
||||
);
|
||||
});
|
||||
@@ -0,0 +1,265 @@
|
||||
/**
|
||||
* Crash Recovery Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
 * REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/crash-recovery.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Server restart while agent is running
|
||||
* - Resuming streaming after restart
|
||||
* - Marking dead agents as crashed
|
||||
* - Output file processing after restart
|
||||
*
|
||||
* Estimated cost: ~$0.08 per full run
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
REAL_TEST_TIMEOUT,
|
||||
EXTENDED_TEST_TIMEOUT,
|
||||
sleep,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { MINIMAL_PROMPTS } from './prompts.js';
|
||||
import { MultiProviderAgentManager } from '../../../agent/manager.js';
|
||||
|
||||
describeRealClaude('Crash Recovery', () => {
|
||||
let harness: RealProviderHarness;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Running Crash Recovery Tests ===');
|
||||
console.log('These tests call the real Claude API and incur costs.\n');
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await harness.cleanup();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
harness.clearEvents();
|
||||
});
|
||||
|
||||
describe('Server Restart Simulation', () => {
|
||||
it(
|
||||
'resumes streaming for still-running agent after restart',
|
||||
async () => {
|
||||
// 1. Spawn agent with slow task
|
||||
console.log(' 1. Spawning agent with slow task...');
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.slow,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for agent to be running
|
||||
await harness.waitForAgentStatus(agent.id, 'running', 10000);
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.pid).toBeTruthy();
|
||||
expect(dbAgent?.outputFilePath).toBeTruthy();
|
||||
console.log(' 2. Agent running with PID:', dbAgent?.pid);
|
||||
|
||||
// 3. Give the agent a moment to start writing output
|
||||
await sleep(2000);
|
||||
|
||||
// 4. Simulate server crash - create NEW manager (old state lost)
|
||||
console.log(' 3. Simulating server restart with new manager...');
|
||||
harness.clearEvents(); // Clear events from old manager
|
||||
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
// 5. Reconcile - should pick up running agent
|
||||
console.log(' 4. Reconciling agent state...');
|
||||
await newManager.reconcileAfterRestart();
|
||||
|
||||
// 6. Wait for completion via new manager
|
||||
console.log(' 5. Waiting for completion via new manager...');
|
||||
let attempts = 0;
|
||||
let finalStatus = 'running';
|
||||
while (attempts < 60) {
|
||||
const refreshed = await harness.agentRepository.findById(agent.id);
|
||||
if (refreshed?.status !== 'running') {
|
||||
finalStatus = refreshed?.status ?? 'unknown';
|
||||
break;
|
||||
}
|
||||
await sleep(2000);
|
||||
attempts++;
|
||||
}
|
||||
|
||||
const finalAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' 6. Final status:', finalAgent?.status);
|
||||
|
||||
// Either completed successfully or crashed (both are valid outcomes)
|
||||
expect(['idle', 'crashed', 'stopped']).toContain(finalAgent?.status);
|
||||
|
||||
if (finalAgent?.status === 'idle') {
|
||||
const result = await newManager.getResult(agent.id);
|
||||
console.log(' Result:', result?.message);
|
||||
}
|
||||
},
|
||||
EXTENDED_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'marks dead agent as crashed during reconcile',
|
||||
async () => {
|
||||
// 1. Create a fake agent record with a dead PID
|
||||
console.log(' 1. Creating fake agent with dead PID...');
|
||||
const fakeAgent = await harness.agentRepository.create({
|
||||
name: 'dead-agent-test',
|
||||
taskId: null,
|
||||
initiativeId: null,
|
||||
sessionId: null,
|
||||
worktreeId: 'dead-worktree',
|
||||
status: 'running',
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
accountId: null,
|
||||
});
|
||||
|
||||
// Set a PID that's definitely dead (high number that won't exist)
|
||||
await harness.agentRepository.update(fakeAgent.id, { pid: 999999, outputFilePath: '/nonexistent/path' });
|
||||
|
||||
// Verify it's marked as running
|
||||
let agent = await harness.agentRepository.findById(fakeAgent.id);
|
||||
expect(agent?.status).toBe('running');
|
||||
expect(agent?.pid).toBe(999999);
|
||||
|
||||
// 2. Create new manager and reconcile
|
||||
console.log(' 2. Creating new manager and reconciling...');
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
await newManager.reconcileAfterRestart();
|
||||
|
||||
// 3. Verify agent is now crashed
|
||||
agent = await harness.agentRepository.findById(fakeAgent.id);
|
||||
expect(agent?.status).toBe('crashed');
|
||||
console.log(' 3. Agent marked as crashed (dead PID detected)');
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'processes output file for dead agent during reconcile',
|
||||
async () => {
|
||||
// 1. Spawn agent and wait for completion
|
||||
console.log(' 1. Spawning agent to completion...');
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
const outputFilePath = dbAgent?.outputFilePath;
|
||||
expect(outputFilePath).toBeTruthy();
|
||||
console.log(' 2. Output file:', outputFilePath);
|
||||
|
||||
// 2. Reset agent to "running" to simulate mid-crash state
|
||||
await harness.agentRepository.update(agent.id, { status: 'running' });
|
||||
// Clear result so reconcile has to re-process
|
||||
await harness.agentRepository.update(agent.id, { result: null });
|
||||
|
||||
// Verify reset
|
||||
let resetAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(resetAgent?.status).toBe('running');
|
||||
|
||||
// 3. Create new manager and reconcile
|
||||
console.log(' 3. Creating new manager and reconciling...');
|
||||
harness.clearEvents();
|
||||
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
await newManager.reconcileAfterRestart();
|
||||
|
||||
// Give it a moment to process the file
|
||||
await sleep(1000);
|
||||
|
||||
// 4. Verify agent was processed from output file
|
||||
const finalAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' 4. Final status:', finalAgent?.status);
|
||||
|
||||
// Should either be idle (processed successfully) or crashed (couldn't process)
|
||||
expect(['idle', 'crashed']).toContain(finalAgent?.status);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Event Consistency', () => {
|
||||
it(
|
||||
'does not duplicate events on restart',
|
||||
async () => {
|
||||
// 1. Spawn agent with slow task
|
||||
console.log(' 1. Spawning agent...');
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.streaming,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for some output events
|
||||
await sleep(3000);
|
||||
const initialOutputCount = harness.getEventsByType('agent:output').length;
|
||||
console.log(' 2. Initial output events:', initialOutputCount);
|
||||
|
||||
// 3. Wait for completion
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
const finalOutputCount = harness.getEventsByType('agent:output').length;
|
||||
console.log(' 3. Final output events:', finalOutputCount);
|
||||
|
||||
// 4. Create new manager and reconcile (agent already complete)
|
||||
harness.clearEvents();
|
||||
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
await newManager.reconcileAfterRestart();
|
||||
await sleep(1000);
|
||||
|
||||
// 5. Verify no new output events (agent was already complete)
|
||||
const postReconcileOutputCount = harness.getEventsByType('agent:output').length;
|
||||
console.log(' 4. Post-reconcile output events:', postReconcileOutputCount);
|
||||
|
||||
// Should not have re-emitted all the old output events
|
||||
expect(postReconcileOutputCount).toBe(0);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
});
|
||||
378
apps/server/test/integration/real-providers/harness.ts
Normal file
378
apps/server/test/integration/real-providers/harness.ts
Normal file
@@ -0,0 +1,378 @@
|
||||
/**
|
||||
* Real Provider Test Harness
|
||||
*
|
||||
* Extends the existing test infrastructure to use REAL MultiProviderAgentManager
|
||||
* for integration testing with actual CLI providers like Claude and Codex.
|
||||
*
|
||||
* Unlike the standard TestHarness which uses MockAgentManager, this harness:
|
||||
* - Uses real CLI spawning (costs real API credits!)
|
||||
* - Provides poll-based waiting helpers
|
||||
* - Captures events for inspection
|
||||
* - Manages temp directories for worktrees
|
||||
*/
|
||||
|
||||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe } from 'vitest';
|
||||
import type { DrizzleDatabase } from '../../../db/index.js';
|
||||
import type { DomainEvent, EventBus } from '../../../events/types.js';
|
||||
import { EventEmitterBus } from '../../../events/bus.js';
|
||||
import { MultiProviderAgentManager } from '../../../agent/manager.js';
|
||||
import type { AgentResult, PendingQuestions, AgentStatus } from '../../../agent/types.js';
|
||||
import type { AgentRepository } from '../../../db/repositories/agent-repository.js';
|
||||
import type { ProjectRepository } from '../../../db/repositories/project-repository.js';
|
||||
import type { AccountRepository } from '../../../db/repositories/account-repository.js';
|
||||
import type { InitiativeRepository } from '../../../db/repositories/initiative-repository.js';
|
||||
import {
|
||||
DrizzleAgentRepository,
|
||||
DrizzleProjectRepository,
|
||||
DrizzleAccountRepository,
|
||||
DrizzleInitiativeRepository,
|
||||
} from '../../../db/repositories/drizzle/index.js';
|
||||
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
|
||||
|
||||
/**
|
||||
* Sleep helper for polling loops.
|
||||
*/
|
||||
export function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* Event bus that captures all emitted events for inspection.
|
||||
*/
|
||||
export class CapturingEventBus extends EventEmitterBus {
|
||||
emittedEvents: DomainEvent[] = [];
|
||||
|
||||
emit<T extends DomainEvent>(event: T): void {
|
||||
this.emittedEvents.push(event);
|
||||
super.emit(event);
|
||||
}
|
||||
|
||||
getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
|
||||
return this.emittedEvents.filter((e) => e.type === type) as T[];
|
||||
}
|
||||
|
||||
clearEvents(): void {
|
||||
this.emittedEvents = [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Options for creating a real provider test harness.
 */
export interface RealProviderHarnessOptions {
  /** Which provider to test (default: 'claude') */
  provider?: 'claude' | 'codex';
  /**
   * Optional workspace root (temp dir created if omitted).
   * When provided, the caller owns the directory: cleanup() will not delete it.
   */
  workspaceRoot?: string;
}
|
||||
|
||||
/**
 * Real Provider Test Harness interface.
 *
 * Provides everything needed to test against real CLI providers:
 * - In-memory database with real repositories
 * - Real MultiProviderAgentManager (spawns actual CLI processes)
 * - Event capture for verification
 * - Polling-based wait helpers
 */
export interface RealProviderHarness {
  /** In-memory SQLite database */
  db: DrizzleDatabase;
  /** Event bus with capture capability */
  eventBus: CapturingEventBus;
  /** Real agent manager (not mock!) */
  agentManager: MultiProviderAgentManager;
  /** Workspace root directory */
  workspaceRoot: string;

  /** Agent repository */
  agentRepository: AgentRepository;
  /** Project repository */
  projectRepository: ProjectRepository;
  /** Account repository */
  accountRepository: AccountRepository;
  /** Initiative repository */
  initiativeRepository: InitiativeRepository;

  /**
   * Wait for an agent to reach idle or crashed status.
   * Polls the database at regular intervals.
   *
   * @param agentId - The agent ID to wait for
   * @param timeoutMs - Maximum time to wait (default 120000ms = 2 minutes)
   * @returns The agent result once the agent reaches idle/stopped/crashed,
   *   or null if the agent record disappears or the agent enters
   *   waiting_for_input (it will not complete without answers)
   * @throws Error if the timeout elapses while the agent is still running
   */
  waitForAgentCompletion(agentId: string, timeoutMs?: number): Promise<AgentResult | null>;

  /**
   * Wait for an agent to enter waiting_for_input status.
   * Polls the database at regular intervals.
   *
   * @param agentId - The agent ID to wait for
   * @param timeoutMs - Maximum time to wait (default 120000ms)
   * @returns The pending questions once the agent is waiting, or null if the
   *   agent finishes (idle/stopped/crashed) without asking anything
   * @throws Error if the timeout elapses while the agent is still running
   */
  waitForAgentWaiting(agentId: string, timeoutMs?: number): Promise<PendingQuestions | null>;

  /**
   * Wait for an agent to reach a specific status.
   *
   * @param agentId - The agent ID to wait for
   * @param status - The target status
   * @param timeoutMs - Maximum time to wait (default 120000ms)
   * @throws Error on timeout, if the agent record is missing, or if a
   *   terminal state is reached that makes a 'running' target unreachable
   */
  waitForAgentStatus(agentId: string, status: AgentStatus, timeoutMs?: number): Promise<void>;

  /**
   * Get captured events filtered by type.
   */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[];

  /**
   * Clear all captured events.
   */
  clearEvents(): void;

  /**
   * Stop all agents currently in 'running' status (for cleanup).
   * Errors from individual stops are swallowed.
   */
  killAllAgents(): Promise<void>;

  /**
   * Clean up all resources (directories, processes).
   * Call this in afterAll/afterEach.
   */
  cleanup(): Promise<void>;
}
|
||||
|
||||
/** Default poll interval for status checks, in milliseconds. */
const POLL_INTERVAL_MS = 1000;
|
||||
|
||||
/**
 * Create a test harness for real provider integration tests.
 *
 * This creates:
 * - In-memory SQLite database
 * - Temp directory for worktrees (or uses provided workspace)
 * - Real MultiProviderAgentManager
 * - Event capture bus
 *
 * @example
 * ```typescript
 * let harness: RealProviderHarness;
 *
 * beforeAll(async () => {
 *   harness = await createRealProviderHarness({ provider: 'claude' });
 * });
 *
 * afterAll(async () => {
 *   await harness.cleanup();
 * });
 *
 * it('spawns and completes', async () => {
 *   const agent = await harness.agentManager.spawn({...});
 *   const result = await harness.waitForAgentCompletion(agent.id);
 *   expect(result?.success).toBe(true);
 * });
 * ```
 */
export async function createRealProviderHarness(
  options: RealProviderHarnessOptions = {}
): Promise<RealProviderHarness> {
  // Create workspace directory (temp if not provided)
  const workspaceRoot = options.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'cw-test-')));
  const ownedWorkspace = !options.workspaceRoot; // Track if we need to clean up

  // Initialize git repo in temp workspace (required for worktree operations).
  // Only done for directories we created; a caller-provided workspace is
  // assumed to already be set up.
  if (ownedWorkspace) {
    const { execSync } = await import('node:child_process');
    execSync('git init', { cwd: workspaceRoot, stdio: 'ignore' });
    execSync('git config user.email "test@test.com"', { cwd: workspaceRoot, stdio: 'ignore' });
    execSync('git config user.name "Test"', { cwd: workspaceRoot, stdio: 'ignore' });
    // Create initial commit (worktrees require at least one commit)
    execSync('touch .gitkeep && git add .gitkeep && git commit -m "init"', { cwd: workspaceRoot, stdio: 'ignore' });
  }

  // Create in-memory database
  const db = createTestDatabase();

  // Create repositories
  const agentRepository = new DrizzleAgentRepository(db);
  const projectRepository = new DrizzleProjectRepository(db);
  const accountRepository = new DrizzleAccountRepository(db);
  const initiativeRepository = new DrizzleInitiativeRepository(db);

  // Create event bus with capture (parent class already sets maxListeners to 100)
  const eventBus = new CapturingEventBus();

  // Create REAL agent manager (not mock!)
  const agentManager = new MultiProviderAgentManager(
    agentRepository,
    workspaceRoot,
    projectRepository,
    accountRepository,
    eventBus
  );

  // Build harness. The wait helpers below close over the repositories and
  // manager created above; they poll the DB rather than subscribing to
  // events, so they also observe state changes made by other managers.
  const harness: RealProviderHarness = {
    db,
    eventBus,
    agentManager,
    workspaceRoot,
    agentRepository,
    projectRepository,
    accountRepository,
    initiativeRepository,

    // NOTE: throws on timeout rather than returning null; null means the
    // agent vanished or ended up waiting_for_input.
    async waitForAgentCompletion(agentId: string, timeoutMs = 120000): Promise<AgentResult | null> {
      const deadline = Date.now() + timeoutMs;

      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) return null;

        if (agent.status === 'idle' || agent.status === 'stopped') {
          // Agent completed - get result
          return agentManager.getResult(agentId);
        }

        if (agent.status === 'crashed') {
          // Agent crashed - return the error result
          return agentManager.getResult(agentId);
        }

        if (agent.status === 'waiting_for_input') {
          // Agent is waiting - return null (not completed)
          return null;
        }

        // Still running - wait and check again
        await sleep(POLL_INTERVAL_MS);
      }

      throw new Error(`Timeout waiting for agent ${agentId} to complete after ${timeoutMs}ms`);
    },

    // NOTE: throws on timeout; null means the agent finished without asking.
    async waitForAgentWaiting(agentId: string, timeoutMs = 120000): Promise<PendingQuestions | null> {
      const deadline = Date.now() + timeoutMs;

      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) return null;

        if (agent.status === 'waiting_for_input') {
          return agentManager.getPendingQuestions(agentId);
        }

        if (agent.status === 'idle' || agent.status === 'stopped' || agent.status === 'crashed') {
          // Agent finished without asking questions
          return null;
        }

        // Still running - wait and check again
        await sleep(POLL_INTERVAL_MS);
      }

      throw new Error(`Timeout waiting for agent ${agentId} to request input after ${timeoutMs}ms`);
    },

    async waitForAgentStatus(agentId: string, status: AgentStatus, timeoutMs = 120000): Promise<void> {
      const deadline = Date.now() + timeoutMs;

      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) {
          throw new Error(`Agent ${agentId} not found`);
        }

        if (agent.status === status) {
          return;
        }

        // Check for terminal states that mean we'll never reach target
        // (only applies when waiting for 'running'; other targets keep polling)
        if (status === 'running' && ['idle', 'stopped', 'crashed', 'waiting_for_input'].includes(agent.status)) {
          throw new Error(`Agent ${agentId} already in terminal state ${agent.status}, cannot reach ${status}`);
        }

        await sleep(POLL_INTERVAL_MS);
      }

      throw new Error(`Timeout waiting for agent ${agentId} to reach status ${status} after ${timeoutMs}ms`);
    },

    getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
      return eventBus.getEventsByType<T>(type);
    },

    clearEvents(): void {
      eventBus.clearEvents();
    },

    // NOTE(review): only agents in 'running' status are stopped here;
    // agents in waiting_for_input are left untouched — confirm they have
    // no live process that would leak during cleanup.
    async killAllAgents(): Promise<void> {
      const agents = await agentRepository.findAll();
      for (const agent of agents) {
        if (agent.status === 'running') {
          try {
            await agentManager.stop(agent.id);
          } catch {
            // Ignore errors during cleanup
          }
        }
      }
    },

    async cleanup(): Promise<void> {
      // Kill any running agents
      await harness.killAllAgents();

      // Clean up workspace directory if we created it
      if (ownedWorkspace) {
        try {
          await rm(workspaceRoot, { recursive: true, force: true });
        } catch {
          // Ignore cleanup errors
        }
      }
    },
  };

  return harness;
}
|
||||
|
||||
/**
 * Check if real Claude tests should run.
 * Set REAL_CLAUDE_TESTS=1 environment variable to enable.
 */
export const shouldRunRealClaudeTests = process.env.REAL_CLAUDE_TESTS === '1';

/**
 * Check if real Codex tests should run.
 * Set REAL_CODEX_TESTS=1 environment variable to enable.
 */
export const shouldRunRealCodexTests = process.env.REAL_CODEX_TESTS === '1';

/**
 * Skip wrapper for Claude tests - skips unless REAL_CLAUDE_TESTS=1.
 * Suites still appear in reporter output, marked as skipped.
 */
export const describeRealClaude: typeof describe = shouldRunRealClaudeTests ? describe : (describe.skip as typeof describe);

/**
 * Skip wrapper for Codex tests - skips unless REAL_CODEX_TESTS=1.
 * Suites still appear in reporter output, marked as skipped.
 */
export const describeRealCodex: typeof describe = shouldRunRealCodexTests ? describe : (describe.skip as typeof describe);

/**
 * Default test timeout for real CLI tests (2 minutes).
 * Real API calls take 5-30 seconds typically.
 */
export const REAL_TEST_TIMEOUT = 120000;

/**
 * Extended test timeout for slow tests (5 minutes).
 * Used for schema retry tests and crash recovery tests.
 */
export const EXTENDED_TEST_TIMEOUT = 300000;
|
||||
56
apps/server/test/integration/real-providers/index.ts
Normal file
56
apps/server/test/integration/real-providers/index.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Real Provider Integration Tests
|
||||
*
|
||||
* This module provides infrastructure for testing against real CLI providers.
|
||||
* Tests are expensive (real API calls) and skipped by default.
|
||||
*
|
||||
* ## Running Tests
|
||||
*
|
||||
* ```bash
|
||||
* # Claude tests only
|
||||
 * REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/ --test-timeout=300000
|
||||
*
|
||||
* # Codex tests only
|
||||
 * REAL_CODEX_TESTS=1 npm test -- apps/server/test/integration/real-providers/codex-manager.test.ts
|
||||
*
|
||||
* # All real provider tests
|
||||
 * REAL_CLAUDE_TESTS=1 REAL_CODEX_TESTS=1 npm test -- apps/server/test/integration/real-providers/
|
||||
* ```
|
||||
*
|
||||
* ## Cost Estimates
|
||||
*
|
||||
* | Suite | Tests | Est. Cost | Duration |
|
||||
* |-------|-------|-----------|----------|
|
||||
* | Output Parsing | 3 | $0.06 | ~2 min |
|
||||
* | Schema Validation | 4 | $0.22 | ~4 min |
|
||||
* | Crash Recovery | 3 | $0.08 | ~3 min |
|
||||
* | Session Resume | 2 | $0.08 | ~3 min |
|
||||
* | Codex Integration | 2 | $0.10 | ~2 min |
|
||||
* | **TOTAL** | **14** | **~$0.54** | **~14 min** |
|
||||
*
|
||||
* ## Test Files
|
||||
*
|
||||
* - `harness.ts` - RealProviderHarness factory and utilities
|
||||
* - `prompts.ts` - Minimal cost test prompts
|
||||
* - `claude-manager.test.ts` - Claude spawn/resume/output tests
|
||||
* - `codex-manager.test.ts` - Codex provider tests
|
||||
* - `schema-retry.test.ts` - Schema validation + retry tests
|
||||
* - `crash-recovery.test.ts` - Server restart simulation
|
||||
* - `sample-outputs/` - Captured CLI output for parser unit tests
|
||||
*/
|
||||
|
||||
export {
|
||||
createRealProviderHarness,
|
||||
CapturingEventBus,
|
||||
sleep,
|
||||
shouldRunRealClaudeTests,
|
||||
shouldRunRealCodexTests,
|
||||
describeRealClaude,
|
||||
describeRealCodex,
|
||||
REAL_TEST_TIMEOUT,
|
||||
EXTENDED_TEST_TIMEOUT,
|
||||
type RealProviderHarness,
|
||||
type RealProviderHarnessOptions,
|
||||
} from './harness.js';
|
||||
|
||||
export { MINIMAL_PROMPTS, CODEX_PROMPTS } from './prompts.js';
|
||||
113
apps/server/test/integration/real-providers/prompts.ts
Normal file
113
apps/server/test/integration/real-providers/prompts.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Minimal Cost Test Prompts
|
||||
*
|
||||
* Carefully crafted prompts designed to minimize token usage while
|
||||
* testing specific CLI behaviors. Each prompt aims for the smallest
|
||||
* possible API cost while still exercising the target functionality.
|
||||
*
|
||||
* Cost estimates assume Claude Sonnet pricing (~$3/M input, $15/M output).
|
||||
*/
|
||||
|
||||
// Prompts are sent verbatim to the provider CLI; keep the template strings
// byte-stable so recorded sample outputs and schema tests stay reproducible.
export const MINIMAL_PROMPTS = {
  /**
   * ~$0.01 - Cheapest done response
   * Tests: basic spawn → completion flow, status parsing
   */
  done: `Output exactly this JSON with no other text:
{"status":"done","result":"ok"}`,

  /**
   * ~$0.01 - Cheapest questions response
   * Tests: waiting_for_input status, questions array parsing
   */
  questions: `Output exactly this JSON with no other text:
{"status":"questions","questions":[{"id":"q1","question":"What is your name?"}]}`,

  /**
   * ~$0.03 - Slow task for timing tests
   * Tests: streaming during long-running task, crash recovery
   * Note: Agent may not actually wait 30 seconds, but will produce delayed output
   */
  slow: `Think through a simple problem step by step, counting from 1 to 10 slowly, then output:
{"status":"done","result":"counted to 10"}`,

  /**
   * ~$0.02 - Produces text deltas for streaming tests
   * Tests: text_delta event parsing, output buffering
   */
  streaming: `Count from 1 to 5, outputting each number, then output:
{"status":"done","result":"counted"}`,

  /**
   * ~$0.03 - Deliberately produces non-JSON first
   * Tests: schema validation failure, retry logic
   */
  badThenGood: `First say "thinking..." on its own line, then output:
{"status":"done","result":"fixed"}`,

  /**
   * ~$0.02 - Multiple questions
   * Tests: questions array with multiple items
   */
  multipleQuestions: `Output exactly this JSON with no other text:
{"status":"questions","questions":[{"id":"q1","question":"First question?"},{"id":"q2","question":"Second question?"}]}`,

  /**
   * ~$0.01 - Error signal
   * Tests: error status handling
   */
  error: `Output exactly this JSON with no other text:
{"status":"error","error":"Test error message"}`,

  /**
   * ~$0.02 - Answer continuation
   * Tests: session resume with answers
   *
   * Builds a follow-up prompt echoing each "<id>: <answer>" pair, one per
   * line, then instructs the agent to emit the universal done signal.
   */
  answerContinuation: (answers: Record<string, string>): string => {
    const answerLines = Object.entries(answers)
      .map(([id, answer]) => `${id}: ${answer}`)
      .join('\n');
    return `I received your answers:
${answerLines}

Now complete the task by outputting:
{"status":"done","result":"completed with answers"}`;
  },

  /**
   * ~$0.02 - Context complete for discuss mode
   * Tests: discuss mode output handling (now uses universal done signal)
   */
  discussComplete: `Output exactly this JSON with no other text:
{"status":"done"}`,

  /**
   * ~$0.02 - Plan complete
   * Tests: plan mode output handling (now uses universal done signal)
   */
  planComplete: `Output exactly this JSON with no other text:
{"status":"done"}`,

  /**
   * ~$0.02 - Detail complete
   * Tests: detail mode output handling (now uses universal done signal)
   */
  detailComplete: `Output exactly this JSON with no other text:
{"status":"done"}`,
} as const;
|
||||
|
||||
/**
 * Prompts specifically for Codex provider testing.
 * Codex may have different output format requirements, so these use plain
 * natural-language instructions rather than strict JSON output signals.
 */
export const CODEX_PROMPTS = {
  /**
   * Basic completion for Codex
   */
  done: `Complete this simple task: output "done" and finish.`,

  /**
   * Produces streaming output
   */
  streaming: `Count from 1 to 5, saying each number aloud, then say "finished".`,
} as const;
|
||||
@@ -0,0 +1,68 @@
|
||||
# Sample CLI Outputs
|
||||
|
||||
This directory contains captured real CLI outputs for use in parser unit tests.
|
||||
These files allow testing stream parsers without incurring API costs.
|
||||
|
||||
## Files
|
||||
|
||||
### claude-stream-success.jsonl
|
||||
A successful Claude CLI session (v2.1.33) that:
|
||||
- Initializes with `system` event containing `session_id`
|
||||
- Emits `assistant` message with content
|
||||
- Completes with `result` event containing `done` status JSON
|
||||
|
||||
### claude-stream-questions.jsonl
|
||||
A Claude CLI session that:
|
||||
- Initializes with `system` event containing `session_id`
|
||||
- Emits `assistant` message with content wrapped in markdown code block
|
||||
- Completes with `result` event containing `questions` status JSON
|
||||
|
||||
### codex-stream-success.jsonl
|
||||
A successful Codex CLI session (v0.98.0) that:
|
||||
- Starts with `thread.started` event containing `thread_id`
|
||||
- Emits `turn.started`, `item.completed` events
|
||||
- Completes with `turn.completed` event containing usage stats
|
||||
|
||||
## Event Type Differences
|
||||
|
||||
### Claude CLI (`--output-format stream-json`)
|
||||
- `system` (subtype: `init`) - Contains `session_id`, tools, model info
|
||||
- `assistant` - Contains message content in `content[].text`
|
||||
- `result` - Contains final `result` text and `total_cost_usd`
|
||||
|
||||
### Codex CLI (`--json`)
|
||||
- `thread.started` - Contains `thread_id` (equivalent to session_id)
|
||||
- `turn.started` - Marks beginning of turn
|
||||
- `item.completed` - Contains reasoning or agent_message items
|
||||
- `turn.completed` - Contains usage stats
|
||||
|
||||
## Usage
|
||||
|
||||
These files can be used to test stream parsers in isolation:
|
||||
|
||||
```typescript
|
||||
import { readFileSync } from 'fs';
|
||||
import { ClaudeStreamParser } from '../../../agent/providers/parsers/claude.js';
|
||||
|
||||
const output = readFileSync('sample-outputs/claude-stream-success.jsonl', 'utf-8');
|
||||
const parser = new ClaudeStreamParser();
|
||||
|
||||
for (const line of output.split('\n')) {
|
||||
if (line.trim()) {
|
||||
const events = parser.parseLine(line);
|
||||
// Assert on events...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Capturing New Outputs
|
||||
|
||||
### Claude
|
||||
```bash
|
||||
claude -p "your prompt" --output-format stream-json --verbose > output.jsonl
|
||||
```
|
||||
|
||||
### Codex
|
||||
```bash
|
||||
codex exec --full-auto --json "your prompt" > output.jsonl
|
||||
```
|
||||
@@ -0,0 +1,3 @@
|
||||
{"type":"system","subtype":"init","cwd":"/Users/lukasmay/development/projects/codewalk-district","session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","slash_commands":["keybindings-help","debug","gsd:define-requirements","gsd:list-phase-assumptions","gsd:debug","gsd:remove-phase","gsd:complete-milestone","gsd:research-phase","gsd:plan-phase","gsd:check-todos","gsd:pause-work","gsd:execute-plan","gsd:research-project","gsd:add-todo","gsd:plan-fix","gsd:resume-work","gsd:progress","gsd:help","gsd:discuss-milestone","gsd:add-phase","gsd:create-roadmap","gsd:map-codebase","gsd:whats-new","gsd:insert-phase","gsd:new-milestone","gsd:new-project","gsd:execute-phase","gsd:verify-work","gsd:discuss-phase","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.33","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","claude-code-guide","jira-sw-assessment"],"skills":["keybindings-help","debug"],"plugins":[],"uuid":"224c683c-41f4-4fdd-9af6-f8cdca366ec1"}
|
||||
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01CfDymxvSRFodJ5Zm6NjLHV","type":"message","role":"assistant","content":[{"type":"text","text":"```json\n{\"status\":\"questions\",\"questions\":[{\"id\":\"q1\",\"question\":\"What is your name?\"},{\"id\":\"q2\",\"question\":\"What is the deadline?\"}]}\n```"}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":5983,"cache_read_input_tokens":18026,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":5983},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","uuid":"29288f20-766c-4047-82f5-679024188f52"}
|
||||
{"type":"result","subtype":"success","is_error":false,"duration_ms":3213,"duration_api_ms":3203,"num_turns":1,"result":"```json\n{\"status\":\"questions\",\"questions\":[{\"id\":\"q1\",\"question\":\"What is your name?\"},{\"id\":\"q2\",\"question\":\"What is the deadline?\"}]}\n```","stop_reason":null,"session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","total_cost_usd":0.04754675,"usage":{"input_tokens":3,"cache_creation_input_tokens":5983,"cache_read_input_tokens":18026,"output_tokens":45,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":5983,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":3,"outputTokens":45,"cacheReadInputTokens":18026,"cacheCreationInputTokens":5983,"webSearchRequests":0,"costUSD":0.04754675,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"uuid":"08db08cd-0f12-47ae-8c21-c29e11a6d7df"}
|
||||
@@ -0,0 +1,3 @@
|
||||
{"type":"system","subtype":"init","cwd":"/Users/lukasmay/development/projects/codewalk-district","session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","slash_commands":["keybindings-help","debug","gsd:define-requirements","gsd:list-phase-assumptions","gsd:debug","gsd:remove-phase","gsd:complete-milestone","gsd:research-phase","gsd:plan-phase","gsd:check-todos","gsd:pause-work","gsd:execute-plan","gsd:research-project","gsd:add-todo","gsd:plan-fix","gsd:resume-work","gsd:progress","gsd:help","gsd:discuss-milestone","gsd:add-phase","gsd:create-roadmap","gsd:map-codebase","gsd:whats-new","gsd:insert-phase","gsd:new-milestone","gsd:new-project","gsd:execute-phase","gsd:verify-work","gsd:discuss-phase","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.33","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","claude-code-guide","jira-sw-assessment"],"skills":["keybindings-help","debug"],"plugins":[],"uuid":"c1d6dced-ca04-4335-a624-624660479b7b"}
|
||||
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RjSiQY1RUgT47j73Dom93j","type":"message","role":"assistant","content":[{"type":"text","text":"{\"status\":\"done\",\"result\":\"ok\"}"}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":5958,"cache_read_input_tokens":18026,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":5958},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","uuid":"f1c8695a-534e-4de2-a684-fa4a1ec03749"}
|
||||
{"type":"result","subtype":"success","is_error":false,"duration_ms":2465,"duration_api_ms":2453,"num_turns":1,"result":"{\"status\":\"done\",\"result\":\"ok\"}","stop_reason":null,"session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","total_cost_usd":0.046565499999999996,"usage":{"input_tokens":3,"cache_creation_input_tokens":5958,"cache_read_input_tokens":18026,"output_tokens":12,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":5958,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":3,"outputTokens":12,"cacheReadInputTokens":18026,"cacheCreationInputTokens":5958,"webSearchRequests":0,"costUSD":0.046565499999999996,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"uuid":"53139e08-b4f3-4f94-b129-82759f77fdca"}
|
||||
@@ -0,0 +1,5 @@
|
||||
{"type":"thread.started","thread_id":"019c3242-955e-7140-9978-517f0b5a22cb"}
|
||||
{"type":"turn.started"}
|
||||
{"type":"item.completed","item":{"id":"item_0","type":"reasoning","text":"**Confirming simple greeting task**"}}
|
||||
{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"Hello!"}}
|
||||
{"type":"turn.completed","usage":{"input_tokens":8458,"cached_input_tokens":6912,"output_tokens":32}}
|
||||
306
apps/server/test/integration/real-providers/schema-retry.test.ts
Normal file
306
apps/server/test/integration/real-providers/schema-retry.test.ts
Normal file
@@ -0,0 +1,306 @@
|
||||
/**
|
||||
* Schema Validation & Retry Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/schema-retry.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Valid JSON output validation
|
||||
* - Questions status parsing
|
||||
* - Schema validation failure with retry
|
||||
* - Max retry limit handling
|
||||
*
|
||||
* Estimated cost: ~$0.20 per full run (includes retries)
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
REAL_TEST_TIMEOUT,
|
||||
EXTENDED_TEST_TIMEOUT,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { MINIMAL_PROMPTS } from './prompts.js';
|
||||
import type { AgentResumedEvent, AgentCrashedEvent } from '../../../events/types.js';
|
||||
|
||||
describeRealClaude('Schema Validation & Retry', () => {
|
||||
let harness: RealProviderHarness;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Running Schema Validation & Retry Tests ===');
|
||||
console.log('These tests call the real Claude API and incur costs.');
|
||||
console.log('Retry tests may take longer and cost more.\n');
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await harness.cleanup();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
harness.clearEvents();
|
||||
});
|
||||
|
||||
describe('Valid Output', () => {
|
||||
it(
|
||||
'validates done status output',
|
||||
async () => {
|
||||
// Spawn agent with minimal done prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify completion
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
// No retry events should have been emitted
|
||||
const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
|
||||
expect(resumeEvents.length).toBe(0);
|
||||
|
||||
console.log(' Status: idle (valid done output)');
|
||||
console.log(' Result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates questions status output',
|
||||
async () => {
|
||||
// Spawn agent with questions prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.questions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify questions were validated
|
||||
expect(questions).toBeTruthy();
|
||||
expect(questions?.questions).toBeInstanceOf(Array);
|
||||
expect(questions?.questions.length).toBeGreaterThan(0);
|
||||
|
||||
// Each question should have id and question fields
|
||||
for (const q of questions?.questions ?? []) {
|
||||
expect(q.id).toBeTruthy();
|
||||
expect(q.question).toBeTruthy();
|
||||
}
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('waiting_for_input');
|
||||
|
||||
// No retry events
|
||||
const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
|
||||
expect(resumeEvents.length).toBe(0);
|
||||
|
||||
console.log(' Status: waiting_for_input (valid questions output)');
|
||||
console.log(' Questions:', questions?.questions.length);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates multiple questions',
|
||||
async () => {
|
||||
// Spawn agent with multiple questions prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.multipleQuestions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify multiple questions
|
||||
expect(questions?.questions.length).toBeGreaterThanOrEqual(2);
|
||||
|
||||
// Each question should have unique ID
|
||||
const ids = questions?.questions.map((q) => q.id) ?? [];
|
||||
const uniqueIds = new Set(ids);
|
||||
expect(uniqueIds.size).toBe(ids.length);
|
||||
|
||||
console.log(' Questions:', questions?.questions.map((q) => q.id).join(', '));
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Retry Logic', () => {
|
||||
it(
|
||||
'retries when output does not match schema',
|
||||
async () => {
|
||||
// Prompt that produces non-JSON first, then valid JSON
|
||||
// Note: Claude may or may not produce invalid output first
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.badThenGood,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion (may involve retries)
|
||||
const result = await harness.waitForAgentCompletion(agent.id, EXTENDED_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
|
||||
// Either succeeded with retry OR succeeded first time
|
||||
expect(['idle', 'crashed']).toContain(dbAgent?.status);
|
||||
|
||||
// Check for retry events
|
||||
const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
|
||||
console.log(' Retry attempts:', resumeEvents.length);
|
||||
console.log(' Final status:', dbAgent?.status);
|
||||
|
||||
if (dbAgent?.status === 'idle') {
|
||||
expect(result?.success).toBe(true);
|
||||
console.log(' Result:', result?.message);
|
||||
} else {
|
||||
// Crashed after max retries
|
||||
const crashedEvents = harness.getEventsByType<AgentCrashedEvent>('agent:crashed');
|
||||
expect(crashedEvents.length).toBeGreaterThan(0);
|
||||
console.log(' Crashed after retries');
|
||||
}
|
||||
},
|
||||
EXTENDED_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'extracts JSON from markdown code blocks',
|
||||
async () => {
|
||||
// Prompt that produces JSON wrapped in markdown
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `Output the result wrapped in a markdown code block like this:
|
||||
\`\`\`json
|
||||
{"status":"done","result":"extracted from markdown"}
|
||||
\`\`\``,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' Status:', dbAgent?.status);
|
||||
console.log(' Result:', result?.message);
|
||||
|
||||
// Should succeed (JSON extraction from code block)
|
||||
if (dbAgent?.status === 'idle') {
|
||||
expect(result?.success).toBe(true);
|
||||
}
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'extracts JSON from text with surrounding content',
|
||||
async () => {
|
||||
// Prompt that produces JSON with text before it
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `First say "Here is my response:" then output the JSON:
|
||||
{"status":"done","result":"extracted from text"}`,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' Status:', dbAgent?.status);
|
||||
console.log(' Result:', result?.message);
|
||||
|
||||
// Should succeed (JSON extraction from last {...} block)
|
||||
if (dbAgent?.status === 'idle') {
|
||||
expect(result?.success).toBe(true);
|
||||
}
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Mode-Specific Schemas', () => {
|
||||
it(
|
||||
'validates discuss mode output',
|
||||
async () => {
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.discussComplete,
|
||||
mode: 'discuss',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
console.log(' Discuss mode result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates plan mode output',
|
||||
async () => {
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.planComplete,
|
||||
mode: 'plan',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
console.log(' Plan mode result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates detail mode output',
|
||||
async () => {
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.detailComplete,
|
||||
mode: 'detail',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
console.log(' Detail mode result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user