test: Rewrite conversation integration test with mock server and real tasks

Replace full CoordinationServer with a lightweight mock that serves only conversation tRPC procedures backed by an in-memory repository. Agents now have real coding tasks (write spec, ask questions, create summary) and the two-question flow proves the listen→answer→re-listen cycle works.
2026-02-10 14:37:12 +01:00
parent f8c5dce588
commit 9f5421f6bc
1 changed files with 479 additions and 256 deletions
--- a/src/test/integration/real-providers/conversation.test.ts
+++ b/src/test/integration/real-providers/conversation.test.ts
@@ -9,309 +9,532 @@
 * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/conversation.test.ts --test-timeout=300000
 * ```
 *
- * Tests covered:
- * - Two real Claude sessions communicating via cw ask / cw listen / cw answer
- * - Agent identity (agentId/agentName) in manifest.json
- * - Conversation lifecycle: create → pending → answered
+ * Architecture:
+ * - Mock conversation server (only cw listen/ask/answer endpoints, no full CoordinationServer)
+ * - In-memory ConversationRepository (no SQLite, no FK constraints)
+ * - Real agent harness for spawning two Claude sessions with actual coding tasks
+ * - Two sequential questions prove the listen→answer→re-listen cycle works
 *
- * Estimated cost: ~$0.20 per full run (two Claude sessions)
+ * Estimated cost: ~$0.30 per full run (two Claude sessions)
 */

-import { describe, it, expect, beforeAll, afterAll } from 'vitest';
-import { mkdtemp, rm, readFile } from 'node:fs/promises';
-import { tmpdir } from 'node:os';
+import { it, expect, beforeAll, afterAll } from 'vitest';
+import { createServer } from 'node:http';
+import type { Server } from 'node:http';
+import { readFileSync, existsSync } from 'node:fs';
 import { join } from 'node:path';
-import Database from 'better-sqlite3';
-import { drizzle } from 'drizzle-orm/better-sqlite3';
-import * as schema from '../../../db/schema.js';
-import { ensureSchema } from '../../../db/ensure-schema.js';
+import { nanoid } from 'nanoid';
+import { fetchRequestHandler } from '@trpc/server/adapters/fetch';
+import { router, publicProcedure } from '../../../trpc/trpc.js';
+import { conversationProcedures } from '../../../trpc/routers/conversation.js';
 import { EventEmitterBus } from '../../../events/bus.js';
-import type { DomainEvent } from '../../../events/types.js';
-import { MultiProviderAgentManager } from '../../../agent/manager.js';
+import type { ConversationRepository, CreateConversationData } from '../../../db/repositories/conversation-repository.js';
+import type { Conversation } from '../../../db/schema.js';
 import {
-  DrizzleAgentRepository,
-  DrizzleProjectRepository,
-  DrizzleAccountRepository,
-  DrizzleConversationRepository,
-  DrizzleTaskRepository,
-  DrizzlePhaseRepository,
-  DrizzlePageRepository,
-  DrizzleLogChunkRepository,
-  DrizzleChangeSetRepository,
-} from '../../../db/repositories/drizzle/index.js';
-import { CoordinationServer } from '../../../server/index.js';
-import { ProcessManager, ProcessRegistry } from '../../../process/index.js';
-import { LogManager } from '../../../logging/index.js';
-import { createTrpcClient } from '../../../cli/trpc-client.js';
-import type { TrpcClient } from '../../../cli/trpc-client.js';
-import { describeRealClaude, sleep } from './harness.js';
+  createRealProviderHarness,
+  describeRealClaude,
+  sleep,
+  type RealProviderHarness,
+} from './harness.js';

-const CONVERSATION_TEST_TIMEOUT = 180000; // 3 minutes per test
+const TEST_TIMEOUT = 300000; // 5 minutes — agents do real coding + conversation

-describeRealClaude('Real Inter-Agent Conversation', () => {
-  let workspaceRoot: string;
-  let agentManager: MultiProviderAgentManager;
-  let agentRepository: DrizzleAgentRepository;
-  let conversationRepository: DrizzleConversationRepository;
-  let server: CoordinationServer;
-  let client: TrpcClient;
-  let testPort: number;
-  let eventBus: EventEmitterBus;
+// ---------------------------------------------------------------------------
+// In-memory ConversationRepository — no SQLite, no FK constraints
+// ---------------------------------------------------------------------------
+
+class InMemoryConversationRepository implements ConversationRepository {
+  private store = new Map<string, Conversation>();
+
+  async create(data: CreateConversationData): Promise<Conversation> {
+    const now = new Date();
+    const conversation: Conversation = {
+      id: nanoid(),
+      fromAgentId: data.fromAgentId,
+      toAgentId: data.toAgentId,
+      initiativeId: data.initiativeId ?? null,
+      phaseId: data.phaseId ?? null,
+      taskId: data.taskId ?? null,
+      question: data.question,
+      answer: null,
+      status: 'pending',
+      createdAt: now,
+      updatedAt: now,
+    };
+    this.store.set(conversation.id, conversation);
+    return conversation;
+  }
+
+  async findById(id: string): Promise<Conversation | null> {
+    return this.store.get(id) ?? null;
+  }
+
+  async findPendingForAgent(toAgentId: string): Promise<Conversation[]> {
+    return [...this.store.values()]
+      .filter((c) => c.toAgentId === toAgentId && c.status === 'pending')
+      .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime());
+  }
+
+  async answer(id: string, answer: string): Promise<Conversation | null> {
+    const conv = this.store.get(id);
+    if (!conv) return null;
+    const updated: Conversation = {
+      ...conv,
+      answer,
+      status: 'answered' as const,
+      updatedAt: new Date(),
+    };
+    this.store.set(id, updated);
+    return updated;
+  }
+
+  /** Test helper — return all conversations */
+  getAll(): Conversation[] {
+    return [...this.store.values()];
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Mock conversation server — serves ONLY conversation tRPC procedures
+// ---------------------------------------------------------------------------
+
+async function startMockConversationServer(): Promise<{
+  server: Server;
+  port: number;
+  repo: InMemoryConversationRepository;
+}> {
+  const repo = new InMemoryConversationRepository();
+  const eventBus = new EventEmitterBus();
+
+  // Mini router with only conversation procedures
+  const miniRouter = router({
+    ...conversationProcedures(publicProcedure),
+  });
+
+  const httpServer = createServer(async (req, res) => {
+    if (!req.url?.startsWith('/trpc')) {
+      res.writeHead(404);
+      res.end('Not found');
+      return;
+    }
+
+    const host = req.headers.host ?? 'localhost';
+    const url = new URL(req.url, `http://${host}`);
+
+    let body: string | undefined;
+    if (req.method !== 'GET' && req.method !== 'HEAD') {
+      body = await new Promise<string>((resolve) => {
+        let data = '';
+        req.on('data', (chunk: Buffer) => {
+          data += chunk.toString();
+        });
+        req.on('end', () => resolve(data));
+      });
+    }
+
+    const headers = new Headers();
+    for (const [key, value] of Object.entries(req.headers)) {
+      if (value) {
+        if (Array.isArray(value)) {
+          value.forEach((v) => headers.append(key, v));
+        } else {
+          headers.set(key, value);
+        }
+      }
+    }
+
+    const fetchRequest = new Request(url.toString(), {
+      method: req.method,
+      headers,
+      body: body ?? undefined,
+    });
+
+    const fetchResponse = await fetchRequestHandler({
+      endpoint: '/trpc',
+      req: fetchRequest,
+      router: miniRouter,
+      createContext: () =>
+        ({
+          eventBus,
+          serverStartedAt: new Date(),
+          processCount: 0,
+          conversationRepository: repo,
+          // Stub — requireAgentManager is called unconditionally in createConversation,
+          // but list() is only invoked for taskId/phaseId resolution. With --agent-id
+          // targeting, list() is never called.
+          agentManager: { list: async () => [] },
+        }) as any,
+    });
+
+    res.statusCode = fetchResponse.status;
+    fetchResponse.headers.forEach((value, key) => {
+      res.setHeader(key, value);
+    });
+
+    if (fetchResponse.body) {
+      const reader = fetchResponse.body.getReader();
+      const pump = async () => {
+        while (true) {
+          const { done, value } = await reader.read();
+          if (done) {
+            res.end();
+            return;
+          }
+          res.write(value);
+        }
+      };
+      pump().catch(() => res.end());
+    } else {
+      res.end(await fetchResponse.text());
+    }
+  });
+
+  const port = 40000 + Math.floor(Math.random() * 10000);
+  await new Promise<void>((resolve) => {
+    httpServer.listen(port, '127.0.0.1', () => resolve());
+  });
+
+  return { server: httpServer, port, repo };
+}
+
+// ---------------------------------------------------------------------------
+// Diagnostic helpers
+// ---------------------------------------------------------------------------
+
+function dumpAgentLogs(workspaceRoot: string, agentName: string) {
+  const logDir = join(workspaceRoot, '.cw', 'agent-logs', agentName);
+  if (!existsSync(logDir)) {
+    console.log(`  [${agentName}] No log directory at ${logDir}`);
+    return;
+  }
+  // Dump output.jsonl (last 30 lines)
+  const outputPath = join(logDir, 'output.jsonl');
+  if (existsSync(outputPath)) {
+    const lines = readFileSync(outputPath, 'utf-8').trim().split('\n');
+    const last = lines.slice(-30);
+    console.log(`  [${agentName}] output.jsonl (last ${last.length}/${lines.length} lines):`);
+    for (const line of last) {
+      try {
+        const ev = JSON.parse(line);
+        if (ev.type === 'assistant' && ev.message?.content) {
+          for (const block of ev.message.content) {
+            if (block.type === 'text') {
+              console.log(`    TEXT: ${block.text.substring(0, 200)}`);
+            } else if (block.type === 'tool_use') {
+              console.log(`    TOOL: ${block.name} ${JSON.stringify(block.input).substring(0, 150)}`);
+            }
+          }
+        } else if (ev.type === 'result') {
+          console.log(`    RESULT: ${JSON.stringify(ev).substring(0, 300)}`);
+        }
+      } catch {
+        console.log(`    RAW: ${line.substring(0, 200)}`);
+      }
+    }
+  }
+  // Dump stderr
+  const stderrPath = join(logDir, 'stderr.log');
+  if (existsSync(stderrPath)) {
+    const stderr = readFileSync(stderrPath, 'utf-8').trim();
+    if (stderr) {
+      console.log(`  [${agentName}] stderr: ${stderr.substring(0, 500)}`);
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Test suite
+// ---------------------------------------------------------------------------
+
+describeRealClaude('Real Inter-Agent Conversation (mock server)', () => {
+  let harness: RealProviderHarness;
+  let mockServer: Server;
+  let mockPort: number;
+  let mockRepo: InMemoryConversationRepository;
  const originalCwPort = process.env.CW_PORT;

  beforeAll(async () => {
-    console.log('\n=== Running Real Inter-Agent Conversation Tests ===');
-    console.log('These tests spawn TWO Claude sessions and incur costs.\n');
+    console.log('\n=== Real Inter-Agent Conversation Test ===');
+    console.log('Mock conversation server + two Claude sessions.\n');

-    // Create temp workspace
-    workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-conv-test-'));
-    const { execSync } = await import('node:child_process');
-    execSync('git init && git config user.email "test@test.com" && git config user.name "Test" && touch .gitkeep && git add . && git commit -m "init"', {
-      cwd: workspaceRoot,
-      stdio: 'ignore',
-    });
+    // Start mock conversation server (only listen/ask/answer endpoints)
+    const mock = await startMockConversationServer();
+    mockServer = mock.server;
+    mockPort = mock.port;
+    mockRepo = mock.repo;
+    console.log(`  Mock server on port ${mockPort}`);

-    // Create in-memory DB
-    const sqlite = new Database(':memory:');
-    sqlite.pragma('foreign_keys = ON');
-    const db = drizzle(sqlite, { schema });
-    ensureSchema(db);
+    // Set CW_PORT so agents' cw commands hit the mock server
+    process.env.CW_PORT = String(mockPort);

-    // Create repositories
-    agentRepository = new DrizzleAgentRepository(db);
-    const projectRepository = new DrizzleProjectRepository(db);
-    const accountRepository = new DrizzleAccountRepository(db);
-    conversationRepository = new DrizzleConversationRepository(db);
-    const taskRepository = new DrizzleTaskRepository(db);
-    const phaseRepository = new DrizzlePhaseRepository(db);
-    const pageRepository = new DrizzlePageRepository(db);
-    const logChunkRepository = new DrizzleLogChunkRepository(db);
-    const changeSetRepository = new DrizzleChangeSetRepository(db);
-
-    // Event bus
-    eventBus = new EventEmitterBus();
-
-    // Agent manager
-    agentManager = new MultiProviderAgentManager(
-      agentRepository,
-      workspaceRoot,
-      projectRepository,
-      accountRepository,
-      eventBus,
-      undefined, // no credential manager
-      changeSetRepository,
-      phaseRepository,
-      taskRepository,
-      pageRepository,
-      logChunkRepository,
-    );
-
-    // Start server on random port
-    testPort = 40000 + Math.floor(Math.random() * 10000);
-    const registry = new ProcessRegistry();
-    const processManager = new ProcessManager(registry, eventBus);
-    const logManager = new LogManager();
-
-    server = new CoordinationServer(
-      { port: testPort, pidFile: `/tmp/cw-conv-test-${testPort}.pid` },
-      processManager,
-      logManager,
-      eventBus,
-      {
-        agentManager,
-        taskRepository,
-        projectRepository,
-        accountRepository,
-        conversationRepository,
-        phaseRepository,
-        pageRepository,
-        logChunkRepository,
-        changeSetRepository,
-        workspaceRoot,
-      },
-    );
-
-    await server.start();
-    console.log(`  Server started on port ${testPort}`);
-
-    // Set CW_PORT so spawned agents can reach the server
-    process.env.CW_PORT = String(testPort);
-
-    // Create tRPC client
-    client = createTrpcClient(testPort);
+    // Real agent harness for spawning + worktrees (no full CoordinationServer)
+    harness = await createRealProviderHarness({ provider: 'claude' });
+    console.log(`  Workspace: ${harness.workspaceRoot}`);
  });

  afterAll(async () => {
-    // Restore env
    if (originalCwPort) {
      process.env.CW_PORT = originalCwPort;
    } else {
      delete process.env.CW_PORT;
    }
-
-    // Kill agents
-    const agents = await agentRepository.findAll();
-    for (const a of agents) {
-      if (a.status === 'running') {
-        try { await agentManager.stop(a.id); } catch { /* ignore */ }
-      }
-    }
-
-    // Stop server
-    try { await server.stop(); } catch { /* ignore */ }
-
-    // Clean up
-    try { await rm(workspaceRoot, { recursive: true, force: true }); } catch { /* ignore */ }
+    await harness?.cleanup();
+    mockServer?.close();
  });

  it(
-    'two Claude agents communicate via cw ask/listen/answer',
+    'two agents with real tasks communicate via cw ask/listen/answer (two questions prove re-listen)',
    async () => {
-      // Spawn Agent A — the responder
-      // It starts a listener, waits for a question, answers it, then completes
-      const agentA = await agentManager.spawn({
-        taskId: null,
-        prompt: `You are Agent A in a multi-agent test. Your job:
+      const agentSuffix = nanoid(6); // unique suffix for temp files

-1. Read .cw/input/manifest.json to get your agentId
-2. Start a background listener: cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-output.txt &
+      // ---------------------------------------------------------------
+      // Agent A — builds a validator module WHILE answering questions
+      // in the background via cw listen
+      // ---------------------------------------------------------------
+      const agentA = await harness.agentManager.spawn({
+        taskId: null,
+        prompt: `You are Agent A in a multi-agent coordination test.
+
+You have TWO concurrent responsibilities:
+1. Build a TypeScript validator module (your main coding task)
+2. Answer questions from other agents via a background listener
+
+SETUP (do this first):
+- Read .cw/input/manifest.json to get your agentId
+- Start a background listener that writes to a temp file:
+    cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
    LISTEN_PID=$!
-3. Wait for the listener to find a question by polling the file:
-   while [ ! -s /tmp/cw-listen-output.txt ]; do sleep 1; done
-4. Parse the JSON from /tmp/cw-listen-output.txt to get conversationId
-5. Answer the conversation: cw answer "The answer is 42" --conversation-id <conversationId>
-6. Kill the listener: kill $LISTEN_PID 2>/dev/null
-7. Write .cw/output/signal.json with: {"status":"done"}

-IMPORTANT:
- The server is on port ${testPort}
- Use the exact answer text: "The answer is 42"
- Do NOT ask any questions or produce other output`,
-        mode: 'execute',
-        provider: 'claude',
-      });
+MAIN CODING TASK — implement a user registration validator:

-      console.log(`  Agent A spawned: ${agentA.id} (${agentA.name})`);
+1. Create types.ts:
+   export interface RegistrationInput { name: string; email: string; password: string; }
+   export interface ValidationResult { valid: boolean; errors: string[]; }

-      // Small delay to let Agent A start its listener
-      await sleep(5000);
+2. Create validator.ts:
+   Import from types.ts. Export function validateRegistration(input: RegistrationInput): ValidationResult
+   Rules: name min 2 chars, email must have exactly one @ and domain with a dot and no spaces and max 254 chars, password min 8 chars.

-      // Spawn Agent B — the questioner
-      // It asks Agent A a question, waits for the answer, writes it to signal.json result
-      const agentB = await agentManager.spawn({
-        taskId: null,
-        prompt: `You are Agent B in a multi-agent test. Your job:
+3. Create index.ts that re-exports everything from types.ts and validator.ts.

-1. Read .cw/input/manifest.json to get your agentId
-2. Ask Agent A a question using this command:
-   ANSWER=$(cw ask "What is the meaning of everything?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
-3. Write .cw/output/signal.json with: {"status":"done","result":"<the ANSWER you received>"}
+BETWEEN EACH FILE, check for incoming questions:
+   if [ -s /tmp/cw-listen-${agentSuffix}.txt ]; then
+     # parse the JSON, get conversationId and question
+     # answer: cw answer "<answer based on your code>" --conversation-id <id>
+     # clear and restart listener:
+     > /tmp/cw-listen-${agentSuffix}.txt
+     cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
+     LISTEN_PID=$!
+   fi

-IMPORTANT:
- The server is on port ${testPort}
- Agent A's ID is: ${agentA.id}
- Write the EXACT answer you received from Agent A into the result field
- Do NOT ask any questions or produce other output`,
-        mode: 'execute',
-        provider: 'claude',
-      });
+You will receive TWO questions total while you work. Answer them based on the code you are writing.

-      console.log(`  Agent B spawned: ${agentB.id} (${agentB.name})`);
+CLEANUP: After all 3 files are written and both questions answered:
+- kill $LISTEN_PID 2>/dev/null
+- Write .cw/output/signal.json: {"status":"done","result":"validator module complete, answered 2 questions"}

-      // Wait for both agents to complete
-      const deadline = Date.now() + CONVERSATION_TEST_TIMEOUT;
-
-      let agentADone = false;
-      let agentBDone = false;
-      let agentBResult: string | null = null;
-
-      while (Date.now() < deadline && (!agentADone || !agentBDone)) {
-        const a = await agentRepository.findById(agentA.id);
-        const b = await agentRepository.findById(agentB.id);
-
-        if (a && a.status !== 'running') {
-          agentADone = true;
-          console.log(`  Agent A status: ${a.status}`);
-        }
-        if (b && b.status !== 'running') {
-          agentBDone = true;
-          console.log(`  Agent B status: ${b.status}`);
-          if (b.result) {
-            try {
-              const parsed = JSON.parse(b.result);
-              agentBResult = parsed.message ?? parsed.result ?? b.result;
-            } catch {
-              agentBResult = b.result;
-            }
-          }
-        }
-
-        if (!agentADone || !agentBDone) {
-          await sleep(2000);
-        }
-      }
-
-      // Verify results
-      const finalA = await agentRepository.findById(agentA.id);
-      const finalB = await agentRepository.findById(agentB.id);
-
-      console.log(`  Agent A final status: ${finalA?.status}`);
-      console.log(`  Agent B final status: ${finalB?.status}`);
-      console.log(`  Agent B result: ${finalB?.result}`);
-
-      // Agent A should have completed (idle)
-      expect(finalA?.status).toBe('idle');
-
-      // Agent B should have completed (idle)
-      expect(finalB?.status).toBe('idle');
-
-      // Agent B's result should contain the answer from Agent A
-      expect(finalB?.result).toBeTruthy();
-      expect(finalB!.result).toContain('42');
-
-      // Verify the conversation exists in the database
-      const conversations = await conversationRepository.findPendingForAgent(agentA.id);
-      // Should be empty (all answered)
-      expect(conversations.length).toBe(0);
-    },
-    CONVERSATION_TEST_TIMEOUT,
-  );
-
-  it(
-    'agent manifest.json contains agentId and agentName',
-    async () => {
-      const agent = await agentManager.spawn({
-        taskId: null,
-        prompt: `Read .cw/input/manifest.json and write its contents to .cw/output/signal.json in this format:
-{"status":"done","result":"<raw contents of manifest.json>"}
-Do NOT modify the manifest contents. Just copy them into the result field as a string.`,
+CRITICAL:
+- The listener MUST run in the background while you write code.
+- Check for questions between files, not as blocking waits.
+- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });

-      console.log(`  Identity test agent: ${agent.id} (${agent.name})`);
+      console.log(`  Agent A: ${agentA.id} (${agentA.name})`);

-      // Wait for completion
-      const deadline = Date.now() + CONVERSATION_TEST_TIMEOUT;
-      while (Date.now() < deadline) {
-        const a = await agentRepository.findById(agent.id);
-        if (a && a.status !== 'running') break;
-        await sleep(1000);
+      // Give Agent A time to start its background listener and begin coding
+      await sleep(15000);
+
+      // ---------------------------------------------------------------
+      // Agent B — builds a client module, asks Agent A questions to
+      // learn the validation rules, then uses answers in its code
+      // ---------------------------------------------------------------
+      const agentB = await harness.agentManager.spawn({
+        taskId: null,
+        prompt: `You are Agent B in a multi-agent coordination test.
+
+Read .cw/input/manifest.json to get your agentId. Agent A (ID: ${agentA.id}) is building a validator module.
+
+YOUR CODING TASK — build a registration API client that includes client-side validation matching Agent A's server-side rules:
+
+1. Create client-scaffold.ts with a basic RegistrationClient class that has a register(name, email, password) method that returns Promise<{ok: boolean}>.
+   Leave a TODO comment where validation will go.
+
+2. NOW ask Agent A what the validation rules are — you need this to write proper client-side checks:
+   FIELDS=$(cw ask "What are the required fields and their types for registration?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
+
+3. Ask Agent A about the specific email validation rules:
+   EMAIL_RULES=$(cw ask "What are the exact email validation rules you implemented?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
+
+4. Create validated-client.ts — a COMPLETE implementation using the answers:
+   Import the scaffold, add a validateBeforeSubmit(name, email, password) function
+   that implements the EXACT validation rules Agent A told you about.
+   Include a comment at the top with the rules you received.
+
+5. Write .cw/output/signal.json: {"status":"done","result":"client module complete with validation from agent A"}
+
+CRITICAL:
+- Create client-scaffold.ts BEFORE asking questions (you have independent work to do first).
+- Use the ACTUAL answers from Agent A in your validated-client.ts implementation.
+- The CW_PORT environment variable is already set to ${mockPort}.`,
+        mode: 'execute',
+        provider: 'claude',
+        inputContext: {},
+      });
+
+      console.log(`  Agent B: ${agentB.id} (${agentB.name})`);
+
+      // ---------------------------------------------------------------
+      // Wait for both agents to stop running, then verify conversations
+      // ---------------------------------------------------------------
+      const deadline = Date.now() + TEST_TIMEOUT;
+      let aDone = false;
+      let bDone = false;
+      let lastLogTime = 0;
+
+      while (Date.now() < deadline && (!aDone || !bDone)) {
+        const agentAInfo = await harness.agentRepository.findById(agentA.id);
+        const agentBInfo = await harness.agentRepository.findById(agentB.id);
+
+        // Periodic progress logging every 30s
+        if (Date.now() - lastLogTime > 30000) {
+          const elapsed = Math.round((Date.now() - (deadline - TEST_TIMEOUT)) / 1000);
+          console.log(`  [${elapsed}s] A=${agentAInfo?.status ?? '?'} B=${agentBInfo?.status ?? '?'} convs=${mockRepo.getAll().length}`);
+          lastLogTime = Date.now();
        }

-      const final = await agentRepository.findById(agent.id);
-      console.log(`  Agent status: ${final?.status}`);
-      console.log(`  Agent result: ${final?.result}`);
+        if (agentAInfo && agentAInfo.status !== 'running' && !aDone) {
+          aDone = true;
+          console.log(`  Agent A final status: ${agentAInfo.status}`);
+          dumpAgentLogs(harness.workspaceRoot, agentA.name);
+        }
+        if (agentBInfo && agentBInfo.status !== 'running' && !bDone) {
+          bDone = true;
+          console.log(`  Agent B final status: ${agentBInfo.status}`);
+          dumpAgentLogs(harness.workspaceRoot, agentB.name);
+        }

-      expect(final?.status).toBe('idle');
-      expect(final?.result).toBeTruthy();
+        if (!aDone || !bDone) await sleep(2000);
+      }

-      // The result should contain the manifest contents which include agentId and agentName
-      const result = final!.result!;
-      expect(result).toContain(agent.id);
-      expect(result).toContain(agent.name);
+      expect(aDone).toBe(true);
+      expect(bDone).toBe(true);
+
+      // ---------------------------------------------------------------
+      // Verify conversations in mock repo
+      // ---------------------------------------------------------------
+      const allConversations = mockRepo.getAll();
+      console.log(`  Total conversations: ${allConversations.length}`);
+      for (const c of allConversations) {
+        console.log(
+          `    ${c.id}: ${c.status} — Q: "${c.question}" A: "${c.answer?.substring(0, 80)}..."`,
+        );
+      }
+
+      // Exactly 2 conversations, both answered
+      expect(allConversations.length).toBe(2);
+      expect(allConversations.every((c) => c.status === 'answered')).toBe(true);
+
+      // Both target Agent A, both from Agent B
+      expect(allConversations.every((c) => c.toAgentId === agentA.id)).toBe(true);
+      expect(allConversations.every((c) => c.fromAgentId === agentB.id)).toBe(true);
+
+      // Questions should be distinct (one about fields, one about email validation)
+      const questions = allConversations.map((c) => c.question);
+      expect(questions.some((q) => q.toLowerCase().includes('field'))).toBe(true);
+      expect(questions.some((q) => q.toLowerCase().includes('email'))).toBe(true);
+
+      // Both answers should be non-empty
+      expect(allConversations.every((c) => c.answer && c.answer.length > 0)).toBe(true);
+
+      // ---------------------------------------------------------------
+      // Verify Agent A's coding output — validator module files exist
+      // ---------------------------------------------------------------
+      const aWorkdir = join(
+        harness.workspaceRoot,
+        'agent-workdirs',
+        agentA.name,
+        'workspace',
+      );
+      const aFiles = ['types.ts', 'validator.ts', 'index.ts'];
+      for (const f of aFiles) {
+        const filePath = join(aWorkdir, f);
+        const exists = existsSync(filePath);
+        console.log(`  Agent A file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
+        expect(exists).toBe(true);
+      }
+      // validator.ts should contain actual validation logic
+      const validatorContent = readFileSync(join(aWorkdir, 'validator.ts'), 'utf-8');
+      console.log(`  Agent A validator.ts (${validatorContent.length} chars): ${validatorContent.substring(0, 120)}...`);
+      expect(validatorContent.toLowerCase()).toContain('email');
+      expect(validatorContent.toLowerCase()).toContain('password');
+
+      // ---------------------------------------------------------------
+      // Verify Agent B's coding output — client module files exist
+      // ---------------------------------------------------------------
+      const bWorkdir = join(
+        harness.workspaceRoot,
+        'agent-workdirs',
+        agentB.name,
+        'workspace',
+      );
+      const bFiles = ['client-scaffold.ts', 'validated-client.ts'];
+      for (const f of bFiles) {
+        const filePath = join(bWorkdir, f);
+        const exists = existsSync(filePath);
+        console.log(`  Agent B file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
+        expect(exists).toBe(true);
+      }
+      // validated-client.ts should reference validation rules from Agent A's answers
+      const clientContent = readFileSync(join(bWorkdir, 'validated-client.ts'), 'utf-8');
+      console.log(`  Agent B validated-client.ts (${clientContent.length} chars): ${clientContent.substring(0, 120)}...`);
+      expect(clientContent.toLowerCase()).toContain('email');
+
+      // ---------------------------------------------------------------
+      // Verify interleaving: Agent A's JSONL log has coding tool calls
+      // (Write for .ts files) interleaved with conversation tool calls
+      // (Bash for cw listen/answer)
+      // ---------------------------------------------------------------
+      const aLogPath = join(harness.workspaceRoot, '.cw', 'agent-logs', agentA.name, 'output.jsonl');
+      const aLog = readFileSync(aLogPath, 'utf-8').trim().split('\n');
+      const toolCalls: { type: 'code' | 'conversation'; name: string; detail: string }[] = [];
+
+      for (const line of aLog) {
+        try {
+          const ev = JSON.parse(line);
+          if (ev.type !== 'assistant' || !ev.message?.content) continue;
+          for (const block of ev.message.content) {
+            if (block.type !== 'tool_use') continue;
+            const input = typeof block.input === 'string' ? block.input : JSON.stringify(block.input);
+            if (block.name === 'Write' && input.includes('.ts')) {
+              toolCalls.push({ type: 'code', name: 'Write', detail: input.substring(0, 80) });
+            } else if (block.name === 'Bash' && (input.includes('cw listen') || input.includes('cw answer'))) {
+              toolCalls.push({ type: 'conversation', name: 'Bash', detail: input.substring(0, 80) });
+            }
+          }
+        } catch { /* skip non-JSON lines */ }
+      }
+
+      console.log(`  Agent A interleaving (${toolCalls.length} relevant tool calls):`);
+      for (const tc of toolCalls) {
+        console.log(`    [${tc.type}] ${tc.name}: ${tc.detail}`);
+      }
+
+      // Must have both code and conversation tool calls
+      const hasCode = toolCalls.some((tc) => tc.type === 'code');
+      const hasConversation = toolCalls.some((tc) => tc.type === 'conversation');
+      expect(hasCode).toBe(true);
+      expect(hasConversation).toBe(true);
+
+      // Verify interleaving: at least one code call must appear AFTER a conversation call
+      // (proving coding continued after handling a question)
+      const firstConvIdx = toolCalls.findIndex((tc) => tc.type === 'conversation');
+      const lastCodeIdx = toolCalls.length - 1 - [...toolCalls].reverse().findIndex((tc) => tc.type === 'code');
+      console.log(`  First conversation at index ${firstConvIdx}, last code at index ${lastCodeIdx}`);
+      expect(lastCodeIdx).toBeGreaterThan(firstConvIdx);
    },
-    CONVERSATION_TEST_TIMEOUT,
+    TEST_TIMEOUT,
  );
 });