/**
 * Real Claude Inter-Agent Conversation Integration Tests
 *
 * IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
 * They are SKIPPED by default and should only be run manually for validation.
 *
 * To run:
 * ```bash
 * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/conversation.test.ts --test-timeout=300000
 * ```
 *
 * Architecture:
 * - Mock conversation server (only cw listen/ask/answer endpoints, no full CoordinationServer)
 * - In-memory ConversationRepository (no SQLite, no FK constraints)
 * - Real agent harness for spawning two Claude sessions with actual coding tasks
 * - Two sequential questions prove the listen→answer→re-listen cycle works
 *
 * Estimated cost: ~$0.30 per full run (two Claude sessions)
 */

import { it, expect, beforeAll, afterAll } from 'vitest';
import { createServer } from 'node:http';
import type { Server } from 'node:http';
import { readFileSync, existsSync } from 'node:fs';
import { join } from 'node:path';
import { nanoid } from 'nanoid';
import { fetchRequestHandler } from '@trpc/server/adapters/fetch';
import { router, publicProcedure } from '../../../trpc/trpc.js';
import { conversationProcedures } from '../../../trpc/routers/conversation.js';
import { EventEmitterBus } from '../../../events/bus.js';
import type { ConversationRepository, CreateConversationData } from '../../../db/repositories/conversation-repository.js';
import type { Conversation } from '../../../db/schema.js';
import {
  createRealProviderHarness,
  describeRealClaude,
  sleep,
  type RealProviderHarness,
} from './harness.js';

const TEST_TIMEOUT = 300000; // 5 minutes — agents do real coding + conversation

// Stop polling this long before the vitest timeout fires, so the assertions
// after the polling loop get a chance to run and report which agent was stuck.
const POLL_DEADLINE_MARGIN_MS = 10000;

// ---------------------------------------------------------------------------
// In-memory ConversationRepository — no SQLite, no FK constraints
// ---------------------------------------------------------------------------

/**
 * Map-backed ConversationRepository used by the mock server.
 *
 * Conversations are keyed by a generated nanoid. Updates replace the stored
 * object rather than mutating it.
 */
class InMemoryConversationRepository implements ConversationRepository {
  private readonly store = new Map<string, Conversation>();

  /** Stores a new conversation in the `pending` state and returns it. */
  async create(data: CreateConversationData): Promise<Conversation> {
    const now = new Date();
    const conversation: Conversation = {
      id: nanoid(),
      fromAgentId: data.fromAgentId,
      toAgentId: data.toAgentId,
      initiativeId: data.initiativeId ?? null,
      phaseId: data.phaseId ?? null,
      taskId: data.taskId ?? null,
      question: data.question,
      answer: null,
      status: 'pending',
      createdAt: now,
      updatedAt: now,
    };
    this.store.set(conversation.id, conversation);
    return conversation;
  }

  /** Returns the conversation with the given id, or `null` when unknown. */
  async findById(id: string): Promise<Conversation | null> {
    return this.store.get(id) ?? null;
  }

  /** Returns pending conversations addressed to `toAgentId`, oldest first. */
  async findPendingForAgent(toAgentId: string): Promise<Conversation[]> {
    return [...this.store.values()]
      .filter((c) => c.toAgentId === toAgentId && c.status === 'pending')
      .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime());
  }

  /**
   * Records an answer and marks the conversation `answered`.
   *
   * @returns the updated conversation, or `null` when the id is unknown.
   */
  async answer(id: string, answer: string): Promise<Conversation | null> {
    const conv = this.store.get(id);
    if (!conv) return null;
    const updated: Conversation = {
      ...conv,
      answer,
      status: 'answered' as const,
      updatedAt: new Date(),
    };
    this.store.set(id, updated);
    return updated;
  }

  /**
   * Counts conversations per asking agent.
   *
   * Agents with no conversations are omitted from the result.
   */
  async countByFromAgentIds(agentIds: string[]): Promise<{ agentId: string; count: number }[]> {
    if (agentIds.length === 0) return [];
    const wanted = new Set(agentIds);
    const counts = new Map<string, number>();
    for (const conv of this.store.values()) {
      if (wanted.has(conv.fromAgentId)) {
        counts.set(conv.fromAgentId, (counts.get(conv.fromAgentId) ?? 0) + 1);
      }
    }
    return [...counts.entries()].map(([agentId, count]) => ({ agentId, count }));
  }

  /** Returns up to 200 conversations asked by `agentId`, oldest first. */
  async findByFromAgentId(agentId: string): Promise<Conversation[]> {
    return [...this.store.values()]
      .filter((c) => c.fromAgentId === agentId)
      .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime())
      .slice(0, 200);
  }

  /** Test helper — return all conversations */
  getAll(): Conversation[] {
    return [...this.store.values()];
  }
}

// ---------------------------------------------------------------------------
// Mock conversation server — serves ONLY conversation tRPC procedures
// ---------------------------------------------------------------------------

/**
 * Starts an HTTP server on 127.0.0.1 that bridges Node requests under `/trpc`
 * to the tRPC fetch adapter, exposing only the conversation procedures.
 *
 * The server binds to port 0 so the OS assigns a free port; the assigned port
 * is returned to the caller.
 *
 * @returns the server, the port it is listening on, and the backing repository.
 * @throws if the server fails to start listening.
 */
async function startMockConversationServer(): Promise<{
  server: Server;
  port: number;
  repo: InMemoryConversationRepository;
}> {
  const repo = new InMemoryConversationRepository();
  const eventBus = new EventEmitterBus();

  // Mini router with only conversation procedures
  const miniRouter = router({
    ...conversationProcedures(publicProcedure),
  });

  const httpServer = createServer(async (req, res) => {
    try {
      if (!req.url?.startsWith('/trpc')) {
        res.writeHead(404);
        res.end('Not found');
        return;
      }

      const host = req.headers.host ?? 'localhost';
      const url = new URL(req.url, `http://${host}`);

      let body: string | undefined;
      if (req.method !== 'GET' && req.method !== 'HEAD') {
        body = await new Promise<string>((resolve, reject) => {
          // Collect raw buffers and decode once: decoding chunk-by-chunk can
          // split a multi-byte UTF-8 sequence across two chunks.
          const chunks: Buffer[] = [];
          req.on('data', (chunk: Buffer) => {
            chunks.push(chunk);
          });
          req.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')));
          req.on('error', reject);
        });
      }

      const headers = new Headers();
      for (const [key, value] of Object.entries(req.headers)) {
        if (value) {
          if (Array.isArray(value)) {
            value.forEach((v) => headers.append(key, v));
          } else {
            headers.set(key, value);
          }
        }
      }

      const fetchRequest = new Request(url.toString(), {
        method: req.method,
        headers,
        body,
      });

      const fetchResponse = await fetchRequestHandler({
        endpoint: '/trpc',
        req: fetchRequest,
        router: miniRouter,
        createContext: () =>
          ({
            eventBus,
            serverStartedAt: new Date(),
            processCount: 0,
            conversationRepository: repo,
            // Stub — requireAgentManager is called unconditionally in createConversation,
            // but list() is only invoked for taskId/phaseId resolution. With --agent-id
            // targeting, list() is never called.
            agentManager: { list: async () => [] },
          }) as any,
      });

      res.statusCode = fetchResponse.status;
      fetchResponse.headers.forEach((value, key) => {
        res.setHeader(key, value);
      });

      if (fetchResponse.body) {
        // Stream the fetch-style response body back through the Node response.
        const reader = fetchResponse.body.getReader();
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;
          res.write(value);
        }
        res.end();
      } else {
        res.end(await fetchResponse.text());
      }
    } catch (err: unknown) {
      // Without this guard a failure would become an unhandled rejection and
      // the HTTP response would never be completed.
      console.error(' Mock conversation server request failed:', err);
      if (!res.headersSent) {
        res.writeHead(500);
      }
      res.end();
    }
  });

  const port = await new Promise<number>((resolve, reject) => {
    httpServer.once('error', reject);
    // Port 0 lets the OS pick a free port instead of guessing a random one.
    httpServer.listen(0, '127.0.0.1', () => {
      httpServer.off('error', reject);
      const address = httpServer.address();
      if (address !== null && typeof address === 'object') {
        resolve(address.port);
      } else {
        reject(new Error('Mock conversation server did not bind to a TCP port'));
      }
    });
  });

  return { server: httpServer, port, repo };
}

// ---------------------------------------------------------------------------
// Diagnostic helpers
// ---------------------------------------------------------------------------

/**
 * Prints a summary of an agent's logs to the console for post-mortem debugging.
 *
 * Reads `<workspaceRoot>/.cw/agent-logs/<agentName>/output.jsonl` (last 30
 * lines, summarised per event type) and `stderr.log` (first 500 characters).
 * Missing files are skipped; lines that fail to parse are printed raw.
 */
function dumpAgentLogs(workspaceRoot: string, agentName: string): void {
  const logDir = join(workspaceRoot, '.cw', 'agent-logs', agentName);
  if (!existsSync(logDir)) {
    console.log(` [${agentName}] No log directory at ${logDir}`);
    return;
  }

  // Dump output.jsonl (last 30 lines)
  const outputPath = join(logDir, 'output.jsonl');
  if (existsSync(outputPath)) {
    const lines = readFileSync(outputPath, 'utf-8').trim().split('\n');
    const last = lines.slice(-30);
    console.log(` [${agentName}] output.jsonl (last ${last.length}/${lines.length} lines):`);
    for (const line of last) {
      try {
        const ev = JSON.parse(line);
        if (ev.type === 'assistant' && ev.message?.content) {
          for (const block of ev.message.content) {
            if (block.type === 'text') {
              console.log(` TEXT: ${block.text.substring(0, 200)}`);
            } else if (block.type === 'tool_use') {
              console.log(` TOOL: ${block.name} ${JSON.stringify(block.input).substring(0, 150)}`);
            }
          }
        } else if (ev.type === 'result') {
          console.log(` RESULT: ${JSON.stringify(ev).substring(0, 300)}`);
        }
      } catch {
        // Not JSON, or not the expected event shape — show the raw line instead.
        console.log(` RAW: ${line.substring(0, 200)}`);
      }
    }
  }

  // Dump stderr
  const stderrPath = join(logDir, 'stderr.log');
  if (existsSync(stderrPath)) {
    const stderr = readFileSync(stderrPath, 'utf-8').trim();
    if (stderr) {
      console.log(` [${agentName}] stderr: ${stderr.substring(0, 500)}`);
    }
  }
}

// ---------------------------------------------------------------------------
// Test suite
// ---------------------------------------------------------------------------

describeRealClaude('Real Inter-Agent Conversation (mock server)', () => {
  let harness: RealProviderHarness;
  let mockServer: Server;
  let mockPort: number;
  let mockRepo: InMemoryConversationRepository;
  const originalCwPort = process.env.CW_PORT;

  beforeAll(async () => {
    console.log('\n=== Real Inter-Agent Conversation Test ===');
    console.log('Mock conversation server + two Claude sessions.\n');

    // Start mock conversation server (only listen/ask/answer endpoints)
    const mock = await startMockConversationServer();
    mockServer = mock.server;
    mockPort = mock.port;
    mockRepo = mock.repo;
    console.log(` Mock server on port ${mockPort}`);

    // Set CW_PORT so agents' cw commands hit the mock server
    process.env.CW_PORT = String(mockPort);

    // Real agent harness for spawning + worktrees (no full CoordinationServer)
    harness = await createRealProviderHarness({ provider: 'claude' });
    console.log(` Workspace: ${harness.workspaceRoot}`);
  });

  afterAll(async () => {
    // Restore exactly what was there before, including an empty string.
    if (originalCwPort !== undefined) {
      process.env.CW_PORT = originalCwPort;
    } else {
      delete process.env.CW_PORT;
    }
    try {
      await harness?.cleanup();
    } finally {
      // Close the server even when harness cleanup throws.
      mockServer?.close();
    }
  });

  it(
    'two agents with real tasks communicate via cw ask/listen/answer (two questions prove re-listen)',
    async () => {
      const testStartedAt = Date.now();
      const agentSuffix = nanoid(6); // unique suffix for temp files

      // ---------------------------------------------------------------
      // Agent A — builds a validator module WHILE answering questions
      // in the background via cw listen
      // ---------------------------------------------------------------
      const agentA = await harness.agentManager.spawn({
        taskId: null,
        prompt: `You are Agent A in a multi-agent coordination test.

You have TWO concurrent responsibilities:
1. Build a TypeScript validator module (your main coding task)
2. Answer questions from other agents via a background listener

SETUP (do this first):
- Read .cw/input/manifest.json to get your agentId
- Start a background listener that writes to a temp file:
    cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
    LISTEN_PID=$!

MAIN CODING TASK — implement a user registration validator:

1. Create types.ts:
   export interface RegistrationInput { name: string; email: string; password: string; }
   export interface ValidationResult { valid: boolean; errors: string[]; }

2. Create validator.ts:
   Import from types.ts. Export function validateRegistration(input: RegistrationInput): ValidationResult
   Rules: name min 2 chars, email must have exactly one @ and domain with a dot and no spaces and max 254 chars, password min 8 chars.

3. Create index.ts that re-exports everything from types.ts and validator.ts.

BETWEEN EACH FILE, check for incoming questions:
  if [ -s /tmp/cw-listen-${agentSuffix}.txt ]; then
    # parse the JSON, get conversationId and question
    # answer: cw answer "<answer>" --conversation-id <conversationId>
    # clear and restart listener:
    > /tmp/cw-listen-${agentSuffix}.txt
    cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
    LISTEN_PID=$!
  fi

You will receive TWO questions total while you work. Answer them based on the code you are writing.

CLEANUP: After all 3 files are written and both questions answered:
- kill $LISTEN_PID 2>/dev/null
- Write .cw/output/signal.json: {"status":"done","result":"validator module complete, answered 2 questions"}

CRITICAL:
- The listener MUST run in the background while you write code.
- Check for questions between files, not as blocking waits.
- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });
      console.log(` Agent A: ${agentA.id} (${agentA.name})`);

      // Give Agent A time to start its background listener and begin coding
      await sleep(15000);

      // ---------------------------------------------------------------
      // Agent B — builds a client module, asks Agent A questions to
      // learn the validation rules, then uses answers in its code
      // ---------------------------------------------------------------
      const agentB = await harness.agentManager.spawn({
        taskId: null,
        prompt: `You are Agent B in a multi-agent coordination test.

Read .cw/input/manifest.json to get your agentId.
Agent A (ID: ${agentA.id}) is building a validator module.

YOUR CODING TASK — build a registration API client that includes client-side validation matching Agent A's server-side rules:

1. Create client-scaffold.ts with a basic RegistrationClient class that has a register(name, email, password) method that returns Promise<{ok: boolean}>.
   Leave a TODO comment where validation will go.

2. NOW ask Agent A what the validation rules are — you need this to write proper client-side checks:
     FIELDS=$(cw ask "What are the required fields and their types for registration?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)

3. Ask Agent A about the specific email validation rules:
     EMAIL_RULES=$(cw ask "What are the exact email validation rules you implemented?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)

4. Create validated-client.ts — a COMPLETE implementation using the answers:
   Import the scaffold, add a validateBeforeSubmit(name, email, password) function that implements the EXACT validation rules Agent A told you about.
   Include a comment at the top with the rules you received.

5. Write .cw/output/signal.json: {"status":"done","result":"client module complete with validation from agent A"}

CRITICAL:
- Create client-scaffold.ts BEFORE asking questions (you have independent work to do first).
- Use the ACTUAL answers from Agent A in your validated-client.ts implementation.
- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });
      console.log(` Agent B: ${agentB.id} (${agentB.name})`);

      // ---------------------------------------------------------------
      // Wait for both agents to stop running, then verify conversations
      // ---------------------------------------------------------------
      const pollStartedAt = Date.now();
      // Anchor the deadline to the start of the test (not to "now") and leave
      // a margin, so the loop ends before the vitest timeout and the
      // aDone/bDone assertions below can actually report a stuck agent.
      const deadline = testStartedAt + TEST_TIMEOUT - POLL_DEADLINE_MARGIN_MS;
      let aDone = false;
      let bDone = false;
      let lastLogTime = 0;

      while (Date.now() < deadline && (!aDone || !bDone)) {
        const agentAInfo = await harness.agentRepository.findById(agentA.id);
        const agentBInfo = await harness.agentRepository.findById(agentB.id);

        // Periodic progress logging every 30s
        if (Date.now() - lastLogTime > 30000) {
          const elapsed = Math.round((Date.now() - pollStartedAt) / 1000);
          console.log(` [${elapsed}s] A=${agentAInfo?.status ?? '?'} B=${agentBInfo?.status ?? '?'} convs=${mockRepo.getAll().length}`);
          lastLogTime = Date.now();
        }

        if (agentAInfo && agentAInfo.status !== 'running' && !aDone) {
          aDone = true;
          console.log(` Agent A final status: ${agentAInfo.status}`);
          dumpAgentLogs(harness.workspaceRoot, agentA.name);
        }
        if (agentBInfo && agentBInfo.status !== 'running' && !bDone) {
          bDone = true;
          console.log(` Agent B final status: ${agentBInfo.status}`);
          dumpAgentLogs(harness.workspaceRoot, agentB.name);
        }

        if (!aDone || !bDone) await sleep(2000);
      }

      expect(aDone).toBe(true);
      expect(bDone).toBe(true);

      // ---------------------------------------------------------------
      // Verify conversations in mock repo
      // ---------------------------------------------------------------
      const allConversations = mockRepo.getAll();
      console.log(` Total conversations: ${allConversations.length}`);
      for (const c of allConversations) {
        console.log(
          ` ${c.id}: ${c.status} — Q: "${c.question}" A: "${c.answer?.substring(0, 80)}..."`,
        );
      }

      // Exactly 2 conversations, both answered
      expect(allConversations.length).toBe(2);
      expect(allConversations.every((c) => c.status === 'answered')).toBe(true);

      // Both target Agent A, both from Agent B
      expect(allConversations.every((c) => c.toAgentId === agentA.id)).toBe(true);
      expect(allConversations.every((c) => c.fromAgentId === agentB.id)).toBe(true);

      // Questions should be distinct (one about fields, one about email validation)
      const questions = allConversations.map((c) => c.question);
      expect(questions.some((q) => q.toLowerCase().includes('field'))).toBe(true);
      expect(questions.some((q) => q.toLowerCase().includes('email'))).toBe(true);

      // Both answers should be non-empty
      expect(allConversations.every((c) => c.answer && c.answer.length > 0)).toBe(true);

      // ---------------------------------------------------------------
      // Verify Agent A's coding output — validator module files exist
      // ---------------------------------------------------------------
      const aWorkdir = join(
        harness.workspaceRoot,
        'agent-workdirs',
        agentA.name,
        'workspace',
      );
      const aFiles = ['types.ts', 'validator.ts', 'index.ts'];
      for (const f of aFiles) {
        const filePath = join(aWorkdir, f);
        const exists = existsSync(filePath);
        console.log(` Agent A file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
        expect(exists).toBe(true);
      }

      // validator.ts should contain actual validation logic
      const validatorContent = readFileSync(join(aWorkdir, 'validator.ts'), 'utf-8');
      console.log(` Agent A validator.ts (${validatorContent.length} chars): ${validatorContent.substring(0, 120)}...`);
      expect(validatorContent.toLowerCase()).toContain('email');
      expect(validatorContent.toLowerCase()).toContain('password');

      // ---------------------------------------------------------------
      // Verify Agent B's coding output — client module files exist
      // ---------------------------------------------------------------
      const bWorkdir = join(
        harness.workspaceRoot,
        'agent-workdirs',
        agentB.name,
        'workspace',
      );
      const bFiles = ['client-scaffold.ts', 'validated-client.ts'];
      for (const f of bFiles) {
        const filePath = join(bWorkdir, f);
        const exists = existsSync(filePath);
        console.log(` Agent B file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
        expect(exists).toBe(true);
      }

      // validated-client.ts should reference validation rules from Agent A's answers
      const clientContent = readFileSync(join(bWorkdir, 'validated-client.ts'), 'utf-8');
      console.log(` Agent B validated-client.ts (${clientContent.length} chars): ${clientContent.substring(0, 120)}...`);
      expect(clientContent.toLowerCase()).toContain('email');

      // ---------------------------------------------------------------
      // Verify interleaving: Agent A's JSONL log has coding tool calls
      // (Write for .ts files) interleaved with conversation tool calls
      // (Bash for cw listen/answer)
      // ---------------------------------------------------------------
      const aLogPath = join(harness.workspaceRoot, '.cw', 'agent-logs', agentA.name, 'output.jsonl');
      const aLog = readFileSync(aLogPath, 'utf-8').trim().split('\n');

      const toolCalls: { type: 'code' | 'conversation'; name: string; detail: string }[] = [];
      for (const line of aLog) {
        try {
          const ev = JSON.parse(line);
          if (ev.type !== 'assistant' || !ev.message?.content) continue;
          for (const block of ev.message.content) {
            if (block.type !== 'tool_use') continue;
            const input = typeof block.input === 'string' ? block.input : JSON.stringify(block.input);
            if (block.name === 'Write' && input.includes('.ts')) {
              toolCalls.push({ type: 'code', name: 'Write', detail: input.substring(0, 80) });
            } else if (block.name === 'Bash' && (input.includes('cw listen') || input.includes('cw answer'))) {
              toolCalls.push({ type: 'conversation', name: 'Bash', detail: input.substring(0, 80) });
            }
          }
        } catch {
          /* skip non-JSON lines */
        }
      }

      console.log(` Agent A interleaving (${toolCalls.length} relevant tool calls):`);
      for (const tc of toolCalls) {
        console.log(` [${tc.type}] ${tc.name}: ${tc.detail}`);
      }

      // Must have both code and conversation tool calls
      const hasCode = toolCalls.some((tc) => tc.type === 'code');
      const hasConversation = toolCalls.some((tc) => tc.type === 'conversation');
      expect(hasCode).toBe(true);
      expect(hasConversation).toBe(true);

      // Verify interleaving: at least one code call must appear AFTER a conversation call
      // (proving coding continued after handling a question)
      const firstConvIdx = toolCalls.findIndex((tc) => tc.type === 'conversation');
      // lastIndexOf yields -1 when there is no code call, so the comparison
      // below cannot pass by accident.
      const lastCodeIdx = toolCalls.map((tc) => tc.type).lastIndexOf('code');
      console.log(` First conversation at index ${firstConvIdx}, last code at index ${lastCodeIdx}`);
      expect(lastCodeIdx).toBeGreaterThan(firstConvIdx);
    },
    TEST_TIMEOUT,
  );
});