// Codewalkers/apps/server/test/integration/real-providers/conversation.test.ts

/**
 * Real Claude Inter-Agent Conversation Integration Tests
 *
 * IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
 * They are SKIPPED by default and should only be run manually for validation.
 *
 * To run:
 * ```bash
 * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/conversation.test.ts --test-timeout=300000
 * ```
 *
 * Architecture:
 * - Mock conversation server (only cw listen/ask/answer endpoints, no full CoordinationServer)
 * - In-memory ConversationRepository (no SQLite, no FK constraints)
 * - Real agent harness for spawning two Claude sessions with actual coding tasks
 * - Two sequential questions prove the listen→answer→re-listen cycle works
 *
 * Estimated cost: ~$0.30 per full run (two Claude sessions)
 */

import { it, expect, beforeAll, afterAll } from 'vitest';
import { createServer } from 'node:http';
import type { Server } from 'node:http';
import { readFileSync, existsSync } from 'node:fs';
import { join } from 'node:path';
import { nanoid } from 'nanoid';
import { fetchRequestHandler } from '@trpc/server/adapters/fetch';
import { router, publicProcedure } from '../../../trpc/trpc.js';
import { conversationProcedures } from '../../../trpc/routers/conversation.js';
import { EventEmitterBus } from '../../../events/bus.js';
import type { ConversationRepository, CreateConversationData } from '../../../db/repositories/conversation-repository.js';
import type { Conversation } from '../../../db/schema.js';
import {
  createRealProviderHarness,
  describeRealClaude,
  sleep,
  type RealProviderHarness,
} from './harness.js';

// Per-test budget in milliseconds (5 minutes): the agents do real coding plus a conversation.
const TEST_TIMEOUT = 5 * 60 * 1000;

// ---------------------------------------------------------------------------
// In-memory ConversationRepository — no SQLite, no FK constraints
// ---------------------------------------------------------------------------

/**
 * Map-backed implementation of ConversationRepository for tests.
 *
 * Conversations live only in process memory, keyed by their generated id.
 * Stored records are never mutated in place: answering a conversation swaps
 * the map entry for an updated copy.
 */
class InMemoryConversationRepository implements ConversationRepository {
  private byId = new Map<string, Conversation>();

  /** Store a new conversation in the 'pending' state with no answer and return it. */
  async create(data: CreateConversationData): Promise<Conversation> {
    const timestamp = new Date();
    const id = nanoid();
    const record: Conversation = {
      id,
      fromAgentId: data.fromAgentId,
      toAgentId: data.toAgentId,
      // Optional scoping ids are normalised to null when absent.
      initiativeId: data.initiativeId ?? null,
      phaseId: data.phaseId ?? null,
      taskId: data.taskId ?? null,
      question: data.question,
      answer: null,
      status: 'pending',
      createdAt: timestamp,
      updatedAt: timestamp,
    };
    this.byId.set(id, record);
    return record;
  }

  /** Look up a conversation by id; resolves to null when the id is unknown. */
  async findById(id: string): Promise<Conversation | null> {
    const hit = this.byId.get(id);
    return hit ?? null;
  }

  /** Pending conversations addressed to `toAgentId`, ordered by ascending creation time. */
  async findPendingForAgent(toAgentId: string): Promise<Conversation[]> {
    const pending: Conversation[] = [];
    for (const candidate of this.byId.values()) {
      if (candidate.toAgentId === toAgentId && candidate.status === 'pending') {
        pending.push(candidate);
      }
    }
    pending.sort((earlier, later) => earlier.createdAt.getTime() - later.createdAt.getTime());
    return pending;
  }

  /**
   * Record an answer: stores a copy marked 'answered' with a fresh updatedAt.
   * Resolves to null when the id is unknown.
   */
  async answer(id: string, answer: string): Promise<Conversation | null> {
    const existing = this.byId.get(id);
    if (existing === undefined) {
      return null;
    }
    const answered: Conversation = {
      ...existing,
      answer,
      status: 'answered',
      updatedAt: new Date(),
    };
    this.byId.set(id, answered);
    return answered;
  }

  /** Test helper — snapshot array of every stored conversation */
  getAll(): Conversation[] {
    return Array.from(this.byId.values());
  }
}

// ---------------------------------------------------------------------------
// Mock conversation server — serves ONLY conversation tRPC procedures
// ---------------------------------------------------------------------------

/**
 * Start a minimal HTTP server that serves only the conversation tRPC
 * procedures, backed by a fresh InMemoryConversationRepository.
 *
 * Each Node request under `/trpc` is converted to a fetch `Request`, passed to
 * the tRPC fetch adapter, and the fetch `Response` is relayed back. Any other
 * path gets a 404.
 *
 * The server binds to 127.0.0.1 on an OS-assigned free port (port 0), so
 * concurrent runs cannot collide on a port number.
 *
 * @returns the listening server, the port it is bound to, and the repository
 *          so tests can inspect recorded conversations.
 * @throws if the server fails to start listening or does not report a TCP address.
 */
async function startMockConversationServer(): Promise<{
  server: Server;
  port: number;
  repo: InMemoryConversationRepository;
}> {
  const repo = new InMemoryConversationRepository();
  const eventBus = new EventEmitterBus();

  // Mini router with only conversation procedures
  const miniRouter = router({
    ...conversationProcedures(publicProcedure),
  });

  const httpServer = createServer(async (req, res) => {
    if (!req.url?.startsWith('/trpc')) {
      res.writeHead(404);
      res.end('Not found');
      return;
    }

    // Everything below can throw or reject (bad Host header, aborted upload,
    // adapter failure). Without this guard the response would never be ended
    // and the rejection would go unhandled.
    try {
      const host = req.headers.host ?? 'localhost';
      const url = new URL(req.url, `http://${host}`);

      let body: string | undefined;
      if (req.method !== 'GET' && req.method !== 'HEAD') {
        body = await new Promise<string>((resolve, reject) => {
          // Collect raw chunks and decode once: decoding chunk-by-chunk would
          // corrupt multi-byte UTF-8 characters split across chunk boundaries.
          const chunks: Buffer[] = [];
          req.on('data', (chunk: Buffer) => {
            chunks.push(chunk);
          });
          req.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')));
          req.on('error', reject);
        });
      }

      // Copy Node's header map (string | string[] values) into fetch Headers.
      const headers = new Headers();
      for (const [key, value] of Object.entries(req.headers)) {
        if (value) {
          if (Array.isArray(value)) {
            value.forEach((v) => headers.append(key, v));
          } else {
            headers.set(key, value);
          }
        }
      }

      const fetchRequest = new Request(url.toString(), {
        method: req.method,
        headers,
        body,
      });

      const fetchResponse = await fetchRequestHandler({
        endpoint: '/trpc',
        req: fetchRequest,
        router: miniRouter,
        createContext: () =>
          ({
            eventBus,
            serverStartedAt: new Date(),
            processCount: 0,
            conversationRepository: repo,
            // Stub — requireAgentManager is called unconditionally in createConversation,
            // but list() is only invoked for taskId/phaseId resolution. With --agent-id
            // targeting, list() is never called.
            agentManager: { list: async () => [] },
          }) as any,
      });

      res.statusCode = fetchResponse.status;
      fetchResponse.headers.forEach((value, key) => {
        res.setHeader(key, value);
      });

      if (fetchResponse.body) {
        // Relay the fetch body stream to the Node response chunk by chunk.
        const reader = fetchResponse.body.getReader();
        for (;;) {
          const { done, value } = await reader.read();
          if (done) break;
          res.write(value);
        }
        res.end();
      } else {
        res.end(await fetchResponse.text());
      }
    } catch (err) {
      console.error('  [mock-server] request failed:', err);
      if (res.headersSent) {
        // Part of the response is already on the wire; all we can do is terminate it.
        res.end();
      } else {
        // Drop any headers copied from a partial tRPC response before reporting the failure.
        for (const name of res.getHeaderNames()) {
          res.removeHeader(name);
        }
        res.writeHead(500);
        res.end('Internal server error');
      }
    }
  });

  // Port 0 asks the OS for a free port. Listen failures reject instead of
  // leaving this promise pending forever.
  await new Promise<void>((resolve, reject) => {
    httpServer.once('error', reject);
    httpServer.listen(0, '127.0.0.1', () => {
      httpServer.off('error', reject);
      resolve();
    });
  });

  const address = httpServer.address();
  if (address === null || typeof address === 'string') {
    httpServer.close();
    throw new Error('Mock conversation server did not bind to a TCP port');
  }

  return { server: httpServer, port: address.port, repo };
}

// ---------------------------------------------------------------------------
// Diagnostic helpers
// ---------------------------------------------------------------------------

/**
 * Print a truncated diagnostic view of one agent's log files to the console.
 *
 * Looks in `<workspaceRoot>/.cw/agent-logs/<agentName>` and prints:
 * - for the last 30 lines of `output.jsonl`: assistant text and tool_use
 *   blocks, `result` events, and (for lines that fail to parse or process)
 *   the raw line — each truncated;
 * - the first 500 characters of `stderr.log`, when it is non-empty.
 *
 * @param workspaceRoot root directory containing the `.cw` folder
 * @param agentName     agent name, used as the log sub-directory name
 */
function dumpAgentLogs(workspaceRoot: string, agentName: string) {
  const agentLogDir = join(workspaceRoot, '.cw', 'agent-logs', agentName);
  if (!existsSync(agentLogDir)) {
    console.log(`  [${agentName}] No log directory at ${agentLogDir}`);
    return;
  }

  // Tail of the JSONL event stream
  const jsonlFile = join(agentLogDir, 'output.jsonl');
  if (existsSync(jsonlFile)) {
    const allLines = readFileSync(jsonlFile, 'utf-8').trim().split('\n');
    const tail = allLines.slice(-30);
    console.log(`  [${agentName}] output.jsonl (last ${tail.length}/${allLines.length} lines):`);
    tail.forEach((rawLine) => {
      try {
        const event = JSON.parse(rawLine);
        if (event.type === 'result') {
          console.log(`    RESULT: ${JSON.stringify(event).substring(0, 300)}`);
          return;
        }
        if (event.type !== 'assistant' || !event.message?.content) {
          return;
        }
        for (const part of event.message.content) {
          switch (part.type) {
            case 'text':
              console.log(`    TEXT: ${part.text.substring(0, 200)}`);
              break;
            case 'tool_use':
              console.log(`    TOOL: ${part.name} ${JSON.stringify(part.input).substring(0, 150)}`);
              break;
          }
        }
      } catch {
        // Unparseable or unexpectedly shaped line: show it verbatim (truncated).
        console.log(`    RAW: ${rawLine.substring(0, 200)}`);
      }
    });
  }

  // Captured stderr, if any
  const stderrFile = join(agentLogDir, 'stderr.log');
  if (!existsSync(stderrFile)) {
    return;
  }
  const stderrText = readFileSync(stderrFile, 'utf-8').trim();
  if (stderrText) {
    console.log(`  [${agentName}] stderr: ${stderrText.substring(0, 500)}`);
  }
}

// ---------------------------------------------------------------------------
// Test suite
// ---------------------------------------------------------------------------

/**
 * End-to-end suite: two real Claude agents talk to each other through the mock
 * conversation server while each performs its own coding task.
 *
 * Setup points CW_PORT at the mock server so the agents' `cw` commands reach
 * it; teardown restores the previous CW_PORT value, cleans up the harness and
 * closes the server.
 */
describeRealClaude('Real Inter-Agent Conversation (mock server)', () => {
  let harness: RealProviderHarness;
  let mockServer: Server;
  let mockPort: number;
  let mockRepo: InMemoryConversationRepository;
  // Captured once so afterAll can put the environment back exactly as it was.
  const originalCwPort = process.env.CW_PORT;

  beforeAll(async () => {
    console.log('\n=== Real Inter-Agent Conversation Test ===');
    console.log('Mock conversation server + two Claude sessions.\n');

    // Start mock conversation server (only listen/ask/answer endpoints)
    const mock = await startMockConversationServer();
    mockServer = mock.server;
    mockPort = mock.port;
    mockRepo = mock.repo;
    console.log(`  Mock server on port ${mockPort}`);

    // Set CW_PORT so agents' cw commands hit the mock server
    process.env.CW_PORT = String(mockPort);

    // Real agent harness for spawning + worktrees (no full CoordinationServer)
    harness = await createRealProviderHarness({ provider: 'claude' });
    console.log(`  Workspace: ${harness.workspaceRoot}`);
  });

  afterAll(async () => {
    // Compare against undefined (not truthiness) so an originally empty
    // CW_PORT is restored rather than deleted.
    if (originalCwPort !== undefined) {
      process.env.CW_PORT = originalCwPort;
    } else {
      delete process.env.CW_PORT;
    }
    try {
      await harness?.cleanup();
    } finally {
      // Close the server even when harness cleanup rejects.
      mockServer?.close();
    }
  });

  it(
    'two agents with real tasks communicate via cw ask/listen/answer (two questions prove re-listen)',
    async () => {
      // The polling deadline below is measured from here, because vitest's
      // per-test timeout (TEST_TIMEOUT) also starts when the test starts.
      const testStartedAt = Date.now();
      // Stop polling this long before vitest would abort the test, so the
      // assertions and diagnostics below get a chance to run on a timeout.
      const POLL_SAFETY_MARGIN_MS = 10000;

      const agentSuffix = nanoid(6); // unique suffix for temp files

      // ---------------------------------------------------------------
      // Agent A — builds a validator module WHILE answering questions
      // in the background via cw listen
      // ---------------------------------------------------------------
      const agentA = await harness.agentManager.spawn({
        taskId: null,
        prompt: `You are Agent A in a multi-agent coordination test.

You have TWO concurrent responsibilities:
1. Build a TypeScript validator module (your main coding task)
2. Answer questions from other agents via a background listener

SETUP (do this first):
- Read .cw/input/manifest.json to get your agentId
- Start a background listener that writes to a temp file:
    cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
    LISTEN_PID=$!

MAIN CODING TASK — implement a user registration validator:

1. Create types.ts:
   export interface RegistrationInput { name: string; email: string; password: string; }
   export interface ValidationResult { valid: boolean; errors: string[]; }

2. Create validator.ts:
   Import from types.ts. Export function validateRegistration(input: RegistrationInput): ValidationResult
   Rules: name min 2 chars, email must have exactly one @ and domain with a dot and no spaces and max 254 chars, password min 8 chars.

3. Create index.ts that re-exports everything from types.ts and validator.ts.

BETWEEN EACH FILE, check for incoming questions:
   if [ -s /tmp/cw-listen-${agentSuffix}.txt ]; then
     # parse the JSON, get conversationId and question
     # answer: cw answer "<answer based on your code>" --conversation-id <id>
     # clear and restart listener:
     > /tmp/cw-listen-${agentSuffix}.txt
     cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
     LISTEN_PID=$!
   fi

You will receive TWO questions total while you work. Answer them based on the code you are writing.

CLEANUP: After all 3 files are written and both questions answered:
- kill $LISTEN_PID 2>/dev/null
- Write .cw/output/signal.json: {"status":"done","result":"validator module complete, answered 2 questions"}

CRITICAL:
- The listener MUST run in the background while you write code.
- Check for questions between files, not as blocking waits.
- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });

      console.log(`  Agent A: ${agentA.id} (${agentA.name})`);

      // Give Agent A time to start its background listener and begin coding
      await sleep(15000);

      // ---------------------------------------------------------------
      // Agent B — builds a client module, asks Agent A questions to
      // learn the validation rules, then uses answers in its code
      // ---------------------------------------------------------------
      const agentB = await harness.agentManager.spawn({
        taskId: null,
        prompt: `You are Agent B in a multi-agent coordination test.

Read .cw/input/manifest.json to get your agentId. Agent A (ID: ${agentA.id}) is building a validator module.

YOUR CODING TASK — build a registration API client that includes client-side validation matching Agent A's server-side rules:

1. Create client-scaffold.ts with a basic RegistrationClient class that has a register(name, email, password) method that returns Promise<{ok: boolean}>.
   Leave a TODO comment where validation will go.

2. NOW ask Agent A what the validation rules are — you need this to write proper client-side checks:
   FIELDS=$(cw ask "What are the required fields and their types for registration?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)

3. Ask Agent A about the specific email validation rules:
   EMAIL_RULES=$(cw ask "What are the exact email validation rules you implemented?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)

4. Create validated-client.ts — a COMPLETE implementation using the answers:
   Import the scaffold, add a validateBeforeSubmit(name, email, password) function
   that implements the EXACT validation rules Agent A told you about.
   Include a comment at the top with the rules you received.

5. Write .cw/output/signal.json: {"status":"done","result":"client module complete with validation from agent A"}

CRITICAL:
- Create client-scaffold.ts BEFORE asking questions (you have independent work to do first).
- Use the ACTUAL answers from Agent A in your validated-client.ts implementation.
- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });

      console.log(`  Agent B: ${agentB.id} (${agentB.name})`);

      // ---------------------------------------------------------------
      // Wait for both agents to stop running, then verify conversations
      // ---------------------------------------------------------------
      const deadline = testStartedAt + TEST_TIMEOUT - POLL_SAFETY_MARGIN_MS;
      let aDone = false;
      let bDone = false;
      let lastLogTime = 0;

      while (Date.now() < deadline && (!aDone || !bDone)) {
        const agentAInfo = await harness.agentRepository.findById(agentA.id);
        const agentBInfo = await harness.agentRepository.findById(agentB.id);

        // Periodic progress logging every 30s (elapsed is measured from test start)
        if (Date.now() - lastLogTime > 30000) {
          const elapsed = Math.round((Date.now() - testStartedAt) / 1000);
          console.log(`  [${elapsed}s] A=${agentAInfo?.status ?? '?'} B=${agentBInfo?.status ?? '?'} convs=${mockRepo.getAll().length}`);
          lastLogTime = Date.now();
        }

        // Any status other than 'running' is treated as "finished"; logs are dumped once.
        if (agentAInfo && agentAInfo.status !== 'running' && !aDone) {
          aDone = true;
          console.log(`  Agent A final status: ${agentAInfo.status}`);
          dumpAgentLogs(harness.workspaceRoot, agentA.name);
        }
        if (agentBInfo && agentBInfo.status !== 'running' && !bDone) {
          bDone = true;
          console.log(`  Agent B final status: ${agentBInfo.status}`);
          dumpAgentLogs(harness.workspaceRoot, agentB.name);
        }

        if (!aDone || !bDone) await sleep(2000);
      }

      // On timeout, dump whatever the unfinished agents logged so the failure is diagnosable.
      if (!aDone) {
        console.log('  Agent A did not finish before the deadline');
        dumpAgentLogs(harness.workspaceRoot, agentA.name);
      }
      if (!bDone) {
        console.log('  Agent B did not finish before the deadline');
        dumpAgentLogs(harness.workspaceRoot, agentB.name);
      }

      expect(aDone).toBe(true);
      expect(bDone).toBe(true);

      // ---------------------------------------------------------------
      // Verify conversations in mock repo
      // ---------------------------------------------------------------
      const allConversations = mockRepo.getAll();
      console.log(`  Total conversations: ${allConversations.length}`);
      for (const c of allConversations) {
        console.log(
          `    ${c.id}: ${c.status} — Q: "${c.question}" A: "${c.answer?.substring(0, 80)}..."`,
        );
      }

      // Exactly 2 conversations, both answered
      expect(allConversations.length).toBe(2);
      expect(allConversations.every((c) => c.status === 'answered')).toBe(true);

      // Both target Agent A, both from Agent B
      expect(allConversations.every((c) => c.toAgentId === agentA.id)).toBe(true);
      expect(allConversations.every((c) => c.fromAgentId === agentB.id)).toBe(true);

      // Questions should be distinct (one about fields, one about email validation)
      const questions = allConversations.map((c) => c.question);
      expect(questions.some((q) => q.toLowerCase().includes('field'))).toBe(true);
      expect(questions.some((q) => q.toLowerCase().includes('email'))).toBe(true);

      // Both answers should be non-empty
      expect(allConversations.every((c) => c.answer && c.answer.length > 0)).toBe(true);

      // ---------------------------------------------------------------
      // Verify Agent A's coding output — validator module files exist
      // ---------------------------------------------------------------
      const aWorkdir = join(
        harness.workspaceRoot,
        'agent-workdirs',
        agentA.name,
        'workspace',
      );
      const aFiles = ['types.ts', 'validator.ts', 'index.ts'];
      for (const f of aFiles) {
        const filePath = join(aWorkdir, f);
        const exists = existsSync(filePath);
        console.log(`  Agent A file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
        expect(exists).toBe(true);
      }
      // validator.ts should contain actual validation logic
      const validatorContent = readFileSync(join(aWorkdir, 'validator.ts'), 'utf-8');
      console.log(`  Agent A validator.ts (${validatorContent.length} chars): ${validatorContent.substring(0, 120)}...`);
      expect(validatorContent.toLowerCase()).toContain('email');
      expect(validatorContent.toLowerCase()).toContain('password');

      // ---------------------------------------------------------------
      // Verify Agent B's coding output — client module files exist
      // ---------------------------------------------------------------
      const bWorkdir = join(
        harness.workspaceRoot,
        'agent-workdirs',
        agentB.name,
        'workspace',
      );
      const bFiles = ['client-scaffold.ts', 'validated-client.ts'];
      for (const f of bFiles) {
        const filePath = join(bWorkdir, f);
        const exists = existsSync(filePath);
        console.log(`  Agent B file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
        expect(exists).toBe(true);
      }
      // validated-client.ts should reference validation rules from Agent A's answers
      const clientContent = readFileSync(join(bWorkdir, 'validated-client.ts'), 'utf-8');
      console.log(`  Agent B validated-client.ts (${clientContent.length} chars): ${clientContent.substring(0, 120)}...`);
      expect(clientContent.toLowerCase()).toContain('email');

      // ---------------------------------------------------------------
      // Verify interleaving: Agent A's JSONL log has coding tool calls
      // (Write for .ts files) interleaved with conversation tool calls
      // (Bash for cw listen/answer)
      // ---------------------------------------------------------------
      const aLogPath = join(harness.workspaceRoot, '.cw', 'agent-logs', agentA.name, 'output.jsonl');
      const aLog = readFileSync(aLogPath, 'utf-8').trim().split('\n');
      const toolCalls: { type: 'code' | 'conversation'; name: string; detail: string }[] = [];

      for (const line of aLog) {
        try {
          const ev = JSON.parse(line);
          if (ev.type !== 'assistant' || !ev.message?.content) continue;
          for (const block of ev.message.content) {
            if (block.type !== 'tool_use') continue;
            // Tool input may be an object or a string; normalise to text for substring matching.
            const input = typeof block.input === 'string' ? block.input : JSON.stringify(block.input);
            if (block.name === 'Write' && input.includes('.ts')) {
              toolCalls.push({ type: 'code', name: 'Write', detail: input.substring(0, 80) });
            } else if (block.name === 'Bash' && (input.includes('cw listen') || input.includes('cw answer'))) {
              toolCalls.push({ type: 'conversation', name: 'Bash', detail: input.substring(0, 80) });
            }
          }
        } catch { /* skip non-JSON lines */ }
      }

      console.log(`  Agent A interleaving (${toolCalls.length} relevant tool calls):`);
      for (const tc of toolCalls) {
        console.log(`    [${tc.type}] ${tc.name}: ${tc.detail}`);
      }

      // Must have both code and conversation tool calls
      const hasCode = toolCalls.some((tc) => tc.type === 'code');
      const hasConversation = toolCalls.some((tc) => tc.type === 'conversation');
      expect(hasCode).toBe(true);
      expect(hasConversation).toBe(true);

      // Verify interleaving: at least one code call must appear AFTER a conversation call
      // (proving coding continued after handling a question)
      // NOTE(review): the prompt tells Agent A to start `cw listen` before any coding, and
      // `cw listen` counts as a conversation call here, so this can pass without a question
      // having been answered mid-coding — consider anchoring on the first `cw answer` instead.
      const firstConvIdx = toolCalls.findIndex((tc) => tc.type === 'conversation');
      // lastIndexOf yields -1 when there is no code call, so the comparison below cannot
      // pass by accident.
      const lastCodeIdx = toolCalls.map((tc) => tc.type).lastIndexOf('code');
      console.log(`  First conversation at index ${firstConvIdx}, last code at index ${lastCodeIdx}`);
      expect(lastCodeIdx).toBeGreaterThan(firstConvIdx);
    },
    TEST_TIMEOUT,
  );
});