Codewalkers/apps/server/test/integration/real-claude.test.ts

/**
 * Real Claude CLI Integration Tests
 *
 * IMPORTANT: These tests call the real Claude CLI and incur API costs.
 * They are SKIPPED by default and should only be run manually for validation.
 *
 * To run these tests:
 * ```bash
 * REAL_CLAUDE_TESTS=1 npm test -- test/integration/real-claude.test.ts --test-timeout=120000
 * ```
 *
 * Purpose:
 * - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
 * - Confirm MockAgentManager accurately simulates real CLI behavior
 * - Document actual response structure and costs
 *
 * Updated (2026-02-06): Now uses the universal agentSignalSchema instead of
 * per-mode schemas. Agents output trivial signals (done/questions/error) and
 * write files instead of producing mode-specific JSON.
 *
 * Total validation cost: ~$0.10 (3 tests)
 */

import { describe, it, expect, beforeAll } from 'vitest';
import { execa } from 'execa';
import {
  agentSignalJsonSchema,
  agentSignalSchema,
} from '../../agent/schema.js';

/**
 * Result structure from Claude CLI with --output-format json
 *
 * This is the shape that the CLI's stdout is parsed into by this file.
 * NOTE(review): presumably a subset of what the CLI actually emits — confirm
 * against the CLI documentation before relying on it elsewhere.
 *
 * When --json-schema is used:
 * - result: "" (empty string)
 * - structured_output: { ... } (the validated JSON object)
 */
interface ClaudeCliResult {
  type: 'result';
  /** Outcome tag of the run; the "done" test expects 'success'. */
  subtype: 'success' | 'error' | 'error_max_turns';
  is_error: boolean;
  /** Session identifier; logged after each call. */
  session_id: string;
  /**
   * Text result. Expected to be "" when --json-schema is used; parsed as JSON
   * as a fallback when structured_output is absent.
   */
  result: string;
  /**
   * Structured payload produced under --json-schema. Typed as unknown, so the
   * tests validate it with agentSignalSchema before reading any field.
   */
  structured_output?: unknown;
  /** Cost of the call, logged as a dollar amount; may be missing. */
  total_cost_usd?: number;
}

/**
 * Helper to call Claude CLI directly with a prompt and JSON schema.
 *
 * Runs `claude -p <prompt> --output-format json --json-schema <schema>` via
 * execa, parses stdout as the CLI result envelope, logs duration, cost,
 * session ID and response shape, and returns the envelope together with the
 * structured payload.
 *
 * The structured payload is `structured_output` when the CLI provides it;
 * otherwise the `result` string is parsed as JSON as a fallback.
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output (serialized with JSON.stringify)
 * @param timeoutMs - Timeout in milliseconds handed to execa (default 90s)
 * @returns Parsed CLI result with structured_output
 * @throws Error when stdout is not valid JSON, or when there is no
 *   structured_output and `result` is not valid JSON. The message includes the
 *   CLI's subtype/is_error and the raw text so the real failure is visible.
 *   Any rejection from execa itself propagates unchanged.
 */
async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
  timeoutMs = 90000
): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startTime = Date.now();

  const { stdout } = await execa(
    'claude',
    [
      '-p',
      prompt,
      '--output-format',
      'json',
      '--json-schema',
      JSON.stringify(jsonSchema),
    ],
    {
      timeout: timeoutMs,
    }
  );

  const duration = Date.now() - startTime;

  // Parse the envelope explicitly so that non-JSON stdout (e.g. a plain-text
  // error banner) is reported with its content instead of a bare SyntaxError.
  let cliResult: ClaudeCliResult;
  try {
    cliResult = JSON.parse(stdout) as ClaudeCliResult;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `Claude CLI stdout is not valid JSON (${reason}). First 500 chars: ${stdout.slice(0, 500)}`
    );
  }

  console.log(`\n  Duration: ${(duration / 1000).toFixed(1)}s`);
  console.log(`  Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(`  Session ID: ${cliResult.session_id}`);
  console.log(`  Result field empty: ${cliResult.result === ''}`);
  console.log(`  Has structured_output: ${cliResult.structured_output !== undefined}`);

  // When --json-schema is used, structured output is in structured_output field
  // The result field is typically empty when using --json-schema
  if (cliResult.structured_output != null) {
    return { cliResult, structuredOutput: cliResult.structured_output };
  }

  // Fallback: no structured_output, so try the text result. It may be empty or
  // plain text (for instance an error message), which JSON.parse rejects;
  // surface the envelope details rather than "Unexpected end of JSON input".
  let structuredOutput: unknown;
  try {
    structuredOutput = JSON.parse(cliResult.result);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      'Claude CLI returned no structured_output and its result field is not valid JSON ' +
        `(subtype=${cliResult.subtype}, is_error=${cliResult.is_error}): ${reason}. ` +
        `Raw result: ${JSON.stringify(cliResult.result)}`
    );
  }

  return { cliResult, structuredOutput };
}

/**
 * Opt-in gate for the real-CLI suite.
 * True only when the REAL_CLAUDE_TESTS environment variable is exactly "1".
 */
const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';

/**
 * Suite registrar: `describe.skip` unless the gate above is open, so the
 * expensive tests only run when requested manually.
 */
const describeReal = !shouldRunRealTests ? describe.skip : describe;

// Per-test timeout in milliseconds (2 minutes) - real API calls take 5-30 seconds
const TEST_TIMEOUT = 2 * 60 * 1000;

describeReal('Real Claude CLI Integration', () => {
  /** Prints the structured payload so a manual run documents the real response. */
  const logOutput = (payload: unknown): void => {
    console.log('  Output:', JSON.stringify(payload, null, 2));
  };

  // One-time banner reminding whoever runs the suite that it costs money.
  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
  });

  describe('Universal Signal Schema', () => {
    it(
      'should return done status',
      async () => {
        const donePrompt = `Complete this simple task: Say "Hello, World!" as a test.

Output your response in the required JSON format with status "done".`;

        const response = await callClaudeCli(donePrompt, agentSignalJsonSchema);
        logOutput(response.structuredOutput);

        // Envelope checks: successful run, empty text result, payload present
        const envelope = response.cliResult;
        expect(envelope.subtype).toBe('success');
        expect(envelope.result).toBe(''); // Empty when using --json-schema
        expect(envelope.structured_output).toBeDefined();

        // The payload must satisfy the Zod schema and carry the requested status
        const signal = agentSignalSchema.parse(response.structuredOutput);
        expect(signal.status).toBe('done');
      },
      TEST_TIMEOUT
    );

    it(
      'should return questions status with array',
      async () => {
        const vagueTaskPrompt = `You are working on a vague task: "Make it better"

You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.

Output your response with status "questions" and include at least 2 questions with unique IDs.`;

        const response = await callClaudeCli(vagueTaskPrompt, agentSignalJsonSchema);
        logOutput(response.structuredOutput);

        // The payload must satisfy the Zod schema and carry the requested status
        const signal = agentSignalSchema.parse(response.structuredOutput);
        expect(signal.status).toBe('questions');

        // Guard exists for type narrowing; the expect above already fails otherwise
        if (signal.status !== 'questions') {
          return;
        }

        expect(Array.isArray(signal.questions)).toBe(true);
        expect(signal.questions.length).toBeGreaterThanOrEqual(1);

        const firstQuestion = signal.questions[0];
        expect(firstQuestion.id).toBeTruthy();
        expect(firstQuestion.question).toBeTruthy();
      },
      TEST_TIMEOUT
    );

    it(
      'should return error status',
      async () => {
        const errorPrompt = `You have encountered an unrecoverable error. Output your response with status "error" and a descriptive error message.`;

        const response = await callClaudeCli(errorPrompt, agentSignalJsonSchema);
        logOutput(response.structuredOutput);

        // The payload must satisfy the Zod schema and carry the requested status
        const signal = agentSignalSchema.parse(response.structuredOutput);
        expect(signal.status).toBe('error');

        // Guard exists for type narrowing; the expect above already fails otherwise
        if (signal.status !== 'error') {
          return;
        }

        expect(signal.error).toBeTruthy();
      },
      TEST_TIMEOUT
    );
  });
});