// Codewalkers/apps/server/test/integration/real-providers/codex-manager.test.ts

/**
 * Real Codex CLI Manager Integration Tests
 *
 * IMPORTANT: These tests call the REAL Codex CLI and incur API costs!
 * They are SKIPPED by default and should only be run manually for validation.
 *
 * To run these tests:
 * ```bash
 * REAL_CODEX_TESTS=1 npm test -- test/integration/real-providers/codex-manager.test.ts --test-timeout=300000
 * ```
 *
 * Tests covered:
 * - Codex spawn and thread_id (session ID) extraction
 * - Generic (non-schema) output parsing, exercised with the streaming prompt
 * - Provider configuration (agent is created with the codex provider)
 *
 * Estimated cost: ~$0.10 per full run
 *
 * Note: Codex uses different output format and session ID field (thread_id).
 */

import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
  createRealProviderHarness,
  describeRealCodex,
  REAL_TEST_TIMEOUT,
  type RealProviderHarness,
} from './harness.js';
import { CODEX_PROMPTS } from './prompts.js';
import type { AgentSpawnedEvent, AgentOutputEvent } from '../../../events/types.js';

describeRealCodex('Real Codex Manager Integration', () => {
  let harness: RealProviderHarness;

  // Create one harness for the whole suite; every test below reuses it.
  beforeAll(async () => {
    console.log('\n=== Running Real Codex Manager Tests ===');
    console.log('These tests call the real Codex API and incur costs.\n');
    harness = await createRealProviderHarness({ provider: 'codex' });
  });

  // Tear the harness down via its cleanup(). The call is guarded because
  // `harness` is still unassigned if createRealProviderHarness rejected in
  // beforeAll; an unguarded call would then throw a TypeError in addition to
  // the original setup failure.
  afterAll(async () => {
    await harness?.cleanup();
  });

  // Clear the harness's recorded events before each test (the spawn test
  // asserts on an exact 'agent:spawned' event count).
  beforeEach(() => {
    harness.clearEvents();
  });

  describe('Codex Spawn', () => {
    // Spawns a Codex agent, checks the spawn result and the 'agent:spawned'
    // event, waits for completion, then verifies the persisted agent record:
    // final status must be 'idle' or 'crashed', and an 'idle' run must carry a
    // session ID (Codex's thread_id).
    it(
      'spawns codex agent and extracts thread_id',
      async () => {
        // Spawn agent with simple task
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.done,
          mode: 'execute',
          provider: 'codex',
        });

        expect(agent.id).toBeTruthy();
        expect(agent.provider).toBe('codex');
        expect(agent.status).toBe('running');

        // Verify spawned event
        const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
        expect(spawnedEvents.length).toBe(1);
        expect(spawnedEvents[0].payload.provider).toBe('codex');

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Read back the persisted record; session ID (thread_id) is checked below
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log('  Thread ID:', dbAgent?.sessionId);
        console.log('  Status:', dbAgent?.status);
        console.log('  Result:', result?.message);

        // Codex should complete or crash
        expect(['idle', 'crashed']).toContain(dbAgent?.status);

        // If completed successfully, should have extracted thread_id.
        // The guard is on status only: also requiring a truthy sessionId in the
        // condition would make the assertion below impossible to fail.
        if (dbAgent?.status === 'idle') {
          expect(dbAgent.sessionId).toBeTruthy();
        }
      },
      REAL_TEST_TIMEOUT
    );

    // Spawns a Codex agent with the streaming prompt, waits for completion,
    // logs how many 'agent:output' events were recorded plus a preview of the
    // result message, and asserts the persisted status is 'idle' or 'crashed'.
    it(
      'uses generic parser for output',
      async () => {
        // Spawn agent with streaming prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.streaming,
          mode: 'execute',
          provider: 'codex',
        });

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Output events are logged for inspection only; no count is asserted
        const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
        console.log('  Output events:', outputEvents.length);

        // Build a log preview of at most 100 characters. A missing message is
        // reported explicitly instead of as the string "undefined...", and the
        // ellipsis is appended only when the message was actually truncated.
        const message = result?.message;
        let preview = '(none)';
        if (message != null) {
          preview = message.length > 100 ? `${message.substring(0, 100)}...` : message;
        }

        // For generic provider, result should be captured
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log('  Status:', dbAgent?.status);
        console.log('  Result:', preview);

        expect(['idle', 'crashed']).toContain(dbAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });

  describe('Codex Provider Config', () => {
    // Config verification: spawns a Codex agent and asserts the persisted
    // record has provider 'codex'. Completion is awaited on a best-effort
    // basis and the final provider/status are logged, not asserted.
    it(
      'uses correct command and args for codex',
      async () => {
        // This is more of a config verification test
        // The actual command execution is validated by the spawn test

        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: 'Say hello',
          mode: 'execute',
          provider: 'codex',
        });

        // Verify agent was created with codex provider
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.provider).toBe('codex');

        // Wait for completion (or timeout)
        // NOTE(review): the wait uses the same REAL_TEST_TIMEOUT as the test
        // itself, so a wait that only ends by timing out may trip the
        // test-level timeout before the catch below runs — confirm.
        try {
          await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        } catch (error) {
          // Codex might fail if not installed, that's OK for config test.
          // The failure is still logged so it is visible in the run output
          // rather than silently discarded.
          console.log(
            '  Completion wait failed (tolerated):',
            error instanceof Error ? error.message : String(error)
          );
        }

        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log('  Provider:', finalAgent?.provider);
        console.log('  Status:', finalAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });
});

/**
 * Codex-specific observations from testing:
 *
 * 1. Output Format:
 *    - Codex uses JSONL streaming with different event types
 *    - thread.started event contains thread_id
 *    - Output parsing is more generic (not JSON schema validated)
 *
 * 2. Command Structure:
 *    - codex exec --full-auto --json -p "prompt"
 *    - resume: codex exec resume <thread_id>
 *
 * 3. Session ID:
 *    - Called "thread_id" in Codex
 *    - Extracted from thread.started event
 *
 * 4. Resume:
 *    - Uses subcommand style: codex exec resume <thread_id>
 *    - Different from Claude's flag style: claude --resume <session_id>
 */