/**
 * Real Codex CLI Manager Integration Tests
 *
 * IMPORTANT: These tests call the REAL Codex CLI and incur API costs!
 * They are SKIPPED by default and should only be run manually for validation.
 *
 * To run these tests:
 * ```bash
 * REAL_CODEX_TESTS=1 npm test -- src/test/integration/real-providers/codex-manager.test.ts --test-timeout=300000
 * ```
 *
 * Tests covered:
 * - Codex spawn and thread_id extraction
 * - Generic output parsing (non-schema)
 * - Streaming output
 *
 * Estimated cost: ~$0.10 per full run
 *
 * Note: Codex uses different output format and session ID field (thread_id).
 */

import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
  createRealProviderHarness,
  describeRealCodex,
  REAL_TEST_TIMEOUT,
  type RealProviderHarness,
} from './harness.js';
import { CODEX_PROMPTS } from './prompts.js';
import type { AgentSpawnedEvent, AgentOutputEvent } from '../../../events/types.js';

describeRealCodex('Real Codex Manager Integration', () => {
  // Shared harness: created once for the suite, events reset before each test.
  let harness: RealProviderHarness;

  beforeAll(async () => {
    console.log('\n=== Running Real Codex Manager Tests ===');
    console.log('These tests call the real Codex API and incur costs.\n');

    harness = await createRealProviderHarness({ provider: 'codex' });
  });

  afterAll(async () => {
    await harness.cleanup();
  });

  beforeEach(() => {
    // Each test asserts on event counts, so start from an empty event log.
    harness.clearEvents();
  });

  describe('Codex Spawn', () => {
    it(
      'spawns codex agent and extracts thread_id',
      async () => {
        // Spawn agent with simple task
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.done,
          mode: 'execute',
          provider: 'codex',
        });

        expect(agent.id).toBeTruthy();
        expect(agent.provider).toBe('codex');
        expect(agent.status).toBe('running');

        // Verify spawned event
        const spawnedEvents = harness.getEventsByType('agent:spawned');
        expect(spawnedEvents.length).toBe(1);
        expect(spawnedEvents[0].payload.provider).toBe('codex');

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Verify session ID (thread_id) was extracted
        const dbAgent = await harness.agentRepository.findById(agent.id);

        console.log(' Thread ID:', dbAgent?.sessionId);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message);

        // Codex should complete or crash
        expect(['idle', 'crashed']).toContain(dbAgent?.status);

        // If completed successfully, the thread_id must have been extracted.
        // The guard intentionally checks only the status: also requiring a
        // truthy sessionId here would make the assertion below impossible to
        // fail, hiding a broken thread_id extraction.
        if (dbAgent?.status === 'idle') {
          expect(dbAgent.sessionId).toBeTruthy();
        }
      },
      REAL_TEST_TIMEOUT
    );

    it(
      'uses generic parser for output',
      async () => {
        // Spawn agent with streaming prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.streaming,
          mode: 'execute',
          provider: 'codex',
        });

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Output events are logged for manual inspection; their count is not asserted.
        const outputEvents = harness.getEventsByType('agent:output');
        console.log(' Output events:', outputEvents.length);

        // For generic provider, result should be captured
        const dbAgent = await harness.agentRepository.findById(agent.id);

        // Fall back to a placeholder so a missing result/message is not
        // printed as the misleading string "undefined...".
        const preview = result?.message?.substring(0, 100) ?? '(no message)';

        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', `${preview}...`);

        expect(['idle', 'crashed']).toContain(dbAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });

  describe('Codex Provider Config', () => {
    it(
      'uses correct command and args for codex',
      async () => {
        // This is more of a config verification test
        // The actual command execution is validated by the spawn test
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: 'Say hello',
          mode: 'execute',
          provider: 'codex',
        });

        // Verify agent was created with codex provider
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.provider).toBe('codex');

        // Wait for completion (or timeout)
        try {
          await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        } catch (error: unknown) {
          // Codex might fail if not installed, that's OK for config test.
          // Still surface the reason so a manual run shows why the wait failed.
          console.log(
            ' Completion wait failed (tolerated):',
            error instanceof Error ? error.message : error
          );
        }

        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Provider:', finalAgent?.provider);
        console.log(' Status:', finalAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });
});

/**
 * Codex-specific observations from testing:
 *
 * 1. Output Format:
 *    - Codex uses JSONL streaming with different event types
 *    - thread.started event contains thread_id
 *    - Output parsing is more generic (not JSON schema validated)
 *
 * 2. Command Structure:
 *    - codex exec --full-auto --json -p "prompt"
 *    - resume: codex exec resume
 *
 * 3. Session ID:
 *    - Called "thread_id" in Codex
 *    - Extracted from thread.started event
 *
 * 4. Resume:
 *    - Uses subcommand style: codex exec resume
 *    - Different from Claude's flag style: claude --resume
 */