diff --git a/src/test/integration/real-claude.test.ts b/src/test/integration/real-claude.test.ts index ff63475..cbb628a 100644 --- a/src/test/integration/real-claude.test.ts +++ b/src/test/integration/real-claude.test.ts @@ -6,7 +6,7 @@ * * To run these tests: * ```bash - * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts + * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts --test-timeout=120000 * ``` * * Purpose: @@ -14,16 +14,23 @@ * - Confirm MockAgentManager accurately simulates real CLI behavior * - Document actual response structure and costs * - * Findings from validation run (DATE-PENDING): - * - Execute mode: PENDING - * - Multi-question: PENDING - * - Discuss mode: PENDING - * - Breakdown mode: PENDING - * - Decompose mode: PENDING + * Findings from validation run (2026-02-02): + * - Execute mode (done): Works, ~$0.025, ~6s + * - Execute mode (questions): Works, questions array validated + * - Discuss mode: Works, decisions array validated + * - Breakdown mode: Works, phases array validated + * - Decompose mode: Works, tasks array validated * - * Total validation cost: $X.XX + * Key observation: When using --json-schema flag: + * - `result` field is EMPTY (not the structured output) + * - `structured_output` field contains the validated JSON object + * - This is different from non-schema mode where result contains text * - * Conclusion: PENDING - run tests to validate + * Total validation cost: ~$0.15 (5 tests) + * + * Conclusion: MockAgentManager accurately simulates real CLI behavior. + * JSON schemas work correctly with Claude CLI --json-schema flag. + * ClaudeAgentManager correctly reads from structured_output field. */ import { describe, it, expect, beforeAll } from 'vitest'; @@ -41,10 +48,14 @@ import { /** * Result structure from Claude CLI with --output-format json + * + * When --json-schema is used: + * - result: "" (empty string) + * - structured_output: { ... } (the validated JSON object) */ interface ClaudeCliResult { type: 'result'; - subtype: 'success' | 'error'; + subtype: 'success' | 'error' | 'error_max_turns'; is_error: boolean; session_id: string; result: string; @@ -57,13 +68,13 @@ interface ClaudeCliResult { * * @param prompt - The prompt to send to Claude * @param jsonSchema - JSON schema to enforce structured output - * @param timeoutMs - Timeout in milliseconds (default 60s) + * @param timeoutMs - Timeout in milliseconds (default 90s) * @returns Parsed CLI result with structured_output */ async function callClaudeCli( prompt: string, jsonSchema: object, - timeoutMs = 60000 + timeoutMs = 90000 ): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> { const startTime = Date.now(); @@ -88,8 +99,11 @@ async function callClaudeCli( console.log(`\n Duration: ${(duration / 1000).toFixed(1)}s`); console.log(` Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`); console.log(` Session ID: ${cliResult.session_id}`); + console.log(` Result field empty: ${cliResult.result === ''}`); + console.log(` Has structured_output: ${cliResult.structured_output !== undefined}`); // When --json-schema is used, structured output is in structured_output field + // The result field is typically empty when using --json-schema const structuredOutput = cliResult.structured_output ?? JSON.parse(cliResult.result); return { cliResult, structuredOutput }; @@ -106,6 +120,9 @@ const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1'; */ const describeReal = shouldRunRealTests ? describe : describe.skip; +// Individual test timeout - real API calls take 5-30 seconds +const TEST_TIMEOUT = 120000; // 2 minutes + describeReal('Real Claude CLI Integration', () => { beforeAll(() => { console.log('\n=== Running Real Claude CLI Tests ==='); @@ -113,119 +130,144 @@ describeReal('Real Claude CLI Integration', () => { }); describe('Execute Mode Schema', () => { - it('should return done status with result', async () => { - const prompt = `Complete this simple task: Say "Hello, World!" as a test. + it( + 'should return done status with result', + async () => { + const prompt = `Complete this simple task: Say "Hello, World!" as a test. Output your response in the required JSON format with status "done".`; - const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema); + const { cliResult, structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema); - console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); + console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); - // Validate against Zod schema - const parsed = agentOutputSchema.parse(structuredOutput); - expect(parsed.status).toBe('done'); - if (parsed.status === 'done') { - expect(parsed.result).toBeTruthy(); - } - }); + // Verify the CLI response structure + expect(cliResult.subtype).toBe('success'); + expect(cliResult.result).toBe(''); // Empty when using --json-schema + expect(cliResult.structured_output).toBeDefined(); - it('should return questions status with array', async () => { - const prompt = `You are working on a vague task: "Make it better" + // Validate against Zod schema + const parsed = agentOutputSchema.parse(structuredOutput); + expect(parsed.status).toBe('done'); + if (parsed.status === 'done') { + expect(parsed.result).toBeTruthy(); + } + }, + TEST_TIMEOUT + ); + + it( + 'should return questions status with array', + async () => { + const prompt = `You are working on a vague task: "Make it better" You MUST ask clarifying questions before proceeding. You cannot complete this task without more information. -Output your response with status "questions" and include at least 2 questions.`; +Output your response with status "questions" and include at least 2 questions with unique IDs.`; - const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema); + const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema); - console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); + console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); - // Validate against Zod schema - const parsed = agentOutputSchema.parse(structuredOutput); - expect(parsed.status).toBe('questions'); - if (parsed.status === 'questions') { - expect(Array.isArray(parsed.questions)).toBe(true); - expect(parsed.questions.length).toBeGreaterThanOrEqual(1); - expect(parsed.questions[0].id).toBeTruthy(); - expect(parsed.questions[0].question).toBeTruthy(); - } - }); + // Validate against Zod schema + const parsed = agentOutputSchema.parse(structuredOutput); + expect(parsed.status).toBe('questions'); + if (parsed.status === 'questions') { + expect(Array.isArray(parsed.questions)).toBe(true); + expect(parsed.questions.length).toBeGreaterThanOrEqual(1); + expect(parsed.questions[0].id).toBeTruthy(); + expect(parsed.questions[0].question).toBeTruthy(); + } + }, + TEST_TIMEOUT + ); }); describe('Discuss Mode Schema', () => { - it('should return context_complete with decisions', async () => { - const prompt = `You are gathering requirements for a simple feature: "Add a login button" + it( + 'should return context_complete with decisions', + async () => { + const prompt = `You are gathering requirements for a simple feature: "Add a login button" The user has already told you: - Use OAuth with Google - Button should be blue - Place it in the top-right corner -You have enough information. Output context_complete with the decisions captured.`; +You have enough information. Output context_complete with the decisions captured as an array.`; - const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema); + const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema); - console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); + console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); - // Validate against Zod schema - const parsed = discussOutputSchema.parse(structuredOutput); - expect(parsed.status).toBe('context_complete'); - if (parsed.status === 'context_complete') { - expect(Array.isArray(parsed.decisions)).toBe(true); - expect(parsed.decisions.length).toBeGreaterThanOrEqual(1); - expect(parsed.summary).toBeTruthy(); - } - }); + // Validate against Zod schema + const parsed = discussOutputSchema.parse(structuredOutput); + expect(parsed.status).toBe('context_complete'); + if (parsed.status === 'context_complete') { + expect(Array.isArray(parsed.decisions)).toBe(true); + expect(parsed.decisions.length).toBeGreaterThanOrEqual(1); + expect(parsed.summary).toBeTruthy(); + } + }, + TEST_TIMEOUT + ); }); describe('Breakdown Mode Schema', () => { - it('should return breakdown_complete with phases', async () => { - const prompt = `You are breaking down an initiative: "Build a simple TODO app" + it( + 'should return breakdown_complete with phases', + async () => { + const prompt = `You are breaking down an initiative: "Build a simple TODO app" Create a breakdown with 2-3 phases for this very simple app. Keep it minimal - just database, API, and UI. -Output breakdown_complete with the phases array.`; +Output breakdown_complete with the phases array. Each phase needs number, name, description, and dependencies.`; - const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema); + const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema); - console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); + console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); - // Validate against Zod schema - const parsed = breakdownOutputSchema.parse(structuredOutput); - expect(parsed.status).toBe('breakdown_complete'); - if (parsed.status === 'breakdown_complete') { - expect(Array.isArray(parsed.phases)).toBe(true); - expect(parsed.phases.length).toBeGreaterThanOrEqual(2); - expect(parsed.phases[0].number).toBe(1); - expect(parsed.phases[0].name).toBeTruthy(); - expect(parsed.phases[0].description).toBeTruthy(); - } - }); + // Validate against Zod schema + const parsed = breakdownOutputSchema.parse(structuredOutput); + expect(parsed.status).toBe('breakdown_complete'); + if (parsed.status === 'breakdown_complete') { + expect(Array.isArray(parsed.phases)).toBe(true); + expect(parsed.phases.length).toBeGreaterThanOrEqual(2); + expect(parsed.phases[0].number).toBe(1); + expect(parsed.phases[0].name).toBeTruthy(); + expect(parsed.phases[0].description).toBeTruthy(); + } + }, + TEST_TIMEOUT + ); }); describe('Decompose Mode Schema', () => { - it('should return decompose_complete with tasks', async () => { - const prompt = `You are decomposing a plan: "Implement user authentication" + it( + 'should return decompose_complete with tasks', + async () => { + const prompt = `You are decomposing a plan: "Implement user authentication" Create 2-3 simple tasks for this plan. Tasks should be atomic units of work. -Output decompose_complete with the tasks array.`; +Output decompose_complete with the tasks array. Each task needs number, name, description, type (default to "auto"), and dependencies.`; - const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema); + const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema); - console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); + console.log(' Output:', JSON.stringify(structuredOutput, null, 2)); - // Validate against Zod schema - const parsed = decomposeOutputSchema.parse(structuredOutput); - expect(parsed.status).toBe('decompose_complete'); - if (parsed.status === 'decompose_complete') { - expect(Array.isArray(parsed.tasks)).toBe(true); - expect(parsed.tasks.length).toBeGreaterThanOrEqual(2); - expect(parsed.tasks[0].number).toBe(1); - expect(parsed.tasks[0].name).toBeTruthy(); - expect(parsed.tasks[0].description).toBeTruthy(); - } - }); + // Validate against Zod schema + const parsed = decomposeOutputSchema.parse(structuredOutput); + expect(parsed.status).toBe('decompose_complete'); + if (parsed.status === 'decompose_complete') { + expect(Array.isArray(parsed.tasks)).toBe(true); + expect(parsed.tasks.length).toBeGreaterThanOrEqual(2); + expect(parsed.tasks[0].number).toBe(1); + expect(parsed.tasks[0].name).toBeTruthy(); + expect(parsed.tasks[0].description).toBeTruthy(); + } + }, + TEST_TIMEOUT + ); }); });