/**
 * Real Claude CLI Integration Tests
 *
 * IMPORTANT: These tests call the real Claude CLI and incur API costs.
 * They are SKIPPED by default and should only be run manually for validation.
 *
 * To run these tests:
 * ```bash
 * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts
 * ```
 *
 * Purpose:
 * - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
 * - Confirm MockAgentManager accurately simulates real CLI behavior
 * - Document actual response structure and costs
 *
 * Timeouts:
 * - Each CLI invocation is bounded by CLI_TIMEOUT_MS.
 * - Each test is given TEST_TIMEOUT_MS, which is deliberately larger, so a slow
 *   call fails with the CLI timeout error rather than a generic test timeout.
 *
 * Findings from validation run (DATE-PENDING):
 * - Execute mode: PENDING
 * - Multi-question: PENDING
 * - Discuss mode: PENDING
 * - Breakdown mode: PENDING
 * - Decompose mode: PENDING
 *
 * Total validation cost: $X.XX
 *
 * Conclusion: PENDING - run tests to validate
 */

import { describe, it, expect, beforeAll } from 'vitest';
import { execa } from 'execa';
import {
  agentOutputJsonSchema,
  agentOutputSchema,
  discussOutputJsonSchema,
  discussOutputSchema,
  breakdownOutputJsonSchema,
  breakdownOutputSchema,
  decomposeOutputJsonSchema,
  decomposeOutputSchema,
} from '../../agent/schema.js';

/**
 * Default upper bound for a single Claude CLI invocation, in milliseconds.
 */
const CLI_TIMEOUT_MS = 60000;

/**
 * Per-test timeout, in milliseconds.
 *
 * Without an explicit value the test runner's default timeout applies, which
 * is far shorter than a real CLI round trip. The extra headroom on top of
 * CLI_TIMEOUT_MS lets the CLI timeout fire first and report the real cause.
 */
const TEST_TIMEOUT_MS = CLI_TIMEOUT_MS + 30000;

/**
 * Result structure from Claude CLI with --output-format json
 */
interface ClaudeCliResult {
  type: 'result';
  subtype: 'success' | 'error';
  /** True when the CLI reports that the run failed. */
  is_error: boolean;
  session_id: string;
  /** Text result; used as a JSON fallback when structured_output is absent. */
  result: string;
  /** Schema-constrained output, populated when --json-schema is used. */
  structured_output?: unknown;
  total_cost_usd?: number;
}

/**
 * Pull the structured payload out of a CLI result.
 *
 * @param cliResult - Parsed JSON emitted by the CLI
 * @returns `structured_output` when present, otherwise `result` parsed as JSON
 * @throws Error when the CLI flagged the run as failed, or when the fallback
 *   `result` text is not valid JSON
 */
function extractStructuredOutput(cliResult: ClaudeCliResult): unknown {
  // Surface the CLI's own failure message instead of letting the JSON fallback
  // below die with an unrelated SyntaxError.
  if (cliResult.is_error) {
    throw new Error(
      `Claude CLI reported an error (subtype: ${cliResult.subtype}, session: ${cliResult.session_id}): ${cliResult.result}`
    );
  }

  // When --json-schema is used, structured output is in structured_output field
  if (cliResult.structured_output !== undefined && cliResult.structured_output !== null) {
    return cliResult.structured_output;
  }

  try {
    return JSON.parse(cliResult.result) as unknown;
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Claude CLI returned no structured_output and its result is not valid JSON (${reason}): ${cliResult.result}`
    );
  }
}

/**
 * Helper to call Claude CLI directly with a prompt and JSON schema.
 *
 * Logs the wall-clock duration, reported cost and session id of the call.
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output
 * @param timeoutMs - Timeout in milliseconds (default 60s)
 * @returns Parsed CLI result with structured_output
 * @throws Error when the CLI process fails or times out, when its stdout is
 *   not valid JSON, or when it reports an error result
 */
async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
  timeoutMs = CLI_TIMEOUT_MS
): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startTime = Date.now();

  const { stdout } = await execa(
    'claude',
    [
      '-p',
      prompt,
      '--output-format',
      'json',
      '--json-schema',
      JSON.stringify(jsonSchema),
    ],
    {
      timeout: timeoutMs,
    }
  );

  const duration = Date.now() - startTime;
  // The shape is trusted here; the payload itself is validated by the Zod
  // schemas in each test.
  const cliResult = JSON.parse(stdout) as ClaudeCliResult;

  console.log(`\n  Duration: ${(duration / 1000).toFixed(1)}s`);
  console.log(`  Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(`  Session ID: ${cliResult.session_id}`);

  const structuredOutput = extractStructuredOutput(cliResult);

  return { cliResult, structuredOutput };
}

/**
 * Check if real Claude tests should run.
 * Set REAL_CLAUDE_TESTS=1 environment variable to enable.
 */
const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';

/**
 * Skip wrapper - tests are expensive and should run manually
 */
const describeReal = shouldRunRealTests ? describe : describe.skip;

describeReal('Real Claude CLI Integration', () => {
  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
  });

  describe('Execute Mode Schema', () => {
    it(
      'should return done status with result',
      async () => {
        const prompt = `Complete this simple task: Say "Hello, World!" as a test.

Output your response in the required JSON format with status "done".`;

        const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('done');
        if (parsed.status === 'done') {
          expect(parsed.result).toBeTruthy();
        }
      },
      TEST_TIMEOUT_MS
    );

    it(
      'should return questions status with array',
      async () => {
        const prompt = `You are working on a vague task: "Make it better"

You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.

Output your response with status "questions" and include at least 2 questions.`;

        const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('questions');
        if (parsed.status === 'questions') {
          expect(Array.isArray(parsed.questions)).toBe(true);
          expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.questions[0].id).toBeTruthy();
          expect(parsed.questions[0].question).toBeTruthy();
        }
      },
      TEST_TIMEOUT_MS
    );
  });

  describe('Discuss Mode Schema', () => {
    it(
      'should return context_complete with decisions',
      async () => {
        const prompt = `You are gathering requirements for a simple feature: "Add a login button"

The user has already told you:
- Use OAuth with Google
- Button should be blue
- Place it in the top-right corner

You have enough information. Output context_complete with the decisions captured.`;

        const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = discussOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('context_complete');
        if (parsed.status === 'context_complete') {
          expect(Array.isArray(parsed.decisions)).toBe(true);
          expect(parsed.decisions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.summary).toBeTruthy();
        }
      },
      TEST_TIMEOUT_MS
    );
  });

  describe('Breakdown Mode Schema', () => {
    it(
      'should return breakdown_complete with phases',
      async () => {
        const prompt = `You are breaking down an initiative: "Build a simple TODO app"

Create a breakdown with 2-3 phases for this very simple app. Keep it minimal - just database, API, and UI.

Output breakdown_complete with the phases array.`;

        const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = breakdownOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('breakdown_complete');
        if (parsed.status === 'breakdown_complete') {
          expect(Array.isArray(parsed.phases)).toBe(true);
          expect(parsed.phases.length).toBeGreaterThanOrEqual(2);
          expect(parsed.phases[0].number).toBe(1);
          expect(parsed.phases[0].name).toBeTruthy();
          expect(parsed.phases[0].description).toBeTruthy();
        }
      },
      TEST_TIMEOUT_MS
    );
  });

  describe('Decompose Mode Schema', () => {
    it(
      'should return decompose_complete with tasks',
      async () => {
        const prompt = `You are decomposing a plan: "Implement user authentication"

Create 2-3 simple tasks for this plan. Tasks should be atomic units of work.

Output decompose_complete with the tasks array.`;

        const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = decomposeOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('decompose_complete');
        if (parsed.status === 'decompose_complete') {
          expect(Array.isArray(parsed.tasks)).toBe(true);
          expect(parsed.tasks.length).toBeGreaterThanOrEqual(2);
          expect(parsed.tasks[0].number).toBe(1);
          expect(parsed.tasks[0].name).toBeTruthy();
          expect(parsed.tasks[0].description).toBeTruthy();
        }
      },
      TEST_TIMEOUT_MS
    );
  });
});