test(13-01): create real Claude CLI integration tests
- Add test file for validating JSON schemas with real Claude CLI
- Tests are skipped by default (REAL_CLAUDE_TESTS=1 to enable)
- Covers execute, discuss, breakdown, and decompose modes
- Helper function callClaudeCli() handles CLI invocation
This commit is contained in:
231
src/test/integration/real-claude.test.ts
Normal file
231
src/test/integration/real-claude.test.ts
Normal file
@@ -0,0 +1,231 @@
|
||||
/**
|
||||
* Real Claude CLI Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the real Claude CLI and incur API costs.
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts
|
||||
* ```
|
||||
*
|
||||
* Purpose:
|
||||
* - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
|
||||
* - Confirm MockAgentManager accurately simulates real CLI behavior
|
||||
* - Document actual response structure and costs
|
||||
*
|
||||
* Findings from validation run (DATE-PENDING):
|
||||
* - Execute mode: PENDING
|
||||
* - Multi-question: PENDING
|
||||
* - Discuss mode: PENDING
|
||||
* - Breakdown mode: PENDING
|
||||
* - Decompose mode: PENDING
|
||||
*
|
||||
* Total validation cost: $X.XX
|
||||
*
|
||||
* Conclusion: PENDING - run tests to validate
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll } from 'vitest';
|
||||
import { execa } from 'execa';
|
||||
import {
|
||||
agentOutputJsonSchema,
|
||||
agentOutputSchema,
|
||||
discussOutputJsonSchema,
|
||||
discussOutputSchema,
|
||||
breakdownOutputJsonSchema,
|
||||
breakdownOutputSchema,
|
||||
decomposeOutputJsonSchema,
|
||||
decomposeOutputSchema,
|
||||
} from '../../agent/schema.js';
|
||||
|
||||
/**
 * JSON envelope printed on stdout by the Claude CLI when it is run with
 * `--output-format json`.
 *
 * Only the fields this test file reads are declared here.
 */
interface ClaudeCliResult {
  /** Envelope discriminator; this file only models the `'result'` envelope. */
  type: 'result';

  /** Outcome tag. NOTE(review): presumably mirrors `is_error` — confirm against CLI docs. */
  subtype: 'success' | 'error';

  /** Error flag carried by the envelope. */
  is_error: boolean;

  /** Identifier of the CLI session; logged by `callClaudeCli` for each call. */
  session_id: string;

  /** Text payload; `callClaudeCli` parses it as JSON when `structured_output` is absent. */
  result: string;

  /** Schema-constrained output, present when `--json-schema` is used. */
  structured_output?: unknown;

  /** Cost of the call; logged by `callClaudeCli` as a dollar amount when present. */
  total_cost_usd?: number;
}
|
||||
|
||||
/**
 * Parses a JSON payload produced by the CLI, replacing the bare `SyntaxError`
 * with an error that names which payload was malformed.
 *
 * @param raw - Text expected to contain JSON
 * @param source - Human-readable name of the payload, used in the error message
 * @returns The parsed value
 * @throws Error when `raw` is not valid JSON
 */
function parseCliJson(raw: string, source: string): unknown {
  try {
    return JSON.parse(raw);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    // String(raw) guards the preview against a payload that is missing at runtime.
    const preview = String(raw).slice(0, 200);
    throw new Error(`Claude CLI ${source} is not valid JSON (${reason}): ${preview}`);
  }
}

/**
 * Helper to call Claude CLI directly with a prompt and JSON schema.
 *
 * Runs `claude -p <prompt> --output-format json --json-schema <schema>` via
 * execa, logs the duration, cost and session id of the call, and returns the
 * parsed envelope together with the structured output.
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output (stringified for the CLI)
 * @param timeoutMs - Timeout passed to execa, in milliseconds (default 60s)
 * @returns Parsed CLI result plus the structured output: `structured_output`
 *   when present, otherwise `result` parsed as JSON
 * @throws Error when stdout is not JSON, when the envelope has `is_error` set,
 *   or when the fallback `result` text is not JSON; execa failures propagate as-is
 */
async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
  timeoutMs = 60000
): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startTime = Date.now();

  const { stdout } = await execa(
    'claude',
    [
      '-p',
      prompt,
      '--output-format',
      'json',
      '--json-schema',
      JSON.stringify(jsonSchema),
    ],
    {
      timeout: timeoutMs,
    }
  );

  const duration = Date.now() - startTime;
  const cliResult = parseCliJson(stdout, 'stdout') as ClaudeCliResult;

  console.log(`\n Duration: ${(duration / 1000).toFixed(1)}s`);
  console.log(` Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(` Session ID: ${cliResult.session_id}`);

  // Fail loudly on a reported error instead of trying to JSON-parse its message below.
  if (cliResult.is_error) {
    throw new Error(
      `Claude CLI reported an error (subtype: ${cliResult.subtype}): ${cliResult.result}`
    );
  }

  // When --json-schema is used, structured output is in structured_output field
  const structuredOutput =
    cliResult.structured_output ?? parseCliJson(cliResult.result, 'result field');

  return { cliResult, structuredOutput };
}
|
||||
|
||||
/**
 * Opt-in switch for the real Claude tests.
 * True only when the REAL_CLAUDE_TESTS environment variable is exactly "1".
 */
const shouldRunRealTests = process.env['REAL_CLAUDE_TESTS'] === '1';

/**
 * Suite wrapper: the tests are expensive and meant to be run manually, so the
 * suite is skipped unless the opt-in switch is on.
 */
const describeReal = !shouldRunRealTests ? describe.skip : describe;
|
||||
|
||||
// End-to-end checks that each mode's JSON schema, passed to the real Claude CLI,
// yields output accepted by the matching Zod schema.
describeReal('Real Claude CLI Integration', () => {
  // Explicit per-test timeout. Vitest's default is 5s, far shorter than a real
  // CLI round-trip. Kept above callClaudeCli's 60s default so the CLI timeout
  // fires first.
  const REAL_CLI_TEST_TIMEOUT_MS = 90_000;

  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
  });

  describe('Execute Mode Schema', () => {
    it(
      'should return done status with result',
      async () => {
        const prompt = `Complete this simple task: Say "Hello, World!" as a test.

Output your response in the required JSON format with status "done".`;

        const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);

        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('done');
        // The status check narrows the parsed union so `result` is accessible.
        if (parsed.status === 'done') {
          expect(parsed.result).toBeTruthy();
        }
      },
      REAL_CLI_TEST_TIMEOUT_MS
    );

    it(
      'should return questions status with array',
      async () => {
        const prompt = `You are working on a vague task: "Make it better"

You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.

Output your response with status "questions" and include at least 2 questions.`;

        const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);

        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('questions');
        if (parsed.status === 'questions') {
          expect(Array.isArray(parsed.questions)).toBe(true);
          expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.questions[0].id).toBeTruthy();
          expect(parsed.questions[0].question).toBeTruthy();
        }
      },
      REAL_CLI_TEST_TIMEOUT_MS
    );
  });

  describe('Discuss Mode Schema', () => {
    it(
      'should return context_complete with decisions',
      async () => {
        const prompt = `You are gathering requirements for a simple feature: "Add a login button"

The user has already told you:
- Use OAuth with Google
- Button should be blue
- Place it in the top-right corner

You have enough information. Output context_complete with the decisions captured.`;

        const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema);

        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = discussOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('context_complete');
        if (parsed.status === 'context_complete') {
          expect(Array.isArray(parsed.decisions)).toBe(true);
          expect(parsed.decisions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.summary).toBeTruthy();
        }
      },
      REAL_CLI_TEST_TIMEOUT_MS
    );
  });

  describe('Breakdown Mode Schema', () => {
    it(
      'should return breakdown_complete with phases',
      async () => {
        const prompt = `You are breaking down an initiative: "Build a simple TODO app"

Create a breakdown with 2-3 phases for this very simple app. Keep it minimal - just database, API, and UI.

Output breakdown_complete with the phases array.`;

        const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema);

        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = breakdownOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('breakdown_complete');
        if (parsed.status === 'breakdown_complete') {
          expect(Array.isArray(parsed.phases)).toBe(true);
          expect(parsed.phases.length).toBeGreaterThanOrEqual(2);
          expect(parsed.phases[0].number).toBe(1);
          expect(parsed.phases[0].name).toBeTruthy();
          expect(parsed.phases[0].description).toBeTruthy();
        }
      },
      REAL_CLI_TEST_TIMEOUT_MS
    );
  });

  describe('Decompose Mode Schema', () => {
    it(
      'should return decompose_complete with tasks',
      async () => {
        const prompt = `You are decomposing a plan: "Implement user authentication"

Create 2-3 simple tasks for this plan. Tasks should be atomic units of work.

Output decompose_complete with the tasks array.`;

        const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema);

        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = decomposeOutputSchema.parse(structuredOutput);
        expect(parsed.status).toBe('decompose_complete');
        if (parsed.status === 'decompose_complete') {
          expect(Array.isArray(parsed.tasks)).toBe(true);
          expect(parsed.tasks.length).toBeGreaterThanOrEqual(2);
          expect(parsed.tasks[0].number).toBe(1);
          expect(parsed.tasks[0].name).toBeTruthy();
          expect(parsed.tasks[0].description).toBeTruthy();
        }
      },
      REAL_CLI_TEST_TIMEOUT_MS
    );
  });
});
|
||||
Reference in New Issue
Block a user