test(13-01): create real Claude CLI integration tests

- Add test file for validating JSON schemas with real Claude CLI
- Tests are skipped by default (REAL_CLAUDE_TESTS=1 to enable)
- Covers execute, discuss, breakdown, and decompose modes
- Helper function callClaudeCli() handles CLI invocation
2026-02-02 10:37:40 +01:00
parent 6835dd45d5
commit 3c98dbe541
1 changed files with 231 additions and 0 deletions
--- a/src/test/integration/real-claude.test.ts
+++ b/src/test/integration/real-claude.test.ts
@@ -0,0 +1,231 @@
+/**
+ * Real Claude CLI Integration Tests
+ *
+ * IMPORTANT: These tests call the real Claude CLI and incur API costs.
+ * They are SKIPPED by default and should only be run manually for validation.
+ *
+ * To run these tests:
+ * ```bash
+ * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts
+ * ```
+ *
+ * Purpose:
+ * - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
+ * - Confirm MockAgentManager accurately simulates real CLI behavior
+ * - Document actual response structure and costs
+ *
+ * Findings from validation run (DATE-PENDING):
+ * - Execute mode: PENDING
+ * - Multi-question: PENDING
+ * - Discuss mode: PENDING
+ * - Breakdown mode: PENDING
+ * - Decompose mode: PENDING
+ *
+ * Total validation cost: $X.XX
+ *
+ * Conclusion: PENDING - run tests to validate
+ */
+
+import { describe, it, expect, beforeAll } from 'vitest';
+import { execa } from 'execa';
+import {
+  agentOutputJsonSchema,
+  agentOutputSchema,
+  discussOutputJsonSchema,
+  discussOutputSchema,
+  breakdownOutputJsonSchema,
+  breakdownOutputSchema,
+  decomposeOutputJsonSchema,
+  decomposeOutputSchema,
+} from '../../agent/schema.js';
+
+/**
+ * Result structure from Claude CLI with --output-format json
+ *
+ * How callClaudeCli uses some of these fields:
+ * - `session_id` and `total_cost_usd` are logged after each call (the cost is
+ *   printed as "N/A" when it is absent).
+ * - `structured_output` carries the structured output when --json-schema is
+ *   used.
+ * - `result` is parsed as JSON as a fallback when `structured_output` is
+ *   null or undefined.
+ *
+ * NOTE(review): the literal values for `type` and `subtype` are this file's
+ * assumption about the CLI's output shape — confirm against the CLI docs.
+ */
+interface ClaudeCliResult {
+  type: 'result';
+  subtype: 'success' | 'error';
+  is_error: boolean;
+  session_id: string;
+  result: string;
+  structured_output?: unknown;
+  total_cost_usd?: number;
+}
+
+/**
+ * Invokes the `claude` CLI in print mode (`-p`) with `--output-format json`
+ * and the given schema passed via `--json-schema`, then logs the call's
+ * duration, cost, and session ID.
+ *
+ * @param prompt - The prompt to send to Claude
+ * @param jsonSchema - JSON schema to enforce structured output
+ * @param timeoutMs - Timeout in milliseconds for the CLI process (default 60s)
+ * @returns The parsed CLI result plus its structured output: the
+ *   `structured_output` field when present, otherwise `result` parsed as JSON
+ * @throws Error if the CLI reports `is_error`, or if there is no
+ *   `structured_output` and `result` is not valid JSON. Rejections from execa
+ *   (e.g. on timeout) propagate unchanged.
+ */
+async function callClaudeCli(
+  prompt: string,
+  jsonSchema: object,
+  timeoutMs = 60000
+): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
+  const cliArgs = [
+    '-p',
+    prompt,
+    '--output-format',
+    'json',
+    '--json-schema',
+    JSON.stringify(jsonSchema),
+  ];
+
+  const startTime = Date.now();
+  const { stdout } = await execa('claude', cliArgs, { timeout: timeoutMs });
+  const duration = Date.now() - startTime;
+
+  const cliResult: ClaudeCliResult = JSON.parse(stdout);
+
+  console.log(`\n  Duration: ${(duration / 1000).toFixed(1)}s`);
+  console.log(`  Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
+  console.log(`  Session ID: ${cliResult.session_id}`);
+
+  // Surface the CLI's own error text. Without this check an error response
+  // would reach the JSON.parse fallback below and fail with an unrelated
+  // SyntaxError (or be returned as if it were structured output).
+  if (cliResult.is_error) {
+    throw new Error(
+      `Claude CLI reported an error (${cliResult.subtype}): ${cliResult.result}`
+    );
+  }
+
+  // When --json-schema is used, structured output is in structured_output field
+  const structured = cliResult.structured_output;
+  if (structured !== undefined && structured !== null) {
+    return { cliResult, structuredOutput: structured };
+  }
+
+  // Fallback: treat the plain `result` string as JSON, and say so when it is not.
+  try {
+    return { cliResult, structuredOutput: JSON.parse(cliResult.result) };
+  } catch (error: unknown) {
+    const detail = error instanceof Error ? error.message : String(error);
+    throw new Error(
+      `Claude CLI returned no structured_output and result is not valid JSON (${detail}): ${cliResult.result}`
+    );
+  }
+}
+
+/**
+ * Check if real Claude tests should run.
+ * Set REAL_CLAUDE_TESTS=1 environment variable to enable; any other value
+ * (or leaving it unset) keeps them disabled, since the comparison is an exact
+ * match against the string '1'.
+ */
+const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';
+
+/**
+ * Skip wrapper - tests are expensive and should run manually.
+ * Resolves to `describe` when real tests are enabled and to `describe.skip`
+ * otherwise.
+ */
+const describeReal = shouldRunRealTests ? describe : describe.skip;
+
+/**
+ * Exercises the execute, discuss, breakdown, and decompose output schemas
+ * against the real Claude CLI. Each test sends a prompt with the mode's JSON
+ * schema, logs the structured output, and validates it with the matching Zod
+ * schema before asserting on the expected status and payload.
+ */
+describeReal('Real Claude CLI Integration', () => {
+  // Vitest's default per-test timeout is 5s (unless project config overrides
+  // it), far shorter than a real CLI round-trip. Keep this above
+  // callClaudeCli's 60s default so the CLI timeout fires first and reports
+  // the more specific failure.
+  const REAL_CLI_TEST_TIMEOUT_MS = 90000;
+
+  beforeAll(() => {
+    console.log('\n=== Running Real Claude CLI Tests ===');
+    console.log('These tests call the real Claude API and incur costs.\n');
+  });
+
+  describe('Execute Mode Schema', () => {
+    it('should return done status with result', async () => {
+      const prompt = `Complete this simple task: Say "Hello, World!" as a test.
+
+Output your response in the required JSON format with status "done".`;
+
+      const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);
+
+      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+
+      // Validate against Zod schema
+      const parsed = agentOutputSchema.parse(structuredOutput);
+      expect(parsed.status).toBe('done');
+      if (parsed.status === 'done') {
+        expect(parsed.result).toBeTruthy();
+      }
+    }, REAL_CLI_TEST_TIMEOUT_MS);
+
+    it('should return questions status with array', async () => {
+      const prompt = `You are working on a vague task: "Make it better"
+
+You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.
+
+Output your response with status "questions" and include at least 2 questions.`;
+
+      const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);
+
+      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+
+      // Validate against Zod schema
+      const parsed = agentOutputSchema.parse(structuredOutput);
+      expect(parsed.status).toBe('questions');
+      if (parsed.status === 'questions') {
+        expect(Array.isArray(parsed.questions)).toBe(true);
+        expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
+        expect(parsed.questions[0].id).toBeTruthy();
+        expect(parsed.questions[0].question).toBeTruthy();
+      }
+    }, REAL_CLI_TEST_TIMEOUT_MS);
+  });
+
+  describe('Discuss Mode Schema', () => {
+    it('should return context_complete with decisions', async () => {
+      const prompt = `You are gathering requirements for a simple feature: "Add a login button"
+
+The user has already told you:
+- Use OAuth with Google
+- Button should be blue
+- Place it in the top-right corner
+
+You have enough information. Output context_complete with the decisions captured.`;
+
+      const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema);
+
+      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+
+      // Validate against Zod schema
+      const parsed = discussOutputSchema.parse(structuredOutput);
+      expect(parsed.status).toBe('context_complete');
+      if (parsed.status === 'context_complete') {
+        expect(Array.isArray(parsed.decisions)).toBe(true);
+        expect(parsed.decisions.length).toBeGreaterThanOrEqual(1);
+        expect(parsed.summary).toBeTruthy();
+      }
+    }, REAL_CLI_TEST_TIMEOUT_MS);
+  });
+
+  describe('Breakdown Mode Schema', () => {
+    it('should return breakdown_complete with phases', async () => {
+      const prompt = `You are breaking down an initiative: "Build a simple TODO app"
+
+Create a breakdown with 2-3 phases for this very simple app. Keep it minimal - just database, API, and UI.
+
+Output breakdown_complete with the phases array.`;
+
+      const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema);
+
+      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+
+      // Validate against Zod schema
+      const parsed = breakdownOutputSchema.parse(structuredOutput);
+      expect(parsed.status).toBe('breakdown_complete');
+      if (parsed.status === 'breakdown_complete') {
+        expect(Array.isArray(parsed.phases)).toBe(true);
+        expect(parsed.phases.length).toBeGreaterThanOrEqual(2);
+        expect(parsed.phases[0].number).toBe(1);
+        expect(parsed.phases[0].name).toBeTruthy();
+        expect(parsed.phases[0].description).toBeTruthy();
+      }
+    }, REAL_CLI_TEST_TIMEOUT_MS);
+  });
+
+  describe('Decompose Mode Schema', () => {
+    it('should return decompose_complete with tasks', async () => {
+      const prompt = `You are decomposing a plan: "Implement user authentication"
+
+Create 2-3 simple tasks for this plan. Tasks should be atomic units of work.
+
+Output decompose_complete with the tasks array.`;
+
+      const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema);
+
+      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+
+      // Validate against Zod schema
+      const parsed = decomposeOutputSchema.parse(structuredOutput);
+      expect(parsed.status).toBe('decompose_complete');
+      if (parsed.status === 'decompose_complete') {
+        expect(Array.isArray(parsed.tasks)).toBe(true);
+        expect(parsed.tasks.length).toBeGreaterThanOrEqual(2);
+        expect(parsed.tasks[0].number).toBe(1);
+        expect(parsed.tasks[0].name).toBeTruthy();
+        expect(parsed.tasks[0].description).toBeTruthy();
+      }
+    }, REAL_CLI_TEST_TIMEOUT_MS);
+  });
+});