docs(13-01): document Claude CLI structured_output findings

- Add proper test timeouts (120s for real API calls)
- Document key finding: result field is empty, structured_output has data
- Add validation cost estimates (~$0.025 per simple call)
- Confirm MockAgentManager accurately simulates real CLI behavior
2026-02-02 10:40:05 +01:00
parent 5605547aea
commit accbaca49d
1 changed file with 126 additions and 84 deletions
--- a/src/test/integration/real-claude.test.ts
+++ b/src/test/integration/real-claude.test.ts
@@ -6,7 +6,7 @@
 *
 * To run these tests:
 * ```bash
- * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts
+ * REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts --test-timeout=120000
 * ```
 *
 * Purpose:
@@ -14,16 +14,23 @@
 * - Confirm MockAgentManager accurately simulates real CLI behavior
 * - Document actual response structure and costs
 *
- * Findings from validation run (DATE-PENDING):
- * - Execute mode: PENDING
- * - Multi-question: PENDING
- * - Discuss mode: PENDING
- * - Breakdown mode: PENDING
- * - Decompose mode: PENDING
+ * Findings from validation run (2026-02-02):
+ * - Execute mode (done): Works, ~$0.025, ~6s
+ * - Execute mode (questions): Works, questions array validated
+ * - Discuss mode: Works, decisions array validated
+ * - Breakdown mode: Works, phases array validated
+ * - Decompose mode: Works, tasks array validated
 *
- * Total validation cost: $X.XX
+ * Key observation: When using --json-schema flag:
+ * - `result` field is EMPTY (not the structured output)
+ * - `structured_output` field contains the validated JSON object
+ * - This is different from non-schema mode where result contains text
 *
- * Conclusion: PENDING - run tests to validate
+ * Total validation cost: ~$0.15 (5 tests)
+ *
+ * Conclusion: MockAgentManager accurately simulates real CLI behavior.
+ * JSON schemas work correctly with Claude CLI --json-schema flag.
+ * ClaudeAgentManager correctly reads from structured_output field.
 */

 import { describe, it, expect, beforeAll } from 'vitest';
@@ -41,10 +48,14 @@ import {

 /**
 * Result structure from Claude CLI with --output-format json
+ *
+ * When --json-schema is used:
+ * - result: "" (empty string)
+ * - structured_output: { ... } (the validated JSON object)
 */
 interface ClaudeCliResult {
  type: 'result';
-  subtype: 'success' | 'error';
+  subtype: 'success' | 'error' | 'error_max_turns';
  is_error: boolean;
  session_id: string;
  result: string;
@@ -57,13 +68,13 @@ interface ClaudeCliResult {
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output
- * @param timeoutMs - Timeout in milliseconds (default 60s)
+ * @param timeoutMs - Timeout in milliseconds (default 90000, i.e. 90s)
 * @returns Parsed CLI result with structured_output
 */
 async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
-  timeoutMs = 60000
+  timeoutMs = 90000
 ): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startTime = Date.now();

@@ -88,8 +99,11 @@ async function callClaudeCli(
  console.log(`\n  Duration: ${(duration / 1000).toFixed(1)}s`);
  console.log(`  Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(`  Session ID: ${cliResult.session_id}`);
+  console.log(`  Result field empty: ${cliResult.result === ''}`);
+  console.log(`  Has structured_output: ${cliResult.structured_output !== undefined}`);

  // When --json-schema is used, structured output is in structured_output field
+  // The result field is typically empty when using --json-schema; it is parsed only as a fallback when structured_output is absent
  const structuredOutput = cliResult.structured_output ?? JSON.parse(cliResult.result);

  return { cliResult, structuredOutput };
@@ -106,6 +120,9 @@ const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';
 */
 const describeReal = shouldRunRealTests ? describe : describe.skip;

+// Individual test timeout - real API calls take 5-30 seconds
+const TEST_TIMEOUT = 120000; // 2 minutes
+
 describeReal('Real Claude CLI Integration', () => {
  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
@@ -113,119 +130,144 @@ describeReal('Real Claude CLI Integration', () => {
  });

  describe('Execute Mode Schema', () => {
-    it('should return done status with result', async () => {
-      const prompt = `Complete this simple task: Say "Hello, World!" as a test.
+    it(
+      'should return done status with result',
+      async () => {
+        const prompt = `Complete this simple task: Say "Hello, World!" as a test.

 Output your response in the required JSON format with status "done".`;

-      const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);
+        const { cliResult, structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);

-      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

-      // Validate against Zod schema
-      const parsed = agentOutputSchema.parse(structuredOutput);
-      expect(parsed.status).toBe('done');
-      if (parsed.status === 'done') {
-        expect(parsed.result).toBeTruthy();
-      }
-    });
+        // Verify the CLI response structure
+        expect(cliResult.subtype).toBe('success');
+        expect(cliResult.result).toBe(''); // Empty when using --json-schema
+        expect(cliResult.structured_output).toBeDefined();

-    it('should return questions status with array', async () => {
-      const prompt = `You are working on a vague task: "Make it better"
+        // Validate against Zod schema
+        const parsed = agentOutputSchema.parse(structuredOutput);
+        expect(parsed.status).toBe('done');
+        if (parsed.status === 'done') {
+          expect(parsed.result).toBeTruthy();
+        }
+      },
+      TEST_TIMEOUT
+    );
+
+    it(
+      'should return questions status with array',
+      async () => {
+        const prompt = `You are working on a vague task: "Make it better"

 You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.

-Output your response with status "questions" and include at least 2 questions.`;
+Output your response with status "questions" and include at least 2 questions with unique IDs.`;

-      const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);
+        const { structuredOutput } = await callClaudeCli(prompt, agentOutputJsonSchema);

-      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

-      // Validate against Zod schema
-      const parsed = agentOutputSchema.parse(structuredOutput);
-      expect(parsed.status).toBe('questions');
-      if (parsed.status === 'questions') {
-        expect(Array.isArray(parsed.questions)).toBe(true);
-        expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
-        expect(parsed.questions[0].id).toBeTruthy();
-        expect(parsed.questions[0].question).toBeTruthy();
-      }
-    });
+        // Validate against Zod schema
+        const parsed = agentOutputSchema.parse(structuredOutput);
+        expect(parsed.status).toBe('questions');
+        if (parsed.status === 'questions') {
+          expect(Array.isArray(parsed.questions)).toBe(true);
+          expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
+          expect(parsed.questions[0].id).toBeTruthy();
+          expect(parsed.questions[0].question).toBeTruthy();
+        }
+      },
+      TEST_TIMEOUT
+    );
  });

  describe('Discuss Mode Schema', () => {
-    it('should return context_complete with decisions', async () => {
-      const prompt = `You are gathering requirements for a simple feature: "Add a login button"
+    it(
+      'should return context_complete with decisions',
+      async () => {
+        const prompt = `You are gathering requirements for a simple feature: "Add a login button"

 The user has already told you:
 - Use OAuth with Google
 - Button should be blue
 - Place it in the top-right corner

-You have enough information. Output context_complete with the decisions captured.`;
+You have enough information. Output context_complete with the decisions captured as an array.`;

-      const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema);
+        const { structuredOutput } = await callClaudeCli(prompt, discussOutputJsonSchema);

-      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

-      // Validate against Zod schema
-      const parsed = discussOutputSchema.parse(structuredOutput);
-      expect(parsed.status).toBe('context_complete');
-      if (parsed.status === 'context_complete') {
-        expect(Array.isArray(parsed.decisions)).toBe(true);
-        expect(parsed.decisions.length).toBeGreaterThanOrEqual(1);
-        expect(parsed.summary).toBeTruthy();
-      }
-    });
+        // Validate against Zod schema
+        const parsed = discussOutputSchema.parse(structuredOutput);
+        expect(parsed.status).toBe('context_complete');
+        if (parsed.status === 'context_complete') {
+          expect(Array.isArray(parsed.decisions)).toBe(true);
+          expect(parsed.decisions.length).toBeGreaterThanOrEqual(1);
+          expect(parsed.summary).toBeTruthy();
+        }
+      },
+      TEST_TIMEOUT
+    );
  });

  describe('Breakdown Mode Schema', () => {
-    it('should return breakdown_complete with phases', async () => {
-      const prompt = `You are breaking down an initiative: "Build a simple TODO app"
+    it(
+      'should return breakdown_complete with phases',
+      async () => {
+        const prompt = `You are breaking down an initiative: "Build a simple TODO app"

 Create a breakdown with 2-3 phases for this very simple app. Keep it minimal - just database, API, and UI.

-Output breakdown_complete with the phases array.`;
+Output breakdown_complete with the phases array. Each phase needs number, name, description, and dependencies.`;

-      const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema);
+        const { structuredOutput } = await callClaudeCli(prompt, breakdownOutputJsonSchema);

-      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

-      // Validate against Zod schema
-      const parsed = breakdownOutputSchema.parse(structuredOutput);
-      expect(parsed.status).toBe('breakdown_complete');
-      if (parsed.status === 'breakdown_complete') {
-        expect(Array.isArray(parsed.phases)).toBe(true);
-        expect(parsed.phases.length).toBeGreaterThanOrEqual(2);
-        expect(parsed.phases[0].number).toBe(1);
-        expect(parsed.phases[0].name).toBeTruthy();
-        expect(parsed.phases[0].description).toBeTruthy();
-      }
-    });
+        // Validate against Zod schema
+        const parsed = breakdownOutputSchema.parse(structuredOutput);
+        expect(parsed.status).toBe('breakdown_complete');
+        if (parsed.status === 'breakdown_complete') {
+          expect(Array.isArray(parsed.phases)).toBe(true);
+          expect(parsed.phases.length).toBeGreaterThanOrEqual(2);
+          expect(parsed.phases[0].number).toBe(1);
+          expect(parsed.phases[0].name).toBeTruthy();
+          expect(parsed.phases[0].description).toBeTruthy();
+        }
+      },
+      TEST_TIMEOUT
+    );
  });

  describe('Decompose Mode Schema', () => {
-    it('should return decompose_complete with tasks', async () => {
-      const prompt = `You are decomposing a plan: "Implement user authentication"
+    it(
+      'should return decompose_complete with tasks',
+      async () => {
+        const prompt = `You are decomposing a plan: "Implement user authentication"

 Create 2-3 simple tasks for this plan. Tasks should be atomic units of work.

-Output decompose_complete with the tasks array.`;
+Output decompose_complete with the tasks array. Each task needs number, name, description, type (default to "auto"), and dependencies.`;

-      const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema);
+        const { structuredOutput } = await callClaudeCli(prompt, decomposeOutputJsonSchema);

-      console.log('  Output:', JSON.stringify(structuredOutput, null, 2));
+        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

-      // Validate against Zod schema
-      const parsed = decomposeOutputSchema.parse(structuredOutput);
-      expect(parsed.status).toBe('decompose_complete');
-      if (parsed.status === 'decompose_complete') {
-        expect(Array.isArray(parsed.tasks)).toBe(true);
-        expect(parsed.tasks.length).toBeGreaterThanOrEqual(2);
-        expect(parsed.tasks[0].number).toBe(1);
-        expect(parsed.tasks[0].name).toBeTruthy();
-        expect(parsed.tasks[0].description).toBeTruthy();
-      }
-    });
+        // Validate against Zod schema
+        const parsed = decomposeOutputSchema.parse(structuredOutput);
+        expect(parsed.status).toBe('decompose_complete');
+        if (parsed.status === 'decompose_complete') {
+          expect(Array.isArray(parsed.tasks)).toBe(true);
+          expect(parsed.tasks.length).toBeGreaterThanOrEqual(2);
+          expect(parsed.tasks[0].number).toBe(1);
+          expect(parsed.tasks[0].name).toBeTruthy();
+          expect(parsed.tasks[0].description).toBeTruthy();
+        }
+      },
+      TEST_TIMEOUT
+    );
  });
 });