Files
Codewalkers/apps/server/test/integration/real-providers/schema-retry.test.ts
Lukas May 34578d39c6 refactor: Restructure monorepo to apps/server/ and apps/web/ layout
Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt
standard monorepo conventions (apps/ for runnable apps, packages/
for reusable libraries). Update all config files, shared package
imports, test fixtures, and documentation to reflect new paths.

Key fixes:
- Update workspace config to ["apps/*", "packages/*"]
- Update tsconfig.json rootDir/include for apps/server/
- Add apps/web/** to vitest exclude list
- Update drizzle.config.ts schema path
- Fix ensure-schema.ts migration path detection (3 levels up in dev,
  2 levels up in dist)
- Fix tests/integration/cli-server.test.ts import paths
- Update packages/shared imports to apps/server/ paths
- Update all docs/ files with new paths
2026-03-03 11:22:53 +01:00

307 lines
9.8 KiB
TypeScript

/**
* Schema Validation & Retry Integration Tests
*
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
* They are SKIPPED by default and should only be run manually for validation.
*
* To run these tests:
* ```bash
* REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/schema-retry.test.ts --test-timeout=300000
* ```
*
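* Tests covered:
* - Valid JSON output validation
* - Questions status parsing
* - Schema validation failure with retry
* - Max retry limit handling
* - JSON extraction from markdown code blocks and surrounding text
* - Mode-specific schemas (discuss, plan, detail)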
*
* Estimated cost: ~$0.20 per full run (includes retries)
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
createRealProviderHarness,
describeRealClaude,
REAL_TEST_TIMEOUT,
EXTENDED_TEST_TIMEOUT,
type RealProviderHarness,
} from './harness.js';
import { MINIMAL_PROMPTS } from './prompts.js';
import type { AgentResumedEvent, AgentCrashedEvent } from '../../../events/types.js';
describeRealClaude('Schema Validation & Retry', () => {
// Suite-wide harness: created in beforeAll, cleaned up in afterAll.
let harness: RealProviderHarness;

beforeAll(async () => {
  // Announce that the suite talks to the real API before doing any work.
  const banner = [
    '\n=== Running Schema Validation & Retry Tests ===',
    'These tests call the real Claude API and incur costs.',
    'Retry tests may take longer and cost more.\n',
  ];
  for (const line of banner) {
    console.log(line);
  }

  harness = await createRealProviderHarness({ provider: 'claude' });
});

afterAll(async () => {
  await harness.cleanup();
});

// Reset the harness's recorded events before each test; the tests below
// query events by type and assert on their counts.
beforeEach(() => {
  harness.clearEvents();
});
describe('Valid Output', () => {
it(
  'validates done status output',
  async () => {
    // Launch an execute-mode agent with the minimal "done" prompt.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.done,
      mode: 'execute',
      provider: 'claude',
    });

    // Block until the agent finishes, then load its stored record.
    const completion = await harness.waitForAgentCompletion(agentId, REAL_TEST_TIMEOUT);
    const storedAgent = await harness.agentRepository.findById(agentId);

    // The agent must end up idle with a successful result.
    expect(storedAgent?.status).toBe('idle');
    expect(completion?.success).toBe(true);

    // A retry would show up as an agent:resumed event; none are expected here.
    const resumed = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
    expect(resumed.length).toBe(0);

    console.log(' Status: idle (valid done output)');
    console.log(' Result:', completion?.message);
  },
  REAL_TEST_TIMEOUT
);
it(
  'validates questions status output',
  async () => {
    // Launch an execute-mode agent whose prompt makes it ask questions.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.questions,
      mode: 'execute',
      provider: 'claude',
    });

    // Block until the agent reports that it is waiting for input.
    const pending = await harness.waitForAgentWaiting(agentId, REAL_TEST_TIMEOUT);

    // The payload must carry a non-empty array of questions.
    expect(pending).toBeTruthy();
    expect(pending?.questions).toBeInstanceOf(Array);
    expect(pending?.questions.length).toBeGreaterThan(0);

    // Every question needs both an id and the question text.
    (pending?.questions ?? []).forEach((entry) => {
      expect(entry.id).toBeTruthy();
      expect(entry.question).toBeTruthy();
    });

    const storedAgent = await harness.agentRepository.findById(agentId);
    expect(storedAgent?.status).toBe('waiting_for_input');

    // A retry would show up as an agent:resumed event; none are expected here.
    const resumed = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
    expect(resumed.length).toBe(0);

    console.log(' Status: waiting_for_input (valid questions output)');
    console.log(' Questions:', pending?.questions.length);
  },
  REAL_TEST_TIMEOUT
);
it(
  'validates multiple questions',
  async () => {
    // Launch an execute-mode agent whose prompt asks for several questions.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.multipleQuestions,
      mode: 'execute',
      provider: 'claude',
    });

    // Block until the agent reports that it is waiting for input.
    const pending = await harness.waitForAgentWaiting(agentId, REAL_TEST_TIMEOUT);

    // At least two questions must come back.
    expect(pending?.questions.length).toBeGreaterThanOrEqual(2);

    // Question ids must not repeat: de-duplicating them must not shrink the list.
    const questionIds = pending?.questions.map((entry) => entry.id) ?? [];
    const distinctIds = new Set(questionIds);
    expect(distinctIds.size).toBe(questionIds.length);

    console.log(' Questions:', pending?.questions.map((entry) => entry.id).join(', '));
  },
  REAL_TEST_TIMEOUT
);
});
describe('Retry Logic', () => {
it(
  'retries when output does not match schema',
  async () => {
    // The prompt asks for non-JSON output first and valid JSON afterwards.
    // Claude is not guaranteed to comply, so the first answer may already be valid.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.badThenGood,
      mode: 'execute',
      provider: 'claude',
    });

    // Retries may happen during this wait, hence the extended timeout.
    const completion = await harness.waitForAgentCompletion(agentId, EXTENDED_TEST_TIMEOUT);
    const storedAgent = await harness.agentRepository.findById(agentId);
    const finalStatus = storedAgent?.status;

    // Accepted outcomes: idle (valid output, with or without a retry)
    // or crashed (the retry limit was reached).
    expect(['idle', 'crashed']).toContain(finalStatus);

    // Retries are reported as agent:resumed events; log how many occurred.
    const resumed = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
    console.log(' Retry attempts:', resumed.length);
    console.log(' Final status:', finalStatus);

    if (finalStatus !== 'idle') {
      // Crashed after max retries: at least one crash event must exist.
      const crashes = harness.getEventsByType<AgentCrashedEvent>('agent:crashed');
      expect(crashes.length).toBeGreaterThan(0);
      console.log(' Crashed after retries');
      return;
    }

    expect(completion?.success).toBe(true);
    console.log(' Result:', completion?.message);
  },
  EXTENDED_TEST_TIMEOUT
);
it(
  'extracts JSON from markdown code blocks',
  async () => {
    // Ask the agent to wrap its JSON result in a fenced markdown code block.
    // The template literal below is sent verbatim as the prompt.
    const agent = await harness.agentManager.spawn({
      taskId: null,
      prompt: `Output the result wrapped in a markdown code block like this:
\`\`\`json
{"status":"done","result":"extracted from markdown"}
\`\`\``,
      mode: 'execute',
      provider: 'claude',
    });

    // Wait for the agent to finish, then load its stored record.
    const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
    const dbAgent = await harness.agentRepository.findById(agent.id);

    // Log before asserting so the outcome is visible when the test fails.
    console.log(' Status:', dbAgent?.status);
    console.log(' Result:', result?.message);

    // Extraction from the code block must succeed. The status is asserted
    // unconditionally: guarding the expectation with `if (status === 'idle')`
    // would let the test pass without checking anything when extraction fails.
    expect(dbAgent?.status).toBe('idle');
    expect(result?.success).toBe(true);
  },
  REAL_TEST_TIMEOUT
);
it(
  'extracts JSON from text with surrounding content',
  async () => {
    // Ask the agent to put plain text in front of its JSON result.
    // The template literal below is sent verbatim as the prompt.
    const agent = await harness.agentManager.spawn({
      taskId: null,
      prompt: `First say "Here is my response:" then output the JSON:
{"status":"done","result":"extracted from text"}`,
      mode: 'execute',
      provider: 'claude',
    });

    // Wait for the agent to finish, then load its stored record.
    const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
    const dbAgent = await harness.agentRepository.findById(agent.id);

    // Log before asserting so the outcome is visible when the test fails.
    console.log(' Status:', dbAgent?.status);
    console.log(' Result:', result?.message);

    // Extraction of the trailing {...} block must succeed. The status is
    // asserted unconditionally: guarding the expectation with
    // `if (status === 'idle')` would let the test pass without checking
    // anything when extraction fails.
    expect(dbAgent?.status).toBe('idle');
    expect(result?.success).toBe(true);
  },
  REAL_TEST_TIMEOUT
);
});
describe('Mode-Specific Schemas', () => {
it(
  'validates discuss mode output',
  async () => {
    // Run an agent in discuss mode with the matching minimal prompt.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.discussComplete,
      mode: 'discuss',
      provider: 'claude',
    });

    // Wait for the run to finish, then load the stored agent record.
    const completion = await harness.waitForAgentCompletion(agentId, REAL_TEST_TIMEOUT);
    const storedAgent = await harness.agentRepository.findById(agentId);

    // The agent must end up idle with a successful result.
    expect(storedAgent?.status).toBe('idle');
    expect(completion?.success).toBe(true);

    console.log(' Discuss mode result:', completion?.message);
  },
  REAL_TEST_TIMEOUT
);
it(
  'validates plan mode output',
  async () => {
    // Run an agent in plan mode with the matching minimal prompt.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.planComplete,
      mode: 'plan',
      provider: 'claude',
    });

    // Wait for the run to finish, then load the stored agent record.
    const completion = await harness.waitForAgentCompletion(agentId, REAL_TEST_TIMEOUT);
    const storedAgent = await harness.agentRepository.findById(agentId);

    // The agent must end up idle with a successful result.
    expect(storedAgent?.status).toBe('idle');
    expect(completion?.success).toBe(true);

    console.log(' Plan mode result:', completion?.message);
  },
  REAL_TEST_TIMEOUT
);
it(
  'validates detail mode output',
  async () => {
    // Run an agent in detail mode with the matching minimal prompt.
    const { id: agentId } = await harness.agentManager.spawn({
      taskId: null,
      prompt: MINIMAL_PROMPTS.detailComplete,
      mode: 'detail',
      provider: 'claude',
    });

    // Wait for the run to finish, then load the stored agent record.
    const completion = await harness.waitForAgentCompletion(agentId, REAL_TEST_TIMEOUT);
    const storedAgent = await harness.agentRepository.findById(agentId);

    // The agent must end up idle with a successful result.
    expect(storedAgent?.status).toBe('idle');
    expect(completion?.success).toBe(true);

    console.log(' Detail mode result:', completion?.message);
  },
  REAL_TEST_TIMEOUT
);
});
});