refactor: Restructure monorepo to apps/server/ and apps/web/ layout
Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt standard monorepo conventions (apps/ for runnable apps, packages/ for reusable libraries). Update all config files, shared package imports, test fixtures, and documentation to reflect new paths. Key fixes: - Update workspace config to ["apps/*", "packages/*"] - Update tsconfig.json rootDir/include for apps/server/ - Add apps/web/** to vitest exclude list - Update drizzle.config.ts schema path - Fix ensure-schema.ts migration path detection (3 levels up in dev, 2 levels up in dist) - Fix tests/integration/cli-server.test.ts import paths - Update packages/shared imports to apps/server/ paths - Update all docs/ files with new paths
This commit is contained in:
203
apps/server/test/integration/agent-workdir-verification.test.ts
Normal file
203
apps/server/test/integration/agent-workdir-verification.test.ts
Normal file
@@ -0,0 +1,203 @@
|
||||
/**
|
||||
* Agent Working Directory Verification Tests
|
||||
*
|
||||
* Tests that verify agents actually run in their intended working directories.
|
||||
* These tests use simple shell commands to prove the agent execution location.
|
||||
*
|
||||
* IMPORTANT: These tests spawn real CLI processes and may incur API costs.
|
||||
* They are SKIPPED by default to prevent accidental charges.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_WORKDIR_TESTS=1 npm test -- src/test/integration/agent-workdir-verification.test.ts --test-timeout=120000
|
||||
* ```
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { mkdtemp, rm, readFile } from 'node:fs/promises';
|
||||
import { existsSync } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { MultiProviderAgentManager } from '../../agent/manager.js';
|
||||
import { createTestDatabase } from '../../db/repositories/drizzle/test-helpers.js';
|
||||
import {
|
||||
DrizzleAgentRepository,
|
||||
DrizzleProjectRepository,
|
||||
DrizzleAccountRepository,
|
||||
DrizzleInitiativeRepository,
|
||||
} from '../../db/repositories/drizzle/index.js';
|
||||
import { EventEmitterBus } from '../../events/bus.js';
|
||||
|
||||
const SHOULD_SKIP = !process.env.REAL_WORKDIR_TESTS;
|
||||
const TEST_TIMEOUT = 60000;
|
||||
|
||||
describe.skipIf(SHOULD_SKIP)('Agent Working Directory Verification', () => {
|
||||
let tempDir: string;
|
||||
let agentManager: MultiProviderAgentManager;
|
||||
let agentRepository: DrizzleAgentRepository;
|
||||
|
||||
beforeAll(async () => {
|
||||
if (SHOULD_SKIP) return;
|
||||
|
||||
console.log('\n=== Running Agent Working Directory Tests ===');
|
||||
console.log('These tests verify agents run in correct working directories.\n');
|
||||
|
||||
// Create temp directory for test workspace
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'cw-workdir-test-'));
|
||||
|
||||
// Set up test database and repositories
|
||||
const db = await createTestDatabase();
|
||||
const eventBus = new EventEmitterBus();
|
||||
|
||||
agentRepository = new DrizzleAgentRepository(db);
|
||||
const projectRepository = new DrizzleProjectRepository(db);
|
||||
const accountRepository = new DrizzleAccountRepository(db);
|
||||
|
||||
agentManager = new MultiProviderAgentManager(
|
||||
agentRepository,
|
||||
tempDir,
|
||||
projectRepository,
|
||||
accountRepository,
|
||||
eventBus,
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (SHOULD_SKIP || !tempDir) return;
|
||||
try {
|
||||
await rm(tempDir, { recursive: true });
|
||||
} catch (err) {
|
||||
console.warn('Failed to cleanup temp directory:', err);
|
||||
}
|
||||
});
|
||||
|
||||
it('spawns agent in correct standalone working directory', async () => {
|
||||
const prompt = `
|
||||
Write your current working directory to a file called 'verify-pwd.txt'.
|
||||
Use this exact bash command:
|
||||
|
||||
pwd > verify-pwd.txt
|
||||
|
||||
Then output the signal: {"done": true}
|
||||
`.trim();
|
||||
|
||||
// Spawn standalone agent
|
||||
const agent = await agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
expect(agent.id).toBeTruthy();
|
||||
expect(agent.status).toBe('running');
|
||||
|
||||
// Wait for completion (poll agent status)
|
||||
let attempts = 0;
|
||||
const maxAttempts = 60; // 60 seconds timeout
|
||||
|
||||
while (attempts < maxAttempts) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
attempts++;
|
||||
|
||||
const currentAgent = await agentRepository.findById(agent.id);
|
||||
if (!currentAgent || currentAgent.status !== 'running') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Verify final agent state
|
||||
const completedAgent = await agentRepository.findById(agent.id);
|
||||
expect(completedAgent).toBeTruthy();
|
||||
expect(completedAgent!.status).not.toBe('running');
|
||||
|
||||
// Get the agent's expected working directory
|
||||
const expectedWorkdir = join(tempDir, 'agent-workdirs', agent.name, 'workspace');
|
||||
|
||||
// Read diagnostic files
|
||||
const diagnosticFile = join(expectedWorkdir, '.cw', 'spawn-diagnostic.json');
|
||||
const expectedPwdFile = join(expectedWorkdir, '.cw', 'expected-pwd.txt');
|
||||
const verifyPwdFile = join(expectedWorkdir, 'verify-pwd.txt');
|
||||
|
||||
// Verify diagnostic files exist
|
||||
expect(existsSync(diagnosticFile), 'spawn diagnostic file should exist').toBe(true);
|
||||
expect(existsSync(expectedPwdFile), 'expected pwd file should exist').toBe(true);
|
||||
|
||||
// Read diagnostic data
|
||||
const diagnostic = JSON.parse(await readFile(diagnosticFile, 'utf-8'));
|
||||
const expectedPwd = (await readFile(expectedPwdFile, 'utf-8')).trim();
|
||||
|
||||
console.log('Diagnostic data:', diagnostic);
|
||||
console.log('Expected working directory:', expectedPwd);
|
||||
|
||||
// Verify diagnostic consistency
|
||||
expect(diagnostic.intendedCwd).toBe(expectedWorkdir);
|
||||
expect(diagnostic.cwdExistsAtSpawn).toBe(true);
|
||||
expect(expectedPwd).toBe(expectedWorkdir);
|
||||
|
||||
// The critical test: verify the agent actually wrote the file in the expected location
|
||||
if (existsSync(verifyPwdFile)) {
|
||||
const actualPwd = (await readFile(verifyPwdFile, 'utf-8')).trim();
|
||||
console.log('Agent reported working directory:', actualPwd);
|
||||
|
||||
// This is the key verification: the pwd reported by the agent should match expected
|
||||
expect(actualPwd).toBe(expectedWorkdir);
|
||||
} else {
|
||||
// If the file doesn't exist, the agent either failed or ran somewhere else
|
||||
console.warn('Agent did not create verify-pwd.txt file');
|
||||
console.log('Expected at:', verifyPwdFile);
|
||||
|
||||
// Let's check if it was created elsewhere (debugging)
|
||||
const alternativeLocations = [
|
||||
join(tempDir, 'verify-pwd.txt'),
|
||||
join(process.cwd(), 'verify-pwd.txt'),
|
||||
];
|
||||
|
||||
for (const loc of alternativeLocations) {
|
||||
if (existsSync(loc)) {
|
||||
const content = await readFile(loc, 'utf-8');
|
||||
console.log(`Found verify-pwd.txt at unexpected location ${loc}:`, content.trim());
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error('Agent did not create pwd verification file in expected location');
|
||||
}
|
||||
}, TEST_TIMEOUT);
|
||||
|
||||
it('creates diagnostic files with correct metadata', async () => {
|
||||
const prompt = `Output the signal: {"done": true}`;
|
||||
|
||||
const agent = await agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait a bit for spawn to complete
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
|
||||
const expectedWorkdir = join(tempDir, 'agent-workdirs', agent.name, 'workspace');
|
||||
const diagnosticFile = join(expectedWorkdir, '.cw', 'spawn-diagnostic.json');
|
||||
const expectedPwdFile = join(expectedWorkdir, '.cw', 'expected-pwd.txt');
|
||||
|
||||
// Verify files exist immediately after spawn
|
||||
expect(existsSync(diagnosticFile), 'diagnostic file should be created after spawn').toBe(true);
|
||||
expect(existsSync(expectedPwdFile), 'expected pwd file should be created').toBe(true);
|
||||
|
||||
// Verify diagnostic content
|
||||
const diagnostic = JSON.parse(await readFile(diagnosticFile, 'utf-8'));
|
||||
const expectedPwd = (await readFile(expectedPwdFile, 'utf-8')).trim();
|
||||
|
||||
expect(diagnostic.agentId).toBe(agent.id);
|
||||
expect(diagnostic.alias).toBe(agent.name);
|
||||
expect(diagnostic.intendedCwd).toBe(expectedWorkdir);
|
||||
expect(diagnostic.provider).toBe('claude');
|
||||
expect(diagnostic.cwdExistsAtSpawn).toBe(true);
|
||||
expect(diagnostic.customCwdProvided).toBe(false);
|
||||
expect(typeof diagnostic.timestamp).toBe('string');
|
||||
expect(Array.isArray(diagnostic.args)).toBe(true);
|
||||
|
||||
expect(expectedPwd).toBe(expectedWorkdir);
|
||||
});
|
||||
});
|
||||
232
apps/server/test/integration/crash-race-condition.test.ts
Normal file
232
apps/server/test/integration/crash-race-condition.test.ts
Normal file
@@ -0,0 +1,232 @@
|
||||
/**
|
||||
* Integration test to reproduce and fix the crash marking race condition.
|
||||
*
|
||||
* This test simulates the exact scenario where agents complete successfully
|
||||
* but get marked as crashed due to timing issues in the output handler.
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
||||
import { writeFile, mkdir, rm } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { randomBytes } from 'node:crypto';
|
||||
import { OutputHandler } from '../../agent/output-handler.js';
|
||||
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
|
||||
|
||||
interface TestAgent {
|
||||
id: string;
|
||||
name: string;
|
||||
status: 'idle' | 'running' | 'waiting_for_input' | 'stopped' | 'crashed';
|
||||
mode: 'execute' | 'discuss' | 'plan' | 'detail' | 'refine';
|
||||
taskId: string | null;
|
||||
sessionId: string | null;
|
||||
worktreeId: string;
|
||||
createdAt: Date;
|
||||
updatedAt: Date;
|
||||
provider: string;
|
||||
accountId: string | null;
|
||||
pid: number | null;
|
||||
outputFilePath: string | null;
|
||||
result: string | null;
|
||||
pendingQuestions: string | null;
|
||||
initiativeId: string | null;
|
||||
userDismissedAt: Date | null;
|
||||
exitCode: number | null;
|
||||
}
|
||||
|
||||
describe('Crash marking race condition', () => {
|
||||
let outputHandler: OutputHandler;
|
||||
let testAgent: TestAgent;
|
||||
let testDir: string;
|
||||
let mockRepo: AgentRepository;
|
||||
|
||||
// Track all repository calls
|
||||
let updateCalls: Array<{ id: string; data: any }> = [];
|
||||
let finalAgentStatus: string | null = null;
|
||||
|
||||
beforeEach(async () => {
|
||||
updateCalls = [];
|
||||
finalAgentStatus = null;
|
||||
|
||||
// Create test directory structure
|
||||
testDir = join(tmpdir(), `crash-test-${randomBytes(8).toString('hex')}`);
|
||||
const outputDir = join(testDir, '.cw/output');
|
||||
await mkdir(outputDir, { recursive: true });
|
||||
|
||||
// Create test agent
|
||||
testAgent = {
|
||||
id: 'test-agent-id',
|
||||
name: 'test-agent',
|
||||
status: 'running',
|
||||
mode: 'refine',
|
||||
taskId: 'task-1',
|
||||
sessionId: 'session-1',
|
||||
worktreeId: 'worktree-1',
|
||||
createdAt: new Date(),
|
||||
updatedAt: new Date(),
|
||||
provider: 'claude',
|
||||
accountId: null,
|
||||
pid: 12345,
|
||||
outputFilePath: join(testDir, 'output.jsonl'),
|
||||
result: null,
|
||||
pendingQuestions: null,
|
||||
initiativeId: 'init-1',
|
||||
userDismissedAt: null,
|
||||
exitCode: null
|
||||
};
|
||||
|
||||
// Mock repository that tracks all update calls
|
||||
mockRepo = {
|
||||
async findById(id: string) {
|
||||
return id === testAgent.id ? { ...testAgent } : null;
|
||||
},
|
||||
async update(id: string, data: any) {
|
||||
updateCalls.push({ id, data });
|
||||
if (data.status) {
|
||||
finalAgentStatus = data.status;
|
||||
testAgent.status = data.status;
|
||||
}
|
||||
return { ...testAgent, ...data };
|
||||
},
|
||||
async create() { throw new Error('Not implemented'); },
|
||||
async findAll() { throw new Error('Not implemented'); },
|
||||
async findByStatus() { throw new Error('Not implemented'); },
|
||||
async findByTaskId() { throw new Error('Not implemented'); },
|
||||
async findByName() { throw new Error('Not implemented'); },
|
||||
async findBySessionId() { throw new Error('Not implemented'); },
|
||||
async delete() { throw new Error('Not implemented'); }
|
||||
};
|
||||
|
||||
outputHandler = new OutputHandler(mockRepo);
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
try {
|
||||
await rm(testDir, { recursive: true });
|
||||
} catch {
|
||||
// Ignore cleanup errors
|
||||
}
|
||||
});
|
||||
|
||||
it('should NOT mark agent as crashed when signal.json indicates completion', async () => {
|
||||
// SETUP: Create a valid completion signal that should prevent crash marking
|
||||
const signalPath = join(testDir, '.cw/output/signal.json');
|
||||
const signalContent = {
|
||||
status: 'questions',
|
||||
questions: [
|
||||
{ id: 'q1', question: 'Test question?' }
|
||||
]
|
||||
};
|
||||
await writeFile(signalPath, JSON.stringify(signalContent, null, 2));
|
||||
|
||||
// SETUP: Create empty output file to simulate "no new output detected" scenario
|
||||
const outputFilePath = join(testDir, 'output.jsonl');
|
||||
await writeFile(outputFilePath, ''); // Empty file simulates the race condition
|
||||
|
||||
// Mock active agent with output file path
|
||||
const mockActive = {
|
||||
outputFilePath,
|
||||
streamSessionId: 'session-1'
|
||||
};
|
||||
|
||||
// Mock getAgentWorkdir function — receives worktreeId, not agentId
|
||||
const getAgentWorkdir = (worktreeId: string) => {
|
||||
expect(worktreeId).toBe(testAgent.worktreeId);
|
||||
return testDir;
|
||||
};
|
||||
|
||||
// EXECUTE: Call handleCompletion which should trigger the race condition scenario
|
||||
// This simulates: no stream text + no new file content + valid signal.json
|
||||
await (outputHandler as any).handleCompletion(
|
||||
testAgent.id,
|
||||
mockActive,
|
||||
getAgentWorkdir
|
||||
);
|
||||
|
||||
// VERIFY: Agent should NOT be marked as crashed
|
||||
console.log('Update calls:', updateCalls);
|
||||
console.log('Final agent status:', finalAgentStatus);
|
||||
|
||||
expect(updateCalls.length).toBeGreaterThan(0);
|
||||
expect(finalAgentStatus).not.toBe('crashed');
|
||||
|
||||
// Should be marked with the appropriate completion status
|
||||
expect(['idle', 'waiting_for_input', 'stopped']).toContain(finalAgentStatus);
|
||||
});
|
||||
|
||||
it('should mark agent as crashed when no completion signal exists', async () => {
|
||||
// SETUP: No signal.json file exists - agent should be marked as crashed
|
||||
const outputFilePath = join(testDir, 'output.jsonl');
|
||||
await writeFile(outputFilePath, ''); // Empty file
|
||||
|
||||
const mockActive = {
|
||||
outputFilePath,
|
||||
streamSessionId: 'session-1'
|
||||
};
|
||||
|
||||
const getAgentWorkdir = (agentId: string) => testDir;
|
||||
|
||||
// EXECUTE: This should mark agent as crashed since no completion signal exists
|
||||
await (outputHandler as any).handleCompletion(
|
||||
testAgent.id,
|
||||
mockActive,
|
||||
getAgentWorkdir
|
||||
);
|
||||
|
||||
// VERIFY: Agent SHOULD be marked as crashed
|
||||
expect(finalAgentStatus).toBe('crashed');
|
||||
});
|
||||
|
||||
it('should handle the exact slim-wildebeest scenario', async () => {
|
||||
// SETUP: Reproduce the exact conditions that slim-wildebeest had
|
||||
const signalPath = join(testDir, '.cw/output/signal.json');
|
||||
const exactSignalContent = {
|
||||
"status": "questions",
|
||||
"questions": [
|
||||
{
|
||||
"id": "q1",
|
||||
"question": "What UI framework/styling system is the admin UI currently using that needs to be replaced?"
|
||||
},
|
||||
{
|
||||
"id": "q2",
|
||||
"question": "What specific problems with the current admin UI are we solving? (e.g., poor developer experience, design inconsistency, performance issues, lack of accessibility)"
|
||||
}
|
||||
]
|
||||
};
|
||||
await writeFile(signalPath, JSON.stringify(exactSignalContent, null, 2));
|
||||
|
||||
// Create SUMMARY.md like slim-wildebeest had
|
||||
const summaryPath = join(testDir, '.cw/output/SUMMARY.md');
|
||||
const summaryContent = `---
|
||||
files_modified: []
|
||||
---
|
||||
Initiative page is essentially empty — lacks context, scope, goals, and technical approach. Requested clarification on current state, problems being solved, scope boundaries, and success criteria before proposing meaningful improvements.`;
|
||||
await writeFile(summaryPath, summaryContent);
|
||||
|
||||
// Simulate the output file scenario
|
||||
const outputFilePath = join(testDir, 'output.jsonl');
|
||||
await writeFile(outputFilePath, 'some initial content\n'); // Some content but no new lines
|
||||
|
||||
const mockActive = {
|
||||
outputFilePath,
|
||||
streamSessionId: 'session-1'
|
||||
};
|
||||
|
||||
const getAgentWorkdir = (agentId: string) => testDir;
|
||||
|
||||
// EXECUTE: This is the exact scenario that caused slim-wildebeest to be marked as crashed
|
||||
await (outputHandler as any).handleCompletion(
|
||||
testAgent.id,
|
||||
mockActive,
|
||||
getAgentWorkdir
|
||||
);
|
||||
|
||||
// VERIFY: This should NOT be marked as crashed
|
||||
console.log('slim-wildebeest scenario - Final status:', finalAgentStatus);
|
||||
console.log('slim-wildebeest scenario - Update calls:', updateCalls);
|
||||
|
||||
expect(finalAgentStatus).not.toBe('crashed');
|
||||
expect(['idle', 'waiting_for_input', 'stopped']).toContain(finalAgentStatus);
|
||||
});
|
||||
|
||||
});
|
||||
@@ -0,0 +1,244 @@
|
||||
/**
|
||||
* Full-Flow Cassette Integration Test
|
||||
*
|
||||
* Cassette-backed variant of the full multi-agent workflow test.
|
||||
* Runs the same discuss → plan → detail → execute pipeline but intercepts
|
||||
* subprocess spawning with CassetteProcessManager — no real API calls in CI.
|
||||
*
|
||||
* Recording (one-time, costs ~$2–5):
|
||||
* CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
|
||||
* # Commit the generated src/test/cassettes/<hash>.json files afterward
|
||||
*
|
||||
* Replay (default — runs in seconds):
|
||||
* npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts
|
||||
*
|
||||
* Force re-record (overwrites existing cassettes):
|
||||
* CW_CASSETTE_FORCE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { existsSync, readdirSync } from 'node:fs';
|
||||
import { join, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import type { Phase, Task } from '../../../db/schema.js';
|
||||
import type { AgentResult } from '../../../agent/types.js';
|
||||
import { buildExecutePrompt } from '../../../agent/prompts/index.js';
|
||||
import { CassetteStore } from '../../cassette/store.js';
|
||||
import { CassetteProcessManager, type CassetteMode } from '../../cassette/process-manager.js';
|
||||
import {
|
||||
createFullFlowHarness,
|
||||
type FullFlowHarness,
|
||||
} from './harness.js';
|
||||
import {
|
||||
printHeader,
|
||||
printDiscussResult,
|
||||
printPlanResult,
|
||||
printDetailResult,
|
||||
printExecuteResult,
|
||||
printFinalSummary,
|
||||
type ExecutedTask,
|
||||
} from './report.js';
|
||||
|
||||
// =============================================================================
|
||||
// Constants
|
||||
// =============================================================================
|
||||
|
||||
const RECORDING =
|
||||
process.env.CW_CASSETTE_FORCE_RECORD === '1' || process.env.CW_CASSETTE_RECORD === '1';
|
||||
|
||||
/**
|
||||
* Test timeout.
|
||||
* - Replay: 5 min (cassettes complete in seconds; cap is generous headroom)
|
||||
* - Record: 60 min (real agents doing discuss/plan/detail/execute take API time)
|
||||
*/
|
||||
const CASSETTE_FLOW_TIMEOUT = RECORDING ? 60 * 60_000 : 5 * 60_000;
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const CASSETTE_DIR =
|
||||
process.env.CW_CASSETTE_DIR ?? join(__dirname, '../../cassettes');
|
||||
|
||||
// =============================================================================
|
||||
// Mode helper
|
||||
// =============================================================================
|
||||
|
||||
function cassetteMode(): CassetteMode {
|
||||
if (process.env.CW_CASSETTE_FORCE_RECORD === '1') return 'record';
|
||||
if (process.env.CW_CASSETTE_RECORD === '1') return 'auto';
|
||||
return 'replay';
|
||||
}
|
||||
|
||||
/**
|
||||
* True when cassettes are available (at least one .json file) OR we're in a
|
||||
* recording run. Skips the suite if no cassettes have been recorded yet so
|
||||
* that `npm test` doesn't fail on a fresh clone before cassettes are committed.
|
||||
*/
|
||||
function cassettesAvailable(): boolean {
|
||||
const mode = cassetteMode();
|
||||
if (mode !== 'replay') return true; // recording runs always proceed
|
||||
if (!existsSync(CASSETTE_DIR)) return false;
|
||||
return readdirSync(CASSETTE_DIR).some((f) => f.endsWith('.json'));
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Test
|
||||
// =============================================================================
|
||||
|
||||
describe.skipIf(!cassettesAvailable())('full flow (cassette replay)', () => {
|
||||
let harness: FullFlowHarness;
|
||||
const startedAt = Date.now();
|
||||
|
||||
beforeAll(async () => {
|
||||
const store = new CassetteStore(CASSETTE_DIR);
|
||||
const mode = cassetteMode();
|
||||
|
||||
harness = await createFullFlowHarness('Add complete() method to TodoStore', {
|
||||
processManagerFactory: (workspaceRoot, projectRepo) =>
|
||||
new CassetteProcessManager(workspaceRoot, projectRepo, store, mode),
|
||||
});
|
||||
|
||||
printHeader(harness.initiative.name);
|
||||
console.log(` Cassette mode : ${mode}`);
|
||||
console.log(` Cassette dir : ${CASSETTE_DIR}`);
|
||||
console.log(` Initiative ID : ${harness.initiative.id}`);
|
||||
console.log(` Workspace : ${harness.workspaceRoot}`);
|
||||
}, CASSETTE_FLOW_TIMEOUT);
|
||||
|
||||
afterAll(async () => {
|
||||
if (harness) await harness.cleanup();
|
||||
});
|
||||
|
||||
it(
|
||||
'runs the complete multi-agent workflow from cassettes',
|
||||
async () => {
|
||||
const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness;
|
||||
const initiativeId = initiative.id;
|
||||
|
||||
// ── Stage 2: Discuss ───────────────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 2: DISCUSS <<<');
|
||||
const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId });
|
||||
expect(discussAgent.id).toBeTruthy();
|
||||
console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`);
|
||||
|
||||
const discussResult = await harness.driveToCompletion(
|
||||
discussAgent.id,
|
||||
'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
printDiscussResult(discussAgent.id, discussResult);
|
||||
|
||||
if (!discussResult?.success) {
|
||||
console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage');
|
||||
}
|
||||
|
||||
// ── Stage 3: Plan ──────────────────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 3: PLAN <<<');
|
||||
const planAgent = await caller.spawnArchitectPlan({ initiativeId });
|
||||
expect(planAgent.id).toBeTruthy();
|
||||
console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`);
|
||||
|
||||
const planResult = await harness.driveToCompletion(
|
||||
planAgent.id,
|
||||
'Keep it simple.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
expect(planResult).toBeTruthy();
|
||||
|
||||
const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId);
|
||||
expect(phases.length).toBeGreaterThan(0);
|
||||
printPlanResult(phases);
|
||||
|
||||
// ── Stage 4: Detail (per phase) ────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 4: DETAIL <<<');
|
||||
for (const phase of phases) {
|
||||
const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id });
|
||||
expect(detailAgent.id).toBeTruthy();
|
||||
console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`);
|
||||
|
||||
const detailResult = await harness.driveToCompletion(
|
||||
detailAgent.id,
|
||||
'Keep it simple.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
expect(detailResult).toBeTruthy();
|
||||
|
||||
const phaseTasks = await taskRepository.findByPhaseId(phase.id);
|
||||
const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
|
||||
expect(executeTasks.length).toBeGreaterThan(0);
|
||||
printDetailResult(phase, phaseTasks);
|
||||
}
|
||||
|
||||
// ── Stage 5: Execute ───────────────────────────────────────────────────
|
||||
console.log('\n\n>>> Stage 5: EXECUTE <<<');
|
||||
const allTasks = await gatherAllExecuteTasks(taskRepository, phases);
|
||||
console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`);
|
||||
|
||||
const executed: ExecutedTask[] = [];
|
||||
for (const task of allTasks) {
|
||||
console.log(` Spawning execute agent for: "${task.name}"`);
|
||||
const execAgent = await agentManager.spawn({
|
||||
taskId: task.id,
|
||||
prompt: buildExecutePrompt(task.description ?? task.name),
|
||||
mode: 'execute',
|
||||
initiativeId,
|
||||
phaseId: task.phaseId ?? undefined,
|
||||
inputContext: {
|
||||
initiative,
|
||||
task,
|
||||
},
|
||||
});
|
||||
console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
|
||||
|
||||
const result = await harness.driveToCompletion(
|
||||
execAgent.id,
|
||||
'Use your best judgment and keep it simple.',
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
executed.push({ task, result });
|
||||
|
||||
const icon = result?.success ? '✓' : '✗';
|
||||
console.log(` ${icon} Completed with success=${result?.success ?? null}`);
|
||||
if (result && !result.success) {
|
||||
console.log(` Message: ${result.message?.slice(0, 200)}`);
|
||||
}
|
||||
}
|
||||
|
||||
printExecuteResult(executed);
|
||||
|
||||
// ── Assertions ─────────────────────────────────────────────────────────
|
||||
expect(executed.length).toBeGreaterThan(0);
|
||||
|
||||
const allSucceeded = executed.every((e) => e.result?.success === true);
|
||||
if (!allSucceeded) {
|
||||
const failed = executed.filter((e) => !e.result?.success);
|
||||
console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`);
|
||||
}
|
||||
|
||||
// ── Final summary ──────────────────────────────────────────────────────
|
||||
printFinalSummary(
|
||||
initiative.name,
|
||||
phases,
|
||||
allTasks,
|
||||
executed,
|
||||
Date.now() - startedAt,
|
||||
);
|
||||
},
|
||||
CASSETTE_FLOW_TIMEOUT,
|
||||
);
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
// Helpers
|
||||
// =============================================================================
|
||||
|
||||
async function gatherAllExecuteTasks(
|
||||
taskRepository: FullFlowHarness['taskRepository'],
|
||||
phases: Phase[],
|
||||
): Promise<Task[]> {
|
||||
const result: Task[] = [];
|
||||
for (const phase of phases) {
|
||||
const phaseTasks = await taskRepository.findByPhaseId(phase.id);
|
||||
const execTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
|
||||
result.push(...execTasks);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
399
apps/server/test/integration/full-flow/harness.ts
Normal file
399
apps/server/test/integration/full-flow/harness.ts
Normal file
@@ -0,0 +1,399 @@
|
||||
/**
|
||||
* Full-Flow Test Harness
|
||||
*
|
||||
* Wires up the complete system with real agents for end-to-end multi-agent
|
||||
* workflow testing: discuss → plan → detail → execute.
|
||||
*
|
||||
* Unlike the standard TestHarness (MockAgentManager) or RealProviderHarness
|
||||
* (agents only), this harness adds:
|
||||
* - All 11 repositories
|
||||
* - tRPC caller for architect/agent procedures
|
||||
* - A self-contained fixture git repo (todo-api) for agents to work on
|
||||
* - Helpers for driving agents through question/answer loops
|
||||
*
|
||||
* Used by full-flow-cassette.test.ts (replay) and for manual recording runs.
|
||||
*/
|
||||
|
||||
import { mkdtemp, rm, cp } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { execSync } from 'node:child_process';
|
||||
import type { DrizzleDatabase } from '../../../db/index.js';
|
||||
import type { DomainEvent } from '../../../events/types.js';
|
||||
import { EventEmitterBus } from '../../../events/bus.js';
|
||||
import { MultiProviderAgentManager } from '../../../agent/manager.js';
|
||||
import type { AgentResult, PendingQuestions } from '../../../agent/types.js';
|
||||
import type { Initiative, Project, Phase, Task } from '../../../db/schema.js';
|
||||
import type { InitiativeRepository } from '../../../db/repositories/initiative-repository.js';
|
||||
import type { PhaseRepository } from '../../../db/repositories/phase-repository.js';
|
||||
import type { TaskRepository } from '../../../db/repositories/task-repository.js';
|
||||
import type { MessageRepository } from '../../../db/repositories/message-repository.js';
|
||||
import type { AgentRepository } from '../../../db/repositories/agent-repository.js';
|
||||
import type { PageRepository } from '../../../db/repositories/page-repository.js';
|
||||
import type { ProjectRepository } from '../../../db/repositories/project-repository.js';
|
||||
import type { AccountRepository } from '../../../db/repositories/account-repository.js';
|
||||
import type { ChangeSetRepository } from '../../../db/repositories/change-set-repository.js';
|
||||
import type { LogChunkRepository } from '../../../db/repositories/log-chunk-repository.js';
|
||||
import type { ConversationRepository } from '../../../db/repositories/conversation-repository.js';
|
||||
import type { ProcessManager } from '../../../agent/process-manager.js';
|
||||
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
|
||||
import { createRepositories } from '../../../container.js';
|
||||
import { DefaultDispatchManager } from '../../../dispatch/manager.js';
|
||||
import { appRouter, createCallerFactory } from '../../../trpc/router.js';
|
||||
import { createContext } from '../../../trpc/context.js';
|
||||
|
||||
// =============================================================================
|
||||
// CapturingEventBus
|
||||
// =============================================================================
|
||||
|
||||
export class CapturingEventBus extends EventEmitterBus {
|
||||
emittedEvents: DomainEvent[] = [];
|
||||
|
||||
emit<T extends DomainEvent>(event: T): void {
|
||||
this.emittedEvents.push(event);
|
||||
super.emit(event);
|
||||
}
|
||||
|
||||
getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
|
||||
return this.emittedEvents.filter((e) => e.type === type) as T[];
|
||||
}
|
||||
|
||||
clearEvents(): void {
|
||||
this.emittedEvents = [];
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Sleep helper
|
||||
// =============================================================================
|
||||
|
||||
export function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// tRPC caller type
|
||||
// =============================================================================
|
||||
|
||||
// Caller factory bound to the full application router. FullFlowCaller is the
// type of a ready-to-use tRPC caller with every procedure available.
const createCaller = createCallerFactory(appRouter);
export type FullFlowCaller = ReturnType<typeof createCaller>;
|
||||
|
||||
// =============================================================================
|
||||
// FullFlowHarness interface
|
||||
// =============================================================================
|
||||
|
||||
/** Status of an agent that requires attention: done, waiting for answers, or crashed */
export type AgentAttentionStatus = 'done' | 'waiting' | 'crashed';

/**
 * Everything a full-flow integration test needs in one object: temp dirs,
 * the seeded project/initiative, a tRPC caller, the real agent manager,
 * all repositories, and polling helpers for driving agents to completion.
 */
export interface FullFlowHarness {
  /** Absolute path to the CW workspace (worktrees are created here) */
  workspaceRoot: string;
  /** Absolute path to the cloned todo-api fixture git repo */
  fixtureRoot: string;
  /** The registered todo-api project */
  project: Project;
  /** The initiative created for the test run */
  initiative: Initiative;
  /** tRPC caller (all procedures available) */
  caller: FullFlowCaller;
  /** Real MultiProviderAgentManager */
  agentManager: MultiProviderAgentManager;
  /** In-memory SQLite database */
  db: DrizzleDatabase;
  /** Event bus with capture capability */
  eventBus: CapturingEventBus;

  // All 11 repositories
  initiativeRepository: InitiativeRepository;
  phaseRepository: PhaseRepository;
  taskRepository: TaskRepository;
  messageRepository: MessageRepository;
  agentRepository: AgentRepository;
  pageRepository: PageRepository;
  projectRepository: ProjectRepository;
  accountRepository: AccountRepository;
  changeSetRepository: ChangeSetRepository;
  logChunkRepository: LogChunkRepository;
  conversationRepository: ConversationRepository;

  /**
   * Wait for an agent to reach a terminal status (idle/stopped/crashed).
   * Returns null if the agent enters waiting_for_input.
   */
  waitForAgentCompletion(agentId: string, timeoutMs?: number): Promise<AgentResult | null>;

  /**
   * Poll until the agent needs attention: done (idle/stopped), waiting for input, or crashed.
   * Useful for the question/answer loop in discuss mode.
   */
  waitForAgentAttention(agentId: string, timeoutMs?: number): Promise<AgentAttentionStatus>;

  /**
   * Drive an agent to full completion, answering any questions along the way.
   * Answers all questions with the provided answer string (or a default).
   */
  driveToCompletion(
    agentId: string,
    answer?: string,
    timeoutMs?: number,
  ): Promise<AgentResult | null>;

  /**
   * Get captured events filtered by type.
   */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[];

  /**
   * Kill all running agents and remove temp directories.
   */
  cleanup(): Promise<void>;
}
|
||||
|
||||
// =============================================================================
// Poll interval
// =============================================================================

// How often the harness helpers re-poll agent status from the DB (ms).
const POLL_INTERVAL_MS = 1500;

// =============================================================================
// Factory
// =============================================================================

const __dirname = dirname(fileURLToPath(import.meta.url));
// Fixture source copied into a temp git repo by createFullFlowHarness().
const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');

export interface FullFlowHarnessOptions {
  /** Factory called after workspaceRoot + repos are created. Return a custom ProcessManager. */
  processManagerFactory?: (workspaceRoot: string, projectRepo: ProjectRepository) => ProcessManager;
}
|
||||
|
||||
/**
|
||||
* Create a full-flow test harness.
|
||||
*
|
||||
* Setup steps:
|
||||
* 1. Copy todo-api fixture into a temp git repo (fixtureRoot).
|
||||
* 2. Create workspace temp dir (workspaceRoot) for CW operations.
|
||||
* 3. Init in-memory DB + all 11 repos.
|
||||
* 4. Wire real MultiProviderAgentManager with all repos.
|
||||
* 5. Wire DefaultDispatchManager for execute stage.
|
||||
* 6. Create tRPC caller with full context.
|
||||
* 7. Register project in DB directly (url = fixtureRoot).
|
||||
* 8. Create initiative via tRPC (links project, creates root page).
|
||||
*/
|
||||
export async function createFullFlowHarness(
  initiativeName = 'Add complete() method to TodoStore',
  options?: FullFlowHarnessOptions,
): Promise<FullFlowHarness> {
  // ── 0. Allow nested claude invocations ────────────────────────────────────
  // Claude Code sets CLAUDECODE in the environment, which prevents nested
  // claude CLI calls from starting ("cannot be launched inside another Claude
  // Code session"). Save and remove it so spawned agents can run normally.
  // It is restored in cleanup().
  const savedClaudeCodeEnv = process.env.CLAUDECODE;
  delete process.env.CLAUDECODE;

  // ── 1. Fixture project ────────────────────────────────────────────────────
  // IMPORTANT: cp(src, dest) puts src INSIDE dest when dest already exists
  // (like `cp -r src dest/` → creates dest/src/). We need dest to NOT exist
  // yet so that cp creates it as a copy of src directly.
  const fixtureBase = await mkdtemp(join(tmpdir(), 'cw-fixture-'));
  const fixtureRoot = join(fixtureBase, 'todo-api'); // does not exist yet
  await cp(FIXTURES_DIR, fixtureRoot, { recursive: true });

  // Verify files landed at the right level before git operations
  execSync(`test -f "${join(fixtureRoot, 'package.json')}"`, { stdio: 'pipe' });

  // Turn the copied fixture into a real git repo with one initial commit.
  execSync('git init', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git add .', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git commit -m "initial todo-api with missing complete()"', {
    cwd: fixtureRoot,
    stdio: 'pipe',
  });

  // ── 2. Workspace root ─────────────────────────────────────────────────────
  // Just a plain temp directory — agent worktrees live under repos/ inside it.
  // No git init needed; the PROJECT clone (repos/<name>-<id>/) is the git repo.
  const workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-workspace-'));

  // ── 3. Database + repositories ────────────────────────────────────────────
  const db = createTestDatabase();
  const repos = createRepositories(db);

  // ── 4. Event bus ──────────────────────────────────────────────────────────
  const eventBus = new CapturingEventBus();

  // ── 5. Real agent manager ─────────────────────────────────────────────────
  const customProcessManager = options?.processManagerFactory?.(workspaceRoot, repos.projectRepository);
  const agentManager = new MultiProviderAgentManager(
    repos.agentRepository,
    workspaceRoot,
    repos.projectRepository,
    repos.accountRepository,
    eventBus,
    undefined, // no credential manager needed for default claude account
    repos.changeSetRepository,
    repos.phaseRepository,
    repos.taskRepository,
    repos.pageRepository,
    repos.logChunkRepository,
    false, // debug
    customProcessManager, // processManagerOverride
  );

  // ── 6. Dispatch manager (for execute stage) ───────────────────────────────
  const dispatchManager = new DefaultDispatchManager(
    repos.taskRepository,
    repos.messageRepository,
    agentManager,
    eventBus,
    repos.initiativeRepository,
    repos.phaseRepository,
  );

  // ── 7. tRPC caller ────────────────────────────────────────────────────────
  const ctx = createContext({
    eventBus,
    serverStartedAt: new Date(),
    processCount: 0,
    agentManager,
    dispatchManager,
    workspaceRoot,
    ...repos,
  });
  const caller = createCaller(ctx);

  // ── 8. Register project directly in DB (bypass tRPC clone) ───────────────
  const project = await repos.projectRepository.create({
    name: 'todo-api',
    url: fixtureRoot,
  });

  // ── 9. Create initiative via tRPC (creates root page automatically) ───────
  const initiative = await caller.createInitiative({
    name: initiativeName,
    projectIds: [project.id],
  });

  // ── Helpers ───────────────────────────────────────────────────────────────

  // Poll the DB until the agent hits a terminal status; returns null if the
  // agent is missing or enters waiting_for_input; throws on timeout.
  async function waitForAgentCompletion(
    agentId: string,
    timeoutMs = 120_000,
  ): Promise<AgentResult | null> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const agent = await repos.agentRepository.findById(agentId);
      if (!agent) return null;
      if (agent.status === 'idle' || agent.status === 'stopped' || agent.status === 'crashed') {
        return agentManager.getResult(agentId);
      }
      if (agent.status === 'waiting_for_input') return null;
      await sleep(POLL_INTERVAL_MS);
    }
    throw new Error(`Timeout: agent ${agentId} did not complete within ${timeoutMs}ms`);
  }

  // Poll until the agent needs attention: done (idle/stopped), waiting, or crashed.
  async function waitForAgentAttention(
    agentId: string,
    timeoutMs = 120_000,
  ): Promise<AgentAttentionStatus> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const agent = await repos.agentRepository.findById(agentId);
      if (!agent) return 'crashed';
      if (agent.status === 'idle' || agent.status === 'stopped') return 'done';
      if (agent.status === 'crashed') return 'crashed';
      if (agent.status === 'waiting_for_input') return 'waiting';
      await sleep(POLL_INTERVAL_MS);
    }
    throw new Error(`Timeout: agent ${agentId} did not reach attention state within ${timeoutMs}ms`);
  }

  // Repeatedly answer pending questions with `answer` until the agent
  // finishes (done/crashed) or the overall deadline passes.
  async function driveToCompletion(
    agentId: string,
    answer = 'Use your best judgment and keep it simple.',
    timeoutMs = 10 * 60_000,
  ): Promise<AgentResult | null> {
    const deadline = Date.now() + timeoutMs;

    while (Date.now() < deadline) {
      const remaining = deadline - Date.now();
      if (remaining <= 0) break;

      let status: AgentAttentionStatus;
      try {
        status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
      } catch {
        // Agent is still running (hasn't reached an attention state within the polling
        // window). This is normal for long-running execute agents. Continue the outer
        // loop — the deadline check above will terminate us if we truly time out.
        continue;
      }

      if (status === 'done' || status === 'crashed') {
        return agentManager.getResult(agentId);
      }

      if (status === 'waiting') {
        const pending = await agentManager.getPendingQuestions(agentId);
        if (!pending || pending.questions.length === 0) {
          // Shouldn't happen, but guard against it
          await sleep(POLL_INTERVAL_MS);
          continue;
        }
        // Answer every pending question with the same canned answer.
        const answers = Object.fromEntries(
          pending.questions.map((q) => [q.id, answer]),
        );
        await agentManager.resume(agentId, answers);
      }
    }

    throw new Error(`driveToCompletion: agent ${agentId} did not finish within ${timeoutMs}ms`);
  }

  // ── Build and return harness ───────────────────────────────────────────────

  const harness: FullFlowHarness = {
    workspaceRoot,
    fixtureRoot,
    project,
    initiative,
    caller,
    agentManager,
    db,
    eventBus,
    ...repos,

    waitForAgentCompletion,
    waitForAgentAttention,
    driveToCompletion,

    getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
      return eventBus.getEventsByType<T>(type);
    },

    async cleanup() {
      // Kill any running agents
      const agents = await repos.agentRepository.findAll();
      await Promise.allSettled(
        agents
          .filter((a) => a.status === 'running')
          .map((a) => agentManager.stop(a.id)),
      );
      // Restore CLAUDECODE env var
      if (savedClaudeCodeEnv !== undefined) {
        process.env.CLAUDECODE = savedClaudeCodeEnv;
      }
      // Remove temp directories (fixtureBase contains fixtureRoot)
      await Promise.allSettled([
        rm(fixtureBase, { recursive: true, force: true }),
        rm(workspaceRoot, { recursive: true, force: true }),
      ]);
    },
  };

  return harness;
}
|
||||
156
apps/server/test/integration/full-flow/report.ts
Normal file
156
apps/server/test/integration/full-flow/report.ts
Normal file
@@ -0,0 +1,156 @@
|
||||
/**
|
||||
* Full-Flow Test Report Utility
|
||||
*
|
||||
* Plain console.log formatters for human-readable output at each stage of the
|
||||
* full-flow integration test. No external dependencies.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import { join } from 'node:path';
|
||||
import type { Phase, Task } from '../../../db/schema.js';
|
||||
import type { AgentResult } from '../../../agent/types.js';
|
||||
|
||||
// =============================================================================
|
||||
// Types
|
||||
// =============================================================================
|
||||
|
||||
/**
 * A task paired with the agent result that executed it.
 * `result` may be null when no result was produced (e.g. the agent crashed).
 */
export interface ExecutedTask {
  task: Task;
  result: AgentResult | null;
}
|
||||
|
||||
// =============================================================================
|
||||
// Helpers
|
||||
// =============================================================================
|
||||
|
||||
const DIVIDER = '═'.repeat(60);
|
||||
const THIN = '─'.repeat(60);
|
||||
|
||||
function section(title: string): void {
|
||||
console.log(`\n${DIVIDER}`);
|
||||
console.log(` ${title}`);
|
||||
console.log(DIVIDER);
|
||||
}
|
||||
|
||||
function line(msg: string): void {
|
||||
console.log(` ${msg}`);
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Stage reporters
|
||||
// =============================================================================
|
||||
|
||||
export function printHeader(initiativeName: string): void {
|
||||
section(`FULL-FLOW TEST: ${initiativeName}`);
|
||||
console.log(` Started at: ${new Date().toISOString()}`);
|
||||
}
|
||||
|
||||
export function printDiscussResult(agentId: string, result: AgentResult | null): void {
|
||||
console.log(`\n[DISCUSS]`);
|
||||
console.log(THIN);
|
||||
line(`Agent: ${agentId}`);
|
||||
if (result) {
|
||||
line(`Success: ${result.success}`);
|
||||
if (result.message) line(`Message: ${result.message.slice(0, 200)}`);
|
||||
} else {
|
||||
line('Result: null (agent may have crashed)');
|
||||
}
|
||||
}
|
||||
|
||||
export function printPlanResult(phases: Phase[]): void {
|
||||
console.log(`\n[PLAN] ${phases.length} phase(s) created`);
|
||||
console.log(THIN);
|
||||
phases.forEach((ph, i) => {
|
||||
line(`${i + 1}. ${ph.name}`);
|
||||
});
|
||||
}
|
||||
|
||||
export function printDetailResult(phase: Phase, tasks: Task[]): void {
|
||||
console.log(`\n[DETAIL] Phase "${phase.name}" → ${tasks.length} task(s)`);
|
||||
console.log(THIN);
|
||||
tasks.forEach((t, i) => {
|
||||
const flags = [t.category, t.type, t.requiresApproval ? 'approval-required' : 'auto'].join(', ');
|
||||
line(`${i + 1}. ${t.name} [${flags}]`);
|
||||
if (t.description) {
|
||||
line(` ${t.description.slice(0, 120)}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
export function printExecuteResult(executed: ExecutedTask[]): void {
|
||||
const succeeded = executed.filter((e) => e.result?.success).length;
|
||||
console.log(`\n[EXECUTE] ${succeeded}/${executed.length} task(s) succeeded`);
|
||||
console.log(THIN);
|
||||
for (const { task, result } of executed) {
|
||||
const icon = result?.success ? '✓' : '✗';
|
||||
line(`${icon} ${task.name}`);
|
||||
if (result && !result.success) {
|
||||
line(` Error: ${result.message?.slice(0, 120)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Print `git diff HEAD~1 --stat` for every agent worktree found under
 * `<workspaceRoot>/agent-workdirs/<dir>/<projectName>`.
 * Best-effort: missing directories and commit-less worktrees are skipped silently.
 */
export function printGitDiff(workspaceRoot: string, projectName: string): void {
  console.log('\n[GIT DIFF — agent worktrees]');
  console.log(THIN);

  // Find all agent worktrees for this project
  const worktreesBase = join(workspaceRoot, 'agent-workdirs');
  try {
    // `|| echo ""` keeps execSync from throwing when the directory is absent;
    // the empty string is removed by .filter(Boolean) below.
    const dirs = execSync(`ls "${worktreesBase}" 2>/dev/null || echo ""`, { encoding: 'utf8' })
      .trim()
      .split('\n')
      .filter(Boolean);

    for (const dir of dirs) {
      const projectDir = join(worktreesBase, dir, projectName);
      try {
        // Same trick: suppress git errors (e.g. no HEAD~1) via the shell.
        const stat = execSync(`git -C "${projectDir}" diff HEAD~1 --stat 2>/dev/null || echo ""`, {
          encoding: 'utf8',
        }).trim();
        if (stat) {
          line(`Worktree: ${dir}/${projectName}`);
          stat.split('\n').forEach((l) => line(`  ${l}`));
        }
      } catch {
        // Worktree might not have commits — skip silently
      }
    }
  } catch {
    line('(no agent worktrees found)');
  }
}
|
||||
|
||||
export function printNpmTestResult(projectDir: string): void {
|
||||
console.log('\n[NPM TEST]');
|
||||
console.log(THIN);
|
||||
try {
|
||||
const output = execSync('node --test src/todo.test.js', {
|
||||
cwd: projectDir,
|
||||
encoding: 'utf8',
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
});
|
||||
line('Tests passed:');
|
||||
output.split('\n').forEach((l) => line(` ${l}`));
|
||||
} catch (err: unknown) {
|
||||
const e = err as { stdout?: string; stderr?: string; status?: number };
|
||||
line(`Tests FAILED (exit ${e.status ?? '?'})`);
|
||||
if (e.stdout) e.stdout.split('\n').forEach((l) => line(` ${l}`));
|
||||
if (e.stderr) e.stderr.split('\n').forEach((l) => line(` ${l}`));
|
||||
}
|
||||
}
|
||||
|
||||
export function printFinalSummary(
|
||||
initiativeName: string,
|
||||
phases: Phase[],
|
||||
tasks: Task[],
|
||||
executed: ExecutedTask[],
|
||||
durationMs: number,
|
||||
): void {
|
||||
section(`SUMMARY: ${initiativeName}`);
|
||||
line(`Duration : ${Math.round(durationMs / 1000)}s`);
|
||||
line(`Phases : ${phases.length}`);
|
||||
line(`Tasks : ${tasks.length}`);
|
||||
line(`Executed : ${executed.filter((e) => e.result?.success).length}/${executed.length} succeeded`);
|
||||
console.log(DIVIDER);
|
||||
}
|
||||
183
apps/server/test/integration/real-claude.test.ts
Normal file
183
apps/server/test/integration/real-claude.test.ts
Normal file
@@ -0,0 +1,183 @@
|
||||
/**
|
||||
* Real Claude CLI Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the real Claude CLI and incur API costs.
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts --test-timeout=120000
|
||||
* ```
|
||||
*
|
||||
* Purpose:
|
||||
* - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
|
||||
* - Confirm MockAgentManager accurately simulates real CLI behavior
|
||||
* - Document actual response structure and costs
|
||||
*
|
||||
* Updated (2026-02-06): Now uses the universal agentSignalSchema instead of
|
||||
* per-mode schemas. Agents output trivial signals (done/questions/error) and
|
||||
* write files instead of producing mode-specific JSON.
|
||||
*
|
||||
* Total validation cost: ~$0.10 (3 tests)
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll } from 'vitest';
|
||||
import { execa } from 'execa';
|
||||
import {
|
||||
agentSignalJsonSchema,
|
||||
agentSignalSchema,
|
||||
} from '../../agent/schema.js';
|
||||
|
||||
/**
 * Result structure from Claude CLI with --output-format json
 *
 * When --json-schema is used:
 * - result: "" (empty string)
 * - structured_output: { ... } (the validated JSON object)
 */
interface ClaudeCliResult {
  type: 'result';
  subtype: 'success' | 'error' | 'error_max_turns';
  is_error: boolean;
  session_id: string;
  result: string; // empty when --json-schema is in effect
  structured_output?: unknown; // schema-validated output, if present
  total_cost_usd?: number; // API cost of the run, if reported by the CLI
}
|
||||
|
||||
/**
 * Helper to call Claude CLI directly with a prompt and JSON schema.
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output
 * @param timeoutMs - Timeout in milliseconds (default 90s)
 * @returns Parsed CLI result with structured_output
 */
async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
  timeoutMs = 90000
): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startTime = Date.now();

  const { stdout } = await execa(
    'claude',
    [
      '-p',
      prompt,
      '--output-format',
      'json',
      '--json-schema',
      JSON.stringify(jsonSchema),
    ],
    {
      timeout: timeoutMs,
    }
  );

  const duration = Date.now() - startTime;
  const cliResult: ClaudeCliResult = JSON.parse(stdout);

  // Human-readable diagnostics for the manual test run.
  console.log(`\n  Duration: ${(duration / 1000).toFixed(1)}s`);
  console.log(`  Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(`  Session ID: ${cliResult.session_id}`);
  console.log(`  Result field empty: ${cliResult.result === ''}`);
  console.log(`  Has structured_output: ${cliResult.structured_output !== undefined}`);

  // When --json-schema is used, structured output is in structured_output field
  // The result field is typically empty when using --json-schema
  // NOTE(review): if structured_output is absent AND result is empty, the
  // JSON.parse fallback below throws — acceptable for a manual test helper.
  const structuredOutput = cliResult.structured_output ?? JSON.parse(cliResult.result);

  return { cliResult, structuredOutput };
}
|
||||
|
||||
/**
 * Check if real Claude tests should run.
 * Set REAL_CLAUDE_TESTS=1 environment variable to enable.
 */
const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';

/**
 * Skip wrapper - tests are expensive and should run manually
 */
const describeReal = shouldRunRealTests ? describe : describe.skip;

// Individual test timeout - real API calls take 5-30 seconds
const TEST_TIMEOUT = 120000; // 2 minutes
|
||||
|
||||
// Entire suite is skipped unless REAL_CLAUDE_TESTS=1 (see describeReal above).
describeReal('Real Claude CLI Integration', () => {
  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
  });

  describe('Universal Signal Schema', () => {
    // Happy path: agent completes trivially and signals "done".
    it(
      'should return done status',
      async () => {
        const prompt = `Complete this simple task: Say "Hello, World!" as a test.

Output your response in the required JSON format with status "done".`;

        const { cliResult, structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Verify the CLI response structure
        expect(cliResult.subtype).toBe('success');
        expect(cliResult.result).toBe(''); // Empty when using --json-schema
        expect(cliResult.structured_output).toBeDefined();

        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('done');
      },
      TEST_TIMEOUT
    );

    // Question flow: a deliberately vague task should yield "questions".
    it(
      'should return questions status with array',
      async () => {
        const prompt = `You are working on a vague task: "Make it better"

You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.

Output your response with status "questions" and include at least 2 questions with unique IDs.`;

        const { structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('questions');
        if (parsed.status === 'questions') {
          expect(Array.isArray(parsed.questions)).toBe(true);
          expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.questions[0].id).toBeTruthy();
          expect(parsed.questions[0].question).toBeTruthy();
        }
      },
      TEST_TIMEOUT
    );

    // Error flow: agent reports an unrecoverable failure via "error".
    it(
      'should return error status',
      async () => {
        const prompt = `You have encountered an unrecoverable error. Output your response with status "error" and a descriptive error message.`;

        const { structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);

        console.log('  Output:', JSON.stringify(structuredOutput, null, 2));

        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('error');
        if (parsed.status === 'error') {
          expect(parsed.error).toBeTruthy();
        }
      },
      TEST_TIMEOUT
    );
  });
});
|
||||
@@ -0,0 +1,298 @@
|
||||
/**
|
||||
* Real Claude CLI Manager Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/claude-manager.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Output stream parsing (text_delta events)
|
||||
* - Session ID extraction from init event
|
||||
* - Result parsing and validation
|
||||
* - Session resume with user answers
|
||||
*
|
||||
* Estimated cost: ~$0.10 per full run
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
REAL_TEST_TIMEOUT,
|
||||
sleep,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { MINIMAL_PROMPTS } from './prompts.js';
|
||||
import type { AgentSpawnedEvent, AgentStoppedEvent, AgentOutputEvent } from '../../../events/types.js';
|
||||
|
||||
describeRealClaude('Real Claude Manager Integration', () => {
|
||||
let harness: RealProviderHarness;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Running Real Claude Manager Tests ===');
|
||||
console.log('These tests call the real Claude API and incur costs.\n');
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await harness.cleanup();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
harness.clearEvents();
|
||||
});
|
||||
|
||||
describe('Output Parsing', () => {
|
||||
it(
|
||||
'parses text_delta events from stream',
|
||||
async () => {
|
||||
// Spawn agent with streaming prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.streaming,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
expect(agent.id).toBeTruthy();
|
||||
expect(agent.status).toBe('running');
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify we got output events
|
||||
const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
|
||||
console.log(' Output events:', outputEvents.length);
|
||||
|
||||
// Verify completion
|
||||
expect(result).toBeTruthy();
|
||||
console.log(' Result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'parses init event and extracts session ID',
|
||||
async () => {
|
||||
// Spawn agent with simple done prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify session ID was extracted and persisted
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.sessionId).toBeTruthy();
|
||||
expect(dbAgent?.sessionId).toMatch(/^[a-f0-9-]+$/);
|
||||
|
||||
console.log(' Session ID:', dbAgent?.sessionId);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'parses result event with completion',
|
||||
async () => {
|
||||
// Spawn agent with simple done prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify result was parsed
|
||||
expect(result).toBeTruthy();
|
||||
expect(result?.success).toBe(true);
|
||||
expect(result?.message).toBeTruthy();
|
||||
|
||||
// Verify events
|
||||
const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
|
||||
expect(spawnedEvents.length).toBe(1);
|
||||
expect(spawnedEvents[0].payload.agentId).toBe(agent.id);
|
||||
expect(spawnedEvents[0].payload.provider).toBe('claude');
|
||||
|
||||
const stoppedEvents = harness.getEventsByType<AgentStoppedEvent>('agent:stopped');
|
||||
expect(stoppedEvents.length).toBe(1);
|
||||
expect(stoppedEvents[0].payload.agentId).toBe(agent.id);
|
||||
expect(stoppedEvents[0].payload.reason).toBe('task_complete');
|
||||
|
||||
console.log(' Result message:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Questions Flow', () => {
  it(
    'parses questions status and enters waiting_for_input',
    async () => {
      // Spawn an agent whose prompt instructs it to ask questions rather
      // than finish, which should drive the manager's waiting_for_input path.
      const agent = await harness.agentManager.spawn({
        taskId: null,
        prompt: MINIMAL_PROMPTS.questions,
        mode: 'execute',
        provider: 'claude',
      });

      // Block until the harness observes the agent parked on user input.
      const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);

      // The parsed payload must contain at least one well-formed question
      // (non-empty id and question text).
      expect(questions).toBeTruthy();
      expect(questions?.questions).toBeTruthy();
      expect(questions?.questions.length).toBeGreaterThan(0);
      expect(questions?.questions[0].id).toBeTruthy();
      expect(questions?.questions[0].question).toBeTruthy();

      // The persisted agent row must reflect the waiting state and have a
      // session ID captured — required for a later resume.
      const dbAgent = await harness.agentRepository.findById(agent.id);
      expect(dbAgent?.status).toBe('waiting_for_input');
      expect(dbAgent?.sessionId).toBeTruthy();

      console.log(' Questions:', questions?.questions.length);
      console.log(' First question:', questions?.questions[0].question);
    },
    REAL_TEST_TIMEOUT
  );
});
|
||||
|
||||
describe('Session Resume', () => {
|
||||
it(
|
||||
'resumes session with user answers',
|
||||
async () => {
|
||||
// 1. Spawn agent that asks questions
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.questions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
expect(questions?.questions.length).toBeGreaterThan(0);
|
||||
|
||||
const sessionIdBeforeResume = (await harness.agentRepository.findById(agent.id))?.sessionId;
|
||||
console.log(' Session ID before resume:', sessionIdBeforeResume);
|
||||
console.log(' Questions received:', questions?.questions.map((q) => q.id).join(', '));
|
||||
|
||||
harness.clearEvents();
|
||||
|
||||
// 3. Resume with answer
|
||||
const answers: Record<string, string> = {};
|
||||
for (const q of questions?.questions ?? []) {
|
||||
answers[q.id] = `Answer to ${q.id}`;
|
||||
}
|
||||
|
||||
await harness.agentManager.resume(agent.id, answers);
|
||||
|
||||
// 4. Wait for completion or another waiting state
|
||||
let attempts = 0;
|
||||
let finalStatus = 'running';
|
||||
while (attempts < 60) {
|
||||
const agent2 = await harness.agentRepository.findById(agent.id);
|
||||
if (agent2?.status !== 'running') {
|
||||
finalStatus = agent2?.status ?? 'unknown';
|
||||
break;
|
||||
}
|
||||
await sleep(1000);
|
||||
attempts++;
|
||||
}
|
||||
|
||||
// Verify the agent processed the resume (either completed or asked more questions)
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' Final status:', dbAgent?.status);
|
||||
|
||||
// Agent should not still be running
|
||||
expect(['idle', 'waiting_for_input', 'crashed']).toContain(dbAgent?.status);
|
||||
|
||||
// If idle, verify result
|
||||
if (dbAgent?.status === 'idle') {
|
||||
const result = await harness.agentManager.getResult(agent.id);
|
||||
console.log(' Result:', result?.message);
|
||||
expect(result).toBeTruthy();
|
||||
}
|
||||
},
|
||||
REAL_TEST_TIMEOUT * 2 // Double timeout for two-step process
|
||||
);
|
||||
|
||||
it(
|
||||
'maintains session continuity across resume',
|
||||
async () => {
|
||||
// 1. Spawn agent that asks questions
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.questions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
expect(questions?.questions.length).toBeGreaterThan(0);
|
||||
|
||||
const sessionIdBefore = (await harness.agentRepository.findById(agent.id))?.sessionId;
|
||||
expect(sessionIdBefore).toBeTruthy();
|
||||
|
||||
// 3. Resume with answer
|
||||
const answers: Record<string, string> = {};
|
||||
for (const q of questions?.questions ?? []) {
|
||||
answers[q.id] = `Answer to ${q.id}`;
|
||||
}
|
||||
|
||||
await harness.agentManager.resume(agent.id, answers);
|
||||
|
||||
// 4. Wait for completion
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify session ID exists (may be same or new depending on CLI behavior)
|
||||
const sessionIdAfter = (await harness.agentRepository.findById(agent.id))?.sessionId;
|
||||
expect(sessionIdAfter).toBeTruthy();
|
||||
|
||||
console.log(' Session ID before:', sessionIdBefore);
|
||||
console.log(' Session ID after:', sessionIdAfter);
|
||||
},
|
||||
REAL_TEST_TIMEOUT * 2
|
||||
);
|
||||
});
|
||||
|
||||
describe('Error Handling', () => {
|
||||
it(
|
||||
'handles error status',
|
||||
async () => {
|
||||
// Spawn agent with error prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.error,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion (will be crashed)
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify error was handled
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('crashed');
|
||||
expect(result?.success).toBe(false);
|
||||
expect(result?.message).toContain('Test error');
|
||||
|
||||
console.log(' Error message:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,172 @@
|
||||
/**
|
||||
* Real Codex CLI Manager Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Codex CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
 * REAL_CODEX_TESTS=1 npm test -- apps/server/test/integration/real-providers/codex-manager.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Codex spawn and thread_id extraction
|
||||
* - Generic output parsing (non-schema)
|
||||
* - Streaming output
|
||||
*
|
||||
* Estimated cost: ~$0.10 per full run
|
||||
*
|
||||
* Note: Codex uses different output format and session ID field (thread_id).
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealCodex,
|
||||
REAL_TEST_TIMEOUT,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { CODEX_PROMPTS } from './prompts.js';
|
||||
import type { AgentSpawnedEvent, AgentOutputEvent } from '../../../events/types.js';
|
||||
|
||||
describeRealCodex('Real Codex Manager Integration', () => {
  let harness: RealProviderHarness;

  // One harness per suite: creating it is expensive, and each test only
  // needs a clean event log (see beforeEach), not a fresh workspace.
  beforeAll(async () => {
    console.log('\n=== Running Real Codex Manager Tests ===');
    console.log('These tests call the real Codex API and incur costs.\n');
    harness = await createRealProviderHarness({ provider: 'codex' });
  });

  afterAll(async () => {
    await harness.cleanup();
  });

  // Reset captured events so per-test event assertions see only their own run.
  beforeEach(() => {
    harness.clearEvents();
  });

  describe('Codex Spawn', () => {
    it(
      'spawns codex agent and extracts thread_id',
      async () => {
        // Spawn agent with simple task
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.done,
          mode: 'execute',
          provider: 'codex',
        });

        expect(agent.id).toBeTruthy();
        expect(agent.provider).toBe('codex');
        expect(agent.status).toBe('running');

        // Exactly one spawned event, tagged with the codex provider.
        const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
        expect(spawnedEvents.length).toBe(1);
        expect(spawnedEvents[0].payload.provider).toBe('codex');

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Verify session ID (thread_id) was extracted — Codex calls its
        // session "thread_id" and reports it in the thread.started event.
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Thread ID:', dbAgent?.sessionId);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message);

        // Codex should complete or crash — never be left running.
        expect(['idle', 'crashed']).toContain(dbAgent?.status);

        // If completed successfully, should have extracted thread_id
        if (dbAgent?.status === 'idle' && dbAgent?.sessionId) {
          expect(dbAgent.sessionId).toBeTruthy();
        }
      },
      REAL_TEST_TIMEOUT
    );

    it(
      'uses generic parser for output',
      async () => {
        // Spawn agent with streaming prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.streaming,
          mode: 'execute',
          provider: 'codex',
        });

        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);

        // Verify output events were captured during streaming.
        const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
        console.log(' Output events:', outputEvents.length);

        // For the generic (non-schema) parser, the result should still be
        // captured even without JSON-schema validation.
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message?.substring(0, 100) + '...');

        expect(['idle', 'crashed']).toContain(dbAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });

  describe('Codex Provider Config', () => {
    it(
      'uses correct command and args for codex',
      async () => {
        // This is more of a config verification test:
        // the actual command execution is validated by the spawn test above.

        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: 'Say hello',
          mode: 'execute',
          provider: 'codex',
        });

        // Verify agent was created with codex provider
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.provider).toBe('codex');

        // Wait for completion (or timeout)
        try {
          await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        } catch {
          // Codex might fail if not installed, that's OK for config test
        }

        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Provider:', finalAgent?.provider);
        console.log(' Status:', finalAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });
});
|
||||
|
||||
/**
|
||||
* Codex-specific observations from testing:
|
||||
*
|
||||
* 1. Output Format:
|
||||
* - Codex uses JSONL streaming with different event types
|
||||
* - thread.started event contains thread_id
|
||||
* - Output parsing is more generic (not JSON schema validated)
|
||||
*
|
||||
* 2. Command Structure:
|
||||
* - codex exec --full-auto --json -p "prompt"
|
||||
* - resume: codex exec resume <thread_id>
|
||||
*
|
||||
* 3. Session ID:
|
||||
* - Called "thread_id" in Codex
|
||||
* - Extracted from thread.started event
|
||||
*
|
||||
* 4. Resume:
|
||||
* - Uses subcommand style: codex exec resume <thread_id>
|
||||
* - Different from Claude's flag style: claude --resume <session_id>
|
||||
*/
|
||||
540
apps/server/test/integration/real-providers/conversation.test.ts
Normal file
540
apps/server/test/integration/real-providers/conversation.test.ts
Normal file
@@ -0,0 +1,540 @@
|
||||
/**
|
||||
* Real Claude Inter-Agent Conversation Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run:
|
||||
* ```bash
|
||||
 * REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/conversation.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Architecture:
|
||||
* - Mock conversation server (only cw listen/ask/answer endpoints, no full CoordinationServer)
|
||||
* - In-memory ConversationRepository (no SQLite, no FK constraints)
|
||||
* - Real agent harness for spawning two Claude sessions with actual coding tasks
|
||||
* - Two sequential questions prove the listen→answer→re-listen cycle works
|
||||
*
|
||||
* Estimated cost: ~$0.30 per full run (two Claude sessions)
|
||||
*/
|
||||
|
||||
import { it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { createServer } from 'node:http';
|
||||
import type { Server } from 'node:http';
|
||||
import { readFileSync, existsSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { nanoid } from 'nanoid';
|
||||
import { fetchRequestHandler } from '@trpc/server/adapters/fetch';
|
||||
import { router, publicProcedure } from '../../../trpc/trpc.js';
|
||||
import { conversationProcedures } from '../../../trpc/routers/conversation.js';
|
||||
import { EventEmitterBus } from '../../../events/bus.js';
|
||||
import type { ConversationRepository, CreateConversationData } from '../../../db/repositories/conversation-repository.js';
|
||||
import type { Conversation } from '../../../db/schema.js';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
sleep,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
|
||||
// Per-test budget: generous because each test runs two real Claude sessions
// doing actual coding work plus ask/answer round-trips.
const TEST_TIMEOUT = 300000; // 5 minutes — agents do real coding + conversation
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// In-memory ConversationRepository — no SQLite, no FK constraints
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
class InMemoryConversationRepository implements ConversationRepository {
|
||||
private store = new Map<string, Conversation>();
|
||||
|
||||
async create(data: CreateConversationData): Promise<Conversation> {
|
||||
const now = new Date();
|
||||
const conversation: Conversation = {
|
||||
id: nanoid(),
|
||||
fromAgentId: data.fromAgentId,
|
||||
toAgentId: data.toAgentId,
|
||||
initiativeId: data.initiativeId ?? null,
|
||||
phaseId: data.phaseId ?? null,
|
||||
taskId: data.taskId ?? null,
|
||||
question: data.question,
|
||||
answer: null,
|
||||
status: 'pending',
|
||||
createdAt: now,
|
||||
updatedAt: now,
|
||||
};
|
||||
this.store.set(conversation.id, conversation);
|
||||
return conversation;
|
||||
}
|
||||
|
||||
async findById(id: string): Promise<Conversation | null> {
|
||||
return this.store.get(id) ?? null;
|
||||
}
|
||||
|
||||
async findPendingForAgent(toAgentId: string): Promise<Conversation[]> {
|
||||
return [...this.store.values()]
|
||||
.filter((c) => c.toAgentId === toAgentId && c.status === 'pending')
|
||||
.sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime());
|
||||
}
|
||||
|
||||
async answer(id: string, answer: string): Promise<Conversation | null> {
|
||||
const conv = this.store.get(id);
|
||||
if (!conv) return null;
|
||||
const updated: Conversation = {
|
||||
...conv,
|
||||
answer,
|
||||
status: 'answered' as const,
|
||||
updatedAt: new Date(),
|
||||
};
|
||||
this.store.set(id, updated);
|
||||
return updated;
|
||||
}
|
||||
|
||||
/** Test helper — return all conversations */
|
||||
getAll(): Conversation[] {
|
||||
return [...this.store.values()];
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock conversation server — serves ONLY conversation tRPC procedures
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function startMockConversationServer(): Promise<{
|
||||
server: Server;
|
||||
port: number;
|
||||
repo: InMemoryConversationRepository;
|
||||
}> {
|
||||
const repo = new InMemoryConversationRepository();
|
||||
const eventBus = new EventEmitterBus();
|
||||
|
||||
// Mini router with only conversation procedures
|
||||
const miniRouter = router({
|
||||
...conversationProcedures(publicProcedure),
|
||||
});
|
||||
|
||||
const httpServer = createServer(async (req, res) => {
|
||||
if (!req.url?.startsWith('/trpc')) {
|
||||
res.writeHead(404);
|
||||
res.end('Not found');
|
||||
return;
|
||||
}
|
||||
|
||||
const host = req.headers.host ?? 'localhost';
|
||||
const url = new URL(req.url, `http://${host}`);
|
||||
|
||||
let body: string | undefined;
|
||||
if (req.method !== 'GET' && req.method !== 'HEAD') {
|
||||
body = await new Promise<string>((resolve) => {
|
||||
let data = '';
|
||||
req.on('data', (chunk: Buffer) => {
|
||||
data += chunk.toString();
|
||||
});
|
||||
req.on('end', () => resolve(data));
|
||||
});
|
||||
}
|
||||
|
||||
const headers = new Headers();
|
||||
for (const [key, value] of Object.entries(req.headers)) {
|
||||
if (value) {
|
||||
if (Array.isArray(value)) {
|
||||
value.forEach((v) => headers.append(key, v));
|
||||
} else {
|
||||
headers.set(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const fetchRequest = new Request(url.toString(), {
|
||||
method: req.method,
|
||||
headers,
|
||||
body: body ?? undefined,
|
||||
});
|
||||
|
||||
const fetchResponse = await fetchRequestHandler({
|
||||
endpoint: '/trpc',
|
||||
req: fetchRequest,
|
||||
router: miniRouter,
|
||||
createContext: () =>
|
||||
({
|
||||
eventBus,
|
||||
serverStartedAt: new Date(),
|
||||
processCount: 0,
|
||||
conversationRepository: repo,
|
||||
// Stub — requireAgentManager is called unconditionally in createConversation,
|
||||
// but list() is only invoked for taskId/phaseId resolution. With --agent-id
|
||||
// targeting, list() is never called.
|
||||
agentManager: { list: async () => [] },
|
||||
}) as any,
|
||||
});
|
||||
|
||||
res.statusCode = fetchResponse.status;
|
||||
fetchResponse.headers.forEach((value, key) => {
|
||||
res.setHeader(key, value);
|
||||
});
|
||||
|
||||
if (fetchResponse.body) {
|
||||
const reader = fetchResponse.body.getReader();
|
||||
const pump = async () => {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) {
|
||||
res.end();
|
||||
return;
|
||||
}
|
||||
res.write(value);
|
||||
}
|
||||
};
|
||||
pump().catch(() => res.end());
|
||||
} else {
|
||||
res.end(await fetchResponse.text());
|
||||
}
|
||||
});
|
||||
|
||||
const port = 40000 + Math.floor(Math.random() * 10000);
|
||||
await new Promise<void>((resolve) => {
|
||||
httpServer.listen(port, '127.0.0.1', () => resolve());
|
||||
});
|
||||
|
||||
return { server: httpServer, port, repo };
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Diagnostic helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function dumpAgentLogs(workspaceRoot: string, agentName: string) {
|
||||
const logDir = join(workspaceRoot, '.cw', 'agent-logs', agentName);
|
||||
if (!existsSync(logDir)) {
|
||||
console.log(` [${agentName}] No log directory at ${logDir}`);
|
||||
return;
|
||||
}
|
||||
// Dump output.jsonl (last 30 lines)
|
||||
const outputPath = join(logDir, 'output.jsonl');
|
||||
if (existsSync(outputPath)) {
|
||||
const lines = readFileSync(outputPath, 'utf-8').trim().split('\n');
|
||||
const last = lines.slice(-30);
|
||||
console.log(` [${agentName}] output.jsonl (last ${last.length}/${lines.length} lines):`);
|
||||
for (const line of last) {
|
||||
try {
|
||||
const ev = JSON.parse(line);
|
||||
if (ev.type === 'assistant' && ev.message?.content) {
|
||||
for (const block of ev.message.content) {
|
||||
if (block.type === 'text') {
|
||||
console.log(` TEXT: ${block.text.substring(0, 200)}`);
|
||||
} else if (block.type === 'tool_use') {
|
||||
console.log(` TOOL: ${block.name} ${JSON.stringify(block.input).substring(0, 150)}`);
|
||||
}
|
||||
}
|
||||
} else if (ev.type === 'result') {
|
||||
console.log(` RESULT: ${JSON.stringify(ev).substring(0, 300)}`);
|
||||
}
|
||||
} catch {
|
||||
console.log(` RAW: ${line.substring(0, 200)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Dump stderr
|
||||
const stderrPath = join(logDir, 'stderr.log');
|
||||
if (existsSync(stderrPath)) {
|
||||
const stderr = readFileSync(stderrPath, 'utf-8').trim();
|
||||
if (stderr) {
|
||||
console.log(` [${agentName}] stderr: ${stderr.substring(0, 500)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test suite
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describeRealClaude('Real Inter-Agent Conversation (mock server)', () => {
|
||||
let harness: RealProviderHarness;
|
||||
let mockServer: Server;
|
||||
let mockPort: number;
|
||||
let mockRepo: InMemoryConversationRepository;
|
||||
const originalCwPort = process.env.CW_PORT;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Real Inter-Agent Conversation Test ===');
|
||||
console.log('Mock conversation server + two Claude sessions.\n');
|
||||
|
||||
// Start mock conversation server (only listen/ask/answer endpoints)
|
||||
const mock = await startMockConversationServer();
|
||||
mockServer = mock.server;
|
||||
mockPort = mock.port;
|
||||
mockRepo = mock.repo;
|
||||
console.log(` Mock server on port ${mockPort}`);
|
||||
|
||||
// Set CW_PORT so agents' cw commands hit the mock server
|
||||
process.env.CW_PORT = String(mockPort);
|
||||
|
||||
// Real agent harness for spawning + worktrees (no full CoordinationServer)
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
console.log(` Workspace: ${harness.workspaceRoot}`);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (originalCwPort) {
|
||||
process.env.CW_PORT = originalCwPort;
|
||||
} else {
|
||||
delete process.env.CW_PORT;
|
||||
}
|
||||
await harness?.cleanup();
|
||||
mockServer?.close();
|
||||
});
|
||||
|
||||
it(
|
||||
'two agents with real tasks communicate via cw ask/listen/answer (two questions prove re-listen)',
|
||||
async () => {
|
||||
const agentSuffix = nanoid(6); // unique suffix for temp files
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Agent A — builds a validator module WHILE answering questions
|
||||
// in the background via cw listen
|
||||
// ---------------------------------------------------------------
|
||||
const agentA = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `You are Agent A in a multi-agent coordination test.
|
||||
|
||||
You have TWO concurrent responsibilities:
|
||||
1. Build a TypeScript validator module (your main coding task)
|
||||
2. Answer questions from other agents via a background listener
|
||||
|
||||
SETUP (do this first):
|
||||
- Read .cw/input/manifest.json to get your agentId
|
||||
- Start a background listener that writes to a temp file:
|
||||
cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
|
||||
LISTEN_PID=$!
|
||||
|
||||
MAIN CODING TASK — implement a user registration validator:
|
||||
|
||||
1. Create types.ts:
|
||||
export interface RegistrationInput { name: string; email: string; password: string; }
|
||||
export interface ValidationResult { valid: boolean; errors: string[]; }
|
||||
|
||||
2. Create validator.ts:
|
||||
Import from types.ts. Export function validateRegistration(input: RegistrationInput): ValidationResult
|
||||
Rules: name min 2 chars, email must have exactly one @ and domain with a dot and no spaces and max 254 chars, password min 8 chars.
|
||||
|
||||
3. Create index.ts that re-exports everything from types.ts and validator.ts.
|
||||
|
||||
BETWEEN EACH FILE, check for incoming questions:
|
||||
if [ -s /tmp/cw-listen-${agentSuffix}.txt ]; then
|
||||
# parse the JSON, get conversationId and question
|
||||
# answer: cw answer "<answer based on your code>" --conversation-id <id>
|
||||
# clear and restart listener:
|
||||
> /tmp/cw-listen-${agentSuffix}.txt
|
||||
cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
|
||||
LISTEN_PID=$!
|
||||
fi
|
||||
|
||||
You will receive TWO questions total while you work. Answer them based on the code you are writing.
|
||||
|
||||
CLEANUP: After all 3 files are written and both questions answered:
|
||||
- kill $LISTEN_PID 2>/dev/null
|
||||
- Write .cw/output/signal.json: {"status":"done","result":"validator module complete, answered 2 questions"}
|
||||
|
||||
CRITICAL:
|
||||
- The listener MUST run in the background while you write code.
|
||||
- Check for questions between files, not as blocking waits.
|
||||
- The CW_PORT environment variable is already set to ${mockPort}.`,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
inputContext: {},
|
||||
});
|
||||
|
||||
console.log(` Agent A: ${agentA.id} (${agentA.name})`);
|
||||
|
||||
// Give Agent A time to start its background listener and begin coding
|
||||
await sleep(15000);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Agent B — builds a client module, asks Agent A questions to
|
||||
// learn the validation rules, then uses answers in its code
|
||||
// ---------------------------------------------------------------
|
||||
const agentB = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `You are Agent B in a multi-agent coordination test.
|
||||
|
||||
Read .cw/input/manifest.json to get your agentId. Agent A (ID: ${agentA.id}) is building a validator module.
|
||||
|
||||
YOUR CODING TASK — build a registration API client that includes client-side validation matching Agent A's server-side rules:
|
||||
|
||||
1. Create client-scaffold.ts with a basic RegistrationClient class that has a register(name, email, password) method that returns Promise<{ok: boolean}>.
|
||||
Leave a TODO comment where validation will go.
|
||||
|
||||
2. NOW ask Agent A what the validation rules are — you need this to write proper client-side checks:
|
||||
FIELDS=$(cw ask "What are the required fields and their types for registration?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
|
||||
|
||||
3. Ask Agent A about the specific email validation rules:
|
||||
EMAIL_RULES=$(cw ask "What are the exact email validation rules you implemented?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
|
||||
|
||||
4. Create validated-client.ts — a COMPLETE implementation using the answers:
|
||||
Import the scaffold, add a validateBeforeSubmit(name, email, password) function
|
||||
that implements the EXACT validation rules Agent A told you about.
|
||||
Include a comment at the top with the rules you received.
|
||||
|
||||
5. Write .cw/output/signal.json: {"status":"done","result":"client module complete with validation from agent A"}
|
||||
|
||||
CRITICAL:
|
||||
- Create client-scaffold.ts BEFORE asking questions (you have independent work to do first).
|
||||
- Use the ACTUAL answers from Agent A in your validated-client.ts implementation.
|
||||
- The CW_PORT environment variable is already set to ${mockPort}.`,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
inputContext: {},
|
||||
});
|
||||
|
||||
console.log(` Agent B: ${agentB.id} (${agentB.name})`);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Wait for both agents to stop running, then verify conversations
|
||||
// ---------------------------------------------------------------
|
||||
const deadline = Date.now() + TEST_TIMEOUT;
|
||||
let aDone = false;
|
||||
let bDone = false;
|
||||
let lastLogTime = 0;
|
||||
|
||||
while (Date.now() < deadline && (!aDone || !bDone)) {
|
||||
const agentAInfo = await harness.agentRepository.findById(agentA.id);
|
||||
const agentBInfo = await harness.agentRepository.findById(agentB.id);
|
||||
|
||||
// Periodic progress logging every 30s
|
||||
if (Date.now() - lastLogTime > 30000) {
|
||||
const elapsed = Math.round((Date.now() - (deadline - TEST_TIMEOUT)) / 1000);
|
||||
console.log(` [${elapsed}s] A=${agentAInfo?.status ?? '?'} B=${agentBInfo?.status ?? '?'} convs=${mockRepo.getAll().length}`);
|
||||
lastLogTime = Date.now();
|
||||
}
|
||||
|
||||
if (agentAInfo && agentAInfo.status !== 'running' && !aDone) {
|
||||
aDone = true;
|
||||
console.log(` Agent A final status: ${agentAInfo.status}`);
|
||||
dumpAgentLogs(harness.workspaceRoot, agentA.name);
|
||||
}
|
||||
if (agentBInfo && agentBInfo.status !== 'running' && !bDone) {
|
||||
bDone = true;
|
||||
console.log(` Agent B final status: ${agentBInfo.status}`);
|
||||
dumpAgentLogs(harness.workspaceRoot, agentB.name);
|
||||
}
|
||||
|
||||
if (!aDone || !bDone) await sleep(2000);
|
||||
}
|
||||
|
||||
expect(aDone).toBe(true);
|
||||
expect(bDone).toBe(true);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify conversations in mock repo
|
||||
// ---------------------------------------------------------------
|
||||
const allConversations = mockRepo.getAll();
|
||||
console.log(` Total conversations: ${allConversations.length}`);
|
||||
for (const c of allConversations) {
|
||||
console.log(
|
||||
` ${c.id}: ${c.status} — Q: "${c.question}" A: "${c.answer?.substring(0, 80)}..."`,
|
||||
);
|
||||
}
|
||||
|
||||
// Exactly 2 conversations, both answered
|
||||
expect(allConversations.length).toBe(2);
|
||||
expect(allConversations.every((c) => c.status === 'answered')).toBe(true);
|
||||
|
||||
// Both target Agent A, both from Agent B
|
||||
expect(allConversations.every((c) => c.toAgentId === agentA.id)).toBe(true);
|
||||
expect(allConversations.every((c) => c.fromAgentId === agentB.id)).toBe(true);
|
||||
|
||||
// Questions should be distinct (one about fields, one about email validation)
|
||||
const questions = allConversations.map((c) => c.question);
|
||||
expect(questions.some((q) => q.toLowerCase().includes('field'))).toBe(true);
|
||||
expect(questions.some((q) => q.toLowerCase().includes('email'))).toBe(true);
|
||||
|
||||
// Both answers should be non-empty
|
||||
expect(allConversations.every((c) => c.answer && c.answer.length > 0)).toBe(true);
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify Agent A's coding output — validator module files exist
|
||||
// ---------------------------------------------------------------
|
||||
const aWorkdir = join(
|
||||
harness.workspaceRoot,
|
||||
'agent-workdirs',
|
||||
agentA.name,
|
||||
'workspace',
|
||||
);
|
||||
const aFiles = ['types.ts', 'validator.ts', 'index.ts'];
|
||||
for (const f of aFiles) {
|
||||
const filePath = join(aWorkdir, f);
|
||||
const exists = existsSync(filePath);
|
||||
console.log(` Agent A file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
|
||||
expect(exists).toBe(true);
|
||||
}
|
||||
// validator.ts should contain actual validation logic
|
||||
const validatorContent = readFileSync(join(aWorkdir, 'validator.ts'), 'utf-8');
|
||||
console.log(` Agent A validator.ts (${validatorContent.length} chars): ${validatorContent.substring(0, 120)}...`);
|
||||
expect(validatorContent.toLowerCase()).toContain('email');
|
||||
expect(validatorContent.toLowerCase()).toContain('password');
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify Agent B's coding output — client module files exist
|
||||
// ---------------------------------------------------------------
|
||||
const bWorkdir = join(
|
||||
harness.workspaceRoot,
|
||||
'agent-workdirs',
|
||||
agentB.name,
|
||||
'workspace',
|
||||
);
|
||||
const bFiles = ['client-scaffold.ts', 'validated-client.ts'];
|
||||
for (const f of bFiles) {
|
||||
const filePath = join(bWorkdir, f);
|
||||
const exists = existsSync(filePath);
|
||||
console.log(` Agent B file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
|
||||
expect(exists).toBe(true);
|
||||
}
|
||||
// validated-client.ts should reference validation rules from Agent A's answers
|
||||
const clientContent = readFileSync(join(bWorkdir, 'validated-client.ts'), 'utf-8');
|
||||
console.log(` Agent B validated-client.ts (${clientContent.length} chars): ${clientContent.substring(0, 120)}...`);
|
||||
expect(clientContent.toLowerCase()).toContain('email');
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Verify interleaving: Agent A's JSONL log has coding tool calls
|
||||
// (Write for .ts files) interleaved with conversation tool calls
|
||||
// (Bash for cw listen/answer)
|
||||
// ---------------------------------------------------------------
|
||||
const aLogPath = join(harness.workspaceRoot, '.cw', 'agent-logs', agentA.name, 'output.jsonl');
|
||||
const aLog = readFileSync(aLogPath, 'utf-8').trim().split('\n');
|
||||
const toolCalls: { type: 'code' | 'conversation'; name: string; detail: string }[] = [];
|
||||
|
||||
for (const line of aLog) {
|
||||
try {
|
||||
const ev = JSON.parse(line);
|
||||
if (ev.type !== 'assistant' || !ev.message?.content) continue;
|
||||
for (const block of ev.message.content) {
|
||||
if (block.type !== 'tool_use') continue;
|
||||
const input = typeof block.input === 'string' ? block.input : JSON.stringify(block.input);
|
||||
if (block.name === 'Write' && input.includes('.ts')) {
|
||||
toolCalls.push({ type: 'code', name: 'Write', detail: input.substring(0, 80) });
|
||||
} else if (block.name === 'Bash' && (input.includes('cw listen') || input.includes('cw answer'))) {
|
||||
toolCalls.push({ type: 'conversation', name: 'Bash', detail: input.substring(0, 80) });
|
||||
}
|
||||
}
|
||||
} catch { /* skip non-JSON lines */ }
|
||||
}
|
||||
|
||||
console.log(` Agent A interleaving (${toolCalls.length} relevant tool calls):`);
|
||||
for (const tc of toolCalls) {
|
||||
console.log(` [${tc.type}] ${tc.name}: ${tc.detail}`);
|
||||
}
|
||||
|
||||
// Must have both code and conversation tool calls
|
||||
const hasCode = toolCalls.some((tc) => tc.type === 'code');
|
||||
const hasConversation = toolCalls.some((tc) => tc.type === 'conversation');
|
||||
expect(hasCode).toBe(true);
|
||||
expect(hasConversation).toBe(true);
|
||||
|
||||
// Verify interleaving: at least one code call must appear AFTER a conversation call
|
||||
// (proving coding continued after handling a question)
|
||||
const firstConvIdx = toolCalls.findIndex((tc) => tc.type === 'conversation');
|
||||
const lastCodeIdx = toolCalls.length - 1 - [...toolCalls].reverse().findIndex((tc) => tc.type === 'code');
|
||||
console.log(` First conversation at index ${firstConvIdx}, last code at index ${lastCodeIdx}`);
|
||||
expect(lastCodeIdx).toBeGreaterThan(firstConvIdx);
|
||||
},
|
||||
TEST_TIMEOUT,
|
||||
);
|
||||
});
|
||||
@@ -0,0 +1,265 @@
|
||||
/**
|
||||
* Crash Recovery Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
 * REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/crash-recovery.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Server restart while agent is running
|
||||
* - Resuming streaming after restart
|
||||
* - Marking dead agents as crashed
|
||||
* - Output file processing after restart
|
||||
*
|
||||
* Estimated cost: ~$0.08 per full run
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
REAL_TEST_TIMEOUT,
|
||||
EXTENDED_TEST_TIMEOUT,
|
||||
sleep,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { MINIMAL_PROMPTS } from './prompts.js';
|
||||
import { MultiProviderAgentManager } from '../../../agent/manager.js';
|
||||
|
||||
describeRealClaude('Crash Recovery', () => {
|
||||
let harness: RealProviderHarness;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Running Crash Recovery Tests ===');
|
||||
console.log('These tests call the real Claude API and incur costs.\n');
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await harness.cleanup();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
harness.clearEvents();
|
||||
});
|
||||
|
||||
describe('Server Restart Simulation', () => {
|
||||
it(
|
||||
'resumes streaming for still-running agent after restart',
|
||||
async () => {
|
||||
// 1. Spawn agent with slow task
|
||||
console.log(' 1. Spawning agent with slow task...');
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.slow,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for agent to be running
|
||||
await harness.waitForAgentStatus(agent.id, 'running', 10000);
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.pid).toBeTruthy();
|
||||
expect(dbAgent?.outputFilePath).toBeTruthy();
|
||||
console.log(' 2. Agent running with PID:', dbAgent?.pid);
|
||||
|
||||
// 3. Give the agent a moment to start writing output
|
||||
await sleep(2000);
|
||||
|
||||
// 4. Simulate server crash - create NEW manager (old state lost)
|
||||
console.log(' 3. Simulating server restart with new manager...');
|
||||
harness.clearEvents(); // Clear events from old manager
|
||||
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
// 5. Reconcile - should pick up running agent
|
||||
console.log(' 4. Reconciling agent state...');
|
||||
await newManager.reconcileAfterRestart();
|
||||
|
||||
// 6. Wait for completion via new manager
|
||||
console.log(' 5. Waiting for completion via new manager...');
|
||||
let attempts = 0;
|
||||
let finalStatus = 'running';
|
||||
while (attempts < 60) {
|
||||
const refreshed = await harness.agentRepository.findById(agent.id);
|
||||
if (refreshed?.status !== 'running') {
|
||||
finalStatus = refreshed?.status ?? 'unknown';
|
||||
break;
|
||||
}
|
||||
await sleep(2000);
|
||||
attempts++;
|
||||
}
|
||||
|
||||
const finalAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' 6. Final status:', finalAgent?.status);
|
||||
|
||||
// Either completed successfully or crashed (both are valid outcomes)
|
||||
expect(['idle', 'crashed', 'stopped']).toContain(finalAgent?.status);
|
||||
|
||||
if (finalAgent?.status === 'idle') {
|
||||
const result = await newManager.getResult(agent.id);
|
||||
console.log(' Result:', result?.message);
|
||||
}
|
||||
},
|
||||
EXTENDED_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'marks dead agent as crashed during reconcile',
|
||||
async () => {
|
||||
// 1. Create a fake agent record with a dead PID
|
||||
console.log(' 1. Creating fake agent with dead PID...');
|
||||
const fakeAgent = await harness.agentRepository.create({
|
||||
name: 'dead-agent-test',
|
||||
taskId: null,
|
||||
initiativeId: null,
|
||||
sessionId: null,
|
||||
worktreeId: 'dead-worktree',
|
||||
status: 'running',
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
accountId: null,
|
||||
});
|
||||
|
||||
// Set a PID that's definitely dead (high number that won't exist)
|
||||
await harness.agentRepository.update(fakeAgent.id, { pid: 999999, outputFilePath: '/nonexistent/path' });
|
||||
|
||||
// Verify it's marked as running
|
||||
let agent = await harness.agentRepository.findById(fakeAgent.id);
|
||||
expect(agent?.status).toBe('running');
|
||||
expect(agent?.pid).toBe(999999);
|
||||
|
||||
// 2. Create new manager and reconcile
|
||||
console.log(' 2. Creating new manager and reconciling...');
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
await newManager.reconcileAfterRestart();
|
||||
|
||||
// 3. Verify agent is now crashed
|
||||
agent = await harness.agentRepository.findById(fakeAgent.id);
|
||||
expect(agent?.status).toBe('crashed');
|
||||
console.log(' 3. Agent marked as crashed (dead PID detected)');
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'processes output file for dead agent during reconcile',
|
||||
async () => {
|
||||
// 1. Spawn agent and wait for completion
|
||||
console.log(' 1. Spawning agent to completion...');
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
const outputFilePath = dbAgent?.outputFilePath;
|
||||
expect(outputFilePath).toBeTruthy();
|
||||
console.log(' 2. Output file:', outputFilePath);
|
||||
|
||||
// 2. Reset agent to "running" to simulate mid-crash state
|
||||
await harness.agentRepository.update(agent.id, { status: 'running' });
|
||||
// Clear result so reconcile has to re-process
|
||||
await harness.agentRepository.update(agent.id, { result: null });
|
||||
|
||||
// Verify reset
|
||||
let resetAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(resetAgent?.status).toBe('running');
|
||||
|
||||
// 3. Create new manager and reconcile
|
||||
console.log(' 3. Creating new manager and reconciling...');
|
||||
harness.clearEvents();
|
||||
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
await newManager.reconcileAfterRestart();
|
||||
|
||||
// Give it a moment to process the file
|
||||
await sleep(1000);
|
||||
|
||||
// 4. Verify agent was processed from output file
|
||||
const finalAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' 4. Final status:', finalAgent?.status);
|
||||
|
||||
// Should either be idle (processed successfully) or crashed (couldn't process)
|
||||
expect(['idle', 'crashed']).toContain(finalAgent?.status);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Event Consistency', () => {
|
||||
it(
|
||||
'does not duplicate events on restart',
|
||||
async () => {
|
||||
// 1. Spawn agent with slow task
|
||||
console.log(' 1. Spawning agent...');
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.streaming,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// 2. Wait for some output events
|
||||
await sleep(3000);
|
||||
const initialOutputCount = harness.getEventsByType('agent:output').length;
|
||||
console.log(' 2. Initial output events:', initialOutputCount);
|
||||
|
||||
// 3. Wait for completion
|
||||
await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
const finalOutputCount = harness.getEventsByType('agent:output').length;
|
||||
console.log(' 3. Final output events:', finalOutputCount);
|
||||
|
||||
// 4. Create new manager and reconcile (agent already complete)
|
||||
harness.clearEvents();
|
||||
|
||||
const newManager = new MultiProviderAgentManager(
|
||||
harness.agentRepository,
|
||||
harness.workspaceRoot,
|
||||
harness.projectRepository,
|
||||
harness.accountRepository,
|
||||
harness.eventBus
|
||||
);
|
||||
|
||||
await newManager.reconcileAfterRestart();
|
||||
await sleep(1000);
|
||||
|
||||
// 5. Verify no new output events (agent was already complete)
|
||||
const postReconcileOutputCount = harness.getEventsByType('agent:output').length;
|
||||
console.log(' 4. Post-reconcile output events:', postReconcileOutputCount);
|
||||
|
||||
// Should not have re-emitted all the old output events
|
||||
expect(postReconcileOutputCount).toBe(0);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
});
|
||||
378
apps/server/test/integration/real-providers/harness.ts
Normal file
378
apps/server/test/integration/real-providers/harness.ts
Normal file
@@ -0,0 +1,378 @@
|
||||
/**
|
||||
* Real Provider Test Harness
|
||||
*
|
||||
* Extends the existing test infrastructure to use REAL MultiProviderAgentManager
|
||||
* for integration testing with actual CLI providers like Claude and Codex.
|
||||
*
|
||||
* Unlike the standard TestHarness which uses MockAgentManager, this harness:
|
||||
* - Uses real CLI spawning (costs real API credits!)
|
||||
* - Provides poll-based waiting helpers
|
||||
* - Captures events for inspection
|
||||
* - Manages temp directories for worktrees
|
||||
*/
|
||||
|
||||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe } from 'vitest';
|
||||
import type { DrizzleDatabase } from '../../../db/index.js';
|
||||
import type { DomainEvent, EventBus } from '../../../events/types.js';
|
||||
import { EventEmitterBus } from '../../../events/bus.js';
|
||||
import { MultiProviderAgentManager } from '../../../agent/manager.js';
|
||||
import type { AgentResult, PendingQuestions, AgentStatus } from '../../../agent/types.js';
|
||||
import type { AgentRepository } from '../../../db/repositories/agent-repository.js';
|
||||
import type { ProjectRepository } from '../../../db/repositories/project-repository.js';
|
||||
import type { AccountRepository } from '../../../db/repositories/account-repository.js';
|
||||
import type { InitiativeRepository } from '../../../db/repositories/initiative-repository.js';
|
||||
import {
|
||||
DrizzleAgentRepository,
|
||||
DrizzleProjectRepository,
|
||||
DrizzleAccountRepository,
|
||||
DrizzleInitiativeRepository,
|
||||
} from '../../../db/repositories/drizzle/index.js';
|
||||
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
|
||||
|
||||
/**
|
||||
* Sleep helper for polling loops.
|
||||
*/
|
||||
export function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* Event bus that captures all emitted events for inspection.
|
||||
*/
|
||||
export class CapturingEventBus extends EventEmitterBus {
|
||||
emittedEvents: DomainEvent[] = [];
|
||||
|
||||
emit<T extends DomainEvent>(event: T): void {
|
||||
this.emittedEvents.push(event);
|
||||
super.emit(event);
|
||||
}
|
||||
|
||||
getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
|
||||
return this.emittedEvents.filter((e) => e.type === type) as T[];
|
||||
}
|
||||
|
||||
clearEvents(): void {
|
||||
this.emittedEvents = [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Options for creating a real provider test harness.
 */
export interface RealProviderHarnessOptions {
  /** Which provider to test (default: 'claude') */
  provider?: 'claude' | 'codex';
  /**
   * Optional workspace root (temp dir created if omitted).
   * When provided, the caller owns the directory: cleanup() will not delete it.
   */
  workspaceRoot?: string;
}
|
||||
|
||||
/**
 * Real Provider Test Harness interface.
 *
 * Provides everything needed to test against real CLI providers:
 * - In-memory database with real repositories
 * - Real MultiProviderAgentManager (spawns actual CLI processes)
 * - Event capture for verification
 * - Polling-based wait helpers
 */
export interface RealProviderHarness {
  /** In-memory SQLite database */
  db: DrizzleDatabase;
  /** Event bus with capture capability */
  eventBus: CapturingEventBus;
  /** Real agent manager (not mock!) */
  agentManager: MultiProviderAgentManager;
  /** Workspace root directory */
  workspaceRoot: string;

  /** Agent repository */
  agentRepository: AgentRepository;
  /** Project repository */
  projectRepository: ProjectRepository;
  /** Account repository */
  accountRepository: AccountRepository;
  /** Initiative repository */
  initiativeRepository: InitiativeRepository;

  /**
   * Wait for an agent to reach idle or crashed status.
   * Polls the database at regular intervals.
   *
   * @param agentId - The agent ID to wait for
   * @param timeoutMs - Maximum time to wait (default 120000ms = 2 minutes)
   * @returns The agent result once the agent reaches idle/stopped/crashed,
   *   or null if the agent record disappears or the agent enters
   *   waiting_for_input (it will not complete without answers)
   * @throws Error if the timeout elapses while the agent is still running
   */
  waitForAgentCompletion(agentId: string, timeoutMs?: number): Promise<AgentResult | null>;

  /**
   * Wait for an agent to enter waiting_for_input status.
   * Polls the database at regular intervals.
   *
   * @param agentId - The agent ID to wait for
   * @param timeoutMs - Maximum time to wait (default 120000ms)
   * @returns The pending questions once the agent is waiting, or null if the
   *   agent finishes (idle/stopped/crashed) without asking anything
   * @throws Error if the timeout elapses while the agent is still running
   */
  waitForAgentWaiting(agentId: string, timeoutMs?: number): Promise<PendingQuestions | null>;

  /**
   * Wait for an agent to reach a specific status.
   *
   * @param agentId - The agent ID to wait for
   * @param status - The target status
   * @param timeoutMs - Maximum time to wait (default 120000ms)
   * @throws Error on timeout, if the agent record is missing, or if a
   *   terminal state is reached that makes a 'running' target unreachable
   */
  waitForAgentStatus(agentId: string, status: AgentStatus, timeoutMs?: number): Promise<void>;

  /**
   * Get captured events filtered by type.
   */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[];

  /**
   * Clear all captured events.
   */
  clearEvents(): void;

  /**
   * Stop all agents currently in 'running' status (for cleanup).
   * Errors from individual stops are swallowed.
   */
  killAllAgents(): Promise<void>;

  /**
   * Clean up all resources (directories, processes).
   * Call this in afterAll/afterEach.
   */
  cleanup(): Promise<void>;
}
|
||||
|
||||
/** Default poll interval for status checks, in milliseconds. */
const POLL_INTERVAL_MS = 1000;
|
||||
|
||||
/**
 * Create a test harness for real provider integration tests.
 *
 * This creates:
 * - In-memory SQLite database
 * - Temp directory for worktrees (or uses provided workspace)
 * - Real MultiProviderAgentManager
 * - Event capture bus
 *
 * @example
 * ```typescript
 * let harness: RealProviderHarness;
 *
 * beforeAll(async () => {
 *   harness = await createRealProviderHarness({ provider: 'claude' });
 * });
 *
 * afterAll(async () => {
 *   await harness.cleanup();
 * });
 *
 * it('spawns and completes', async () => {
 *   const agent = await harness.agentManager.spawn({...});
 *   const result = await harness.waitForAgentCompletion(agent.id);
 *   expect(result?.success).toBe(true);
 * });
 * ```
 */
export async function createRealProviderHarness(
  options: RealProviderHarnessOptions = {}
): Promise<RealProviderHarness> {
  // Create workspace directory (temp if not provided)
  const workspaceRoot = options.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'cw-test-')));
  const ownedWorkspace = !options.workspaceRoot; // Track if we need to clean up

  // Initialize git repo in temp workspace (required for worktree operations).
  // Only done for directories we created; a caller-provided workspace is
  // assumed to already be set up.
  if (ownedWorkspace) {
    const { execSync } = await import('node:child_process');
    execSync('git init', { cwd: workspaceRoot, stdio: 'ignore' });
    execSync('git config user.email "test@test.com"', { cwd: workspaceRoot, stdio: 'ignore' });
    execSync('git config user.name "Test"', { cwd: workspaceRoot, stdio: 'ignore' });
    // Create initial commit (worktrees require at least one commit)
    execSync('touch .gitkeep && git add .gitkeep && git commit -m "init"', { cwd: workspaceRoot, stdio: 'ignore' });
  }

  // Create in-memory database
  const db = createTestDatabase();

  // Create repositories
  const agentRepository = new DrizzleAgentRepository(db);
  const projectRepository = new DrizzleProjectRepository(db);
  const accountRepository = new DrizzleAccountRepository(db);
  const initiativeRepository = new DrizzleInitiativeRepository(db);

  // Create event bus with capture (parent class already sets maxListeners to 100)
  const eventBus = new CapturingEventBus();

  // Create REAL agent manager (not mock!)
  const agentManager = new MultiProviderAgentManager(
    agentRepository,
    workspaceRoot,
    projectRepository,
    accountRepository,
    eventBus
  );

  // Build harness. The wait helpers below close over the repositories and
  // manager created above; they poll the DB rather than subscribing to
  // events, so they also observe state changes made by other managers.
  const harness: RealProviderHarness = {
    db,
    eventBus,
    agentManager,
    workspaceRoot,
    agentRepository,
    projectRepository,
    accountRepository,
    initiativeRepository,

    // NOTE: throws on timeout rather than returning null; null means the
    // agent vanished or ended up waiting_for_input.
    async waitForAgentCompletion(agentId: string, timeoutMs = 120000): Promise<AgentResult | null> {
      const deadline = Date.now() + timeoutMs;

      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) return null;

        if (agent.status === 'idle' || agent.status === 'stopped') {
          // Agent completed - get result
          return agentManager.getResult(agentId);
        }

        if (agent.status === 'crashed') {
          // Agent crashed - return the error result
          return agentManager.getResult(agentId);
        }

        if (agent.status === 'waiting_for_input') {
          // Agent is waiting - return null (not completed)
          return null;
        }

        // Still running - wait and check again
        await sleep(POLL_INTERVAL_MS);
      }

      throw new Error(`Timeout waiting for agent ${agentId} to complete after ${timeoutMs}ms`);
    },

    // NOTE: throws on timeout; null means the agent finished without asking.
    async waitForAgentWaiting(agentId: string, timeoutMs = 120000): Promise<PendingQuestions | null> {
      const deadline = Date.now() + timeoutMs;

      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) return null;

        if (agent.status === 'waiting_for_input') {
          return agentManager.getPendingQuestions(agentId);
        }

        if (agent.status === 'idle' || agent.status === 'stopped' || agent.status === 'crashed') {
          // Agent finished without asking questions
          return null;
        }

        // Still running - wait and check again
        await sleep(POLL_INTERVAL_MS);
      }

      throw new Error(`Timeout waiting for agent ${agentId} to request input after ${timeoutMs}ms`);
    },

    async waitForAgentStatus(agentId: string, status: AgentStatus, timeoutMs = 120000): Promise<void> {
      const deadline = Date.now() + timeoutMs;

      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) {
          throw new Error(`Agent ${agentId} not found`);
        }

        if (agent.status === status) {
          return;
        }

        // Check for terminal states that mean we'll never reach target
        // (only applies when waiting for 'running'; other targets keep polling)
        if (status === 'running' && ['idle', 'stopped', 'crashed', 'waiting_for_input'].includes(agent.status)) {
          throw new Error(`Agent ${agentId} already in terminal state ${agent.status}, cannot reach ${status}`);
        }

        await sleep(POLL_INTERVAL_MS);
      }

      throw new Error(`Timeout waiting for agent ${agentId} to reach status ${status} after ${timeoutMs}ms`);
    },

    getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
      return eventBus.getEventsByType<T>(type);
    },

    clearEvents(): void {
      eventBus.clearEvents();
    },

    // NOTE(review): only agents in 'running' status are stopped here;
    // agents in waiting_for_input are left untouched — confirm they have
    // no live process that would leak during cleanup.
    async killAllAgents(): Promise<void> {
      const agents = await agentRepository.findAll();
      for (const agent of agents) {
        if (agent.status === 'running') {
          try {
            await agentManager.stop(agent.id);
          } catch {
            // Ignore errors during cleanup
          }
        }
      }
    },

    async cleanup(): Promise<void> {
      // Kill any running agents
      await harness.killAllAgents();

      // Clean up workspace directory if we created it
      if (ownedWorkspace) {
        try {
          await rm(workspaceRoot, { recursive: true, force: true });
        } catch {
          // Ignore cleanup errors
        }
      }
    },
  };

  return harness;
}
|
||||
|
||||
/**
 * Check if real Claude tests should run.
 * Set REAL_CLAUDE_TESTS=1 environment variable to enable.
 */
export const shouldRunRealClaudeTests = process.env.REAL_CLAUDE_TESTS === '1';

/**
 * Check if real Codex tests should run.
 * Set REAL_CODEX_TESTS=1 environment variable to enable.
 */
export const shouldRunRealCodexTests = process.env.REAL_CODEX_TESTS === '1';

/**
 * Skip wrapper for Claude tests - skips unless REAL_CLAUDE_TESTS=1.
 * Suites still appear in reporter output, marked as skipped.
 */
export const describeRealClaude: typeof describe = shouldRunRealClaudeTests ? describe : (describe.skip as typeof describe);

/**
 * Skip wrapper for Codex tests - skips unless REAL_CODEX_TESTS=1.
 * Suites still appear in reporter output, marked as skipped.
 */
export const describeRealCodex: typeof describe = shouldRunRealCodexTests ? describe : (describe.skip as typeof describe);

/**
 * Default test timeout for real CLI tests (2 minutes).
 * Real API calls take 5-30 seconds typically.
 */
export const REAL_TEST_TIMEOUT = 120000;

/**
 * Extended test timeout for slow tests (5 minutes).
 * Used for schema retry tests and crash recovery tests.
 */
export const EXTENDED_TEST_TIMEOUT = 300000;
|
||||
56
apps/server/test/integration/real-providers/index.ts
Normal file
56
apps/server/test/integration/real-providers/index.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Real Provider Integration Tests
|
||||
*
|
||||
* This module provides infrastructure for testing against real CLI providers.
|
||||
* Tests are expensive (real API calls) and skipped by default.
|
||||
*
|
||||
* ## Running Tests
|
||||
*
|
||||
* ```bash
|
||||
* # Claude tests only
|
||||
 * REAL_CLAUDE_TESTS=1 npm test -- apps/server/test/integration/real-providers/ --test-timeout=300000
|
||||
*
|
||||
* # Codex tests only
|
||||
 * REAL_CODEX_TESTS=1 npm test -- apps/server/test/integration/real-providers/codex-manager.test.ts
|
||||
*
|
||||
* # All real provider tests
|
||||
 * REAL_CLAUDE_TESTS=1 REAL_CODEX_TESTS=1 npm test -- apps/server/test/integration/real-providers/
|
||||
* ```
|
||||
*
|
||||
* ## Cost Estimates
|
||||
*
|
||||
* | Suite | Tests | Est. Cost | Duration |
|
||||
* |-------|-------|-----------|----------|
|
||||
* | Output Parsing | 3 | $0.06 | ~2 min |
|
||||
* | Schema Validation | 4 | $0.22 | ~4 min |
|
||||
* | Crash Recovery | 3 | $0.08 | ~3 min |
|
||||
* | Session Resume | 2 | $0.08 | ~3 min |
|
||||
* | Codex Integration | 2 | $0.10 | ~2 min |
|
||||
* | **TOTAL** | **14** | **~$0.54** | **~14 min** |
|
||||
*
|
||||
* ## Test Files
|
||||
*
|
||||
* - `harness.ts` - RealProviderHarness factory and utilities
|
||||
* - `prompts.ts` - Minimal cost test prompts
|
||||
* - `claude-manager.test.ts` - Claude spawn/resume/output tests
|
||||
* - `codex-manager.test.ts` - Codex provider tests
|
||||
* - `schema-retry.test.ts` - Schema validation + retry tests
|
||||
* - `crash-recovery.test.ts` - Server restart simulation
|
||||
* - `sample-outputs/` - Captured CLI output for parser unit tests
|
||||
*/
|
||||
|
||||
export {
|
||||
createRealProviderHarness,
|
||||
CapturingEventBus,
|
||||
sleep,
|
||||
shouldRunRealClaudeTests,
|
||||
shouldRunRealCodexTests,
|
||||
describeRealClaude,
|
||||
describeRealCodex,
|
||||
REAL_TEST_TIMEOUT,
|
||||
EXTENDED_TEST_TIMEOUT,
|
||||
type RealProviderHarness,
|
||||
type RealProviderHarnessOptions,
|
||||
} from './harness.js';
|
||||
|
||||
export { MINIMAL_PROMPTS, CODEX_PROMPTS } from './prompts.js';
|
||||
113
apps/server/test/integration/real-providers/prompts.ts
Normal file
113
apps/server/test/integration/real-providers/prompts.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Minimal Cost Test Prompts
|
||||
*
|
||||
* Carefully crafted prompts designed to minimize token usage while
|
||||
* testing specific CLI behaviors. Each prompt aims for the smallest
|
||||
* possible API cost while still exercising the target functionality.
|
||||
*
|
||||
* Cost estimates assume Claude Sonnet pricing (~$3/M input, $15/M output).
|
||||
*/
|
||||
|
||||
// Prompts are sent verbatim to the provider CLI; keep the template strings
// byte-stable so recorded sample outputs and schema tests stay reproducible.
export const MINIMAL_PROMPTS = {
  /**
   * ~$0.01 - Cheapest done response
   * Tests: basic spawn → completion flow, status parsing
   */
  done: `Output exactly this JSON with no other text:
{"status":"done","result":"ok"}`,

  /**
   * ~$0.01 - Cheapest questions response
   * Tests: waiting_for_input status, questions array parsing
   */
  questions: `Output exactly this JSON with no other text:
{"status":"questions","questions":[{"id":"q1","question":"What is your name?"}]}`,

  /**
   * ~$0.03 - Slow task for timing tests
   * Tests: streaming during long-running task, crash recovery
   * Note: Agent may not actually wait 30 seconds, but will produce delayed output
   */
  slow: `Think through a simple problem step by step, counting from 1 to 10 slowly, then output:
{"status":"done","result":"counted to 10"}`,

  /**
   * ~$0.02 - Produces text deltas for streaming tests
   * Tests: text_delta event parsing, output buffering
   */
  streaming: `Count from 1 to 5, outputting each number, then output:
{"status":"done","result":"counted"}`,

  /**
   * ~$0.03 - Deliberately produces non-JSON first
   * Tests: schema validation failure, retry logic
   */
  badThenGood: `First say "thinking..." on its own line, then output:
{"status":"done","result":"fixed"}`,

  /**
   * ~$0.02 - Multiple questions
   * Tests: questions array with multiple items
   */
  multipleQuestions: `Output exactly this JSON with no other text:
{"status":"questions","questions":[{"id":"q1","question":"First question?"},{"id":"q2","question":"Second question?"}]}`,

  /**
   * ~$0.01 - Error signal
   * Tests: error status handling
   */
  error: `Output exactly this JSON with no other text:
{"status":"error","error":"Test error message"}`,

  /**
   * ~$0.02 - Answer continuation
   * Tests: session resume with answers
   *
   * Builds a follow-up prompt echoing each "<id>: <answer>" pair, one per
   * line, then instructs the agent to emit the universal done signal.
   */
  answerContinuation: (answers: Record<string, string>): string => {
    const answerLines = Object.entries(answers)
      .map(([id, answer]) => `${id}: ${answer}`)
      .join('\n');
    return `I received your answers:
${answerLines}

Now complete the task by outputting:
{"status":"done","result":"completed with answers"}`;
  },

  /**
   * ~$0.02 - Context complete for discuss mode
   * Tests: discuss mode output handling (now uses universal done signal)
   */
  discussComplete: `Output exactly this JSON with no other text:
{"status":"done"}`,

  /**
   * ~$0.02 - Plan complete
   * Tests: plan mode output handling (now uses universal done signal)
   */
  planComplete: `Output exactly this JSON with no other text:
{"status":"done"}`,

  /**
   * ~$0.02 - Detail complete
   * Tests: detail mode output handling (now uses universal done signal)
   */
  detailComplete: `Output exactly this JSON with no other text:
{"status":"done"}`,
} as const;
|
||||
|
||||
/**
 * Prompts specifically for Codex provider testing.
 * Codex may have different output format requirements, so these use plain
 * natural-language instructions rather than strict JSON output signals.
 */
export const CODEX_PROMPTS = {
  /**
   * Basic completion for Codex
   */
  done: `Complete this simple task: output "done" and finish.`,

  /**
   * Produces streaming output
   */
  streaming: `Count from 1 to 5, saying each number aloud, then say "finished".`,
} as const;
|
||||
@@ -0,0 +1,68 @@
|
||||
# Sample CLI Outputs
|
||||
|
||||
This directory contains captured real CLI outputs for use in parser unit tests.
|
||||
These files allow testing stream parsers without incurring API costs.
|
||||
|
||||
## Files
|
||||
|
||||
### claude-stream-success.jsonl
|
||||
A successful Claude CLI session (v2.1.33) that:
|
||||
- Initializes with `system` event containing `session_id`
|
||||
- Emits `assistant` message with content
|
||||
- Completes with `result` event containing `done` status JSON
|
||||
|
||||
### claude-stream-questions.jsonl
|
||||
A Claude CLI session that:
|
||||
- Initializes with `system` event containing `session_id`
|
||||
- Emits `assistant` message with content wrapped in markdown code block
|
||||
- Completes with `result` event containing `questions` status JSON
|
||||
|
||||
### codex-stream-success.jsonl
|
||||
A successful Codex CLI session (v0.98.0) that:
|
||||
- Starts with `thread.started` event containing `thread_id`
|
||||
- Emits `turn.started`, `item.completed` events
|
||||
- Completes with `turn.completed` event containing usage stats
|
||||
|
||||
## Event Type Differences
|
||||
|
||||
### Claude CLI (`--output-format stream-json`)
|
||||
- `system` (subtype: `init`) - Contains `session_id`, tools, model info
|
||||
- `assistant` - Contains message content in `content[].text`
|
||||
- `result` - Contains final `result` text and `total_cost_usd`
|
||||
|
||||
### Codex CLI (`--json`)
|
||||
- `thread.started` - Contains `thread_id` (equivalent to session_id)
|
||||
- `turn.started` - Marks beginning of turn
|
||||
- `item.completed` - Contains reasoning or agent_message items
|
||||
- `turn.completed` - Contains usage stats
|
||||
|
||||
## Usage
|
||||
|
||||
These files can be used to test stream parsers in isolation:
|
||||
|
||||
```typescript
|
||||
import { readFileSync } from 'fs';
|
||||
import { ClaudeStreamParser } from '../../../agent/providers/parsers/claude.js';
|
||||
|
||||
const output = readFileSync('sample-outputs/claude-stream-success.jsonl', 'utf-8');
|
||||
const parser = new ClaudeStreamParser();
|
||||
|
||||
for (const line of output.split('\n')) {
|
||||
if (line.trim()) {
|
||||
const events = parser.parseLine(line);
|
||||
// Assert on events...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Capturing New Outputs
|
||||
|
||||
### Claude
|
||||
```bash
|
||||
claude -p "your prompt" --output-format stream-json --verbose > output.jsonl
|
||||
```
|
||||
|
||||
### Codex
|
||||
```bash
|
||||
codex exec --full-auto --json "your prompt" > output.jsonl
|
||||
```
|
||||
@@ -0,0 +1,3 @@
|
||||
{"type":"system","subtype":"init","cwd":"/Users/lukasmay/development/projects/codewalk-district","session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","slash_commands":["keybindings-help","debug","gsd:define-requirements","gsd:list-phase-assumptions","gsd:debug","gsd:remove-phase","gsd:complete-milestone","gsd:research-phase","gsd:plan-phase","gsd:check-todos","gsd:pause-work","gsd:execute-plan","gsd:research-project","gsd:add-todo","gsd:plan-fix","gsd:resume-work","gsd:progress","gsd:help","gsd:discuss-milestone","gsd:add-phase","gsd:create-roadmap","gsd:map-codebase","gsd:whats-new","gsd:insert-phase","gsd:new-milestone","gsd:new-project","gsd:execute-phase","gsd:verify-work","gsd:discuss-phase","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.33","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","claude-code-guide","jira-sw-assessment"],"skills":["keybindings-help","debug"],"plugins":[],"uuid":"224c683c-41f4-4fdd-9af6-f8cdca366ec1"}
|
||||
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01CfDymxvSRFodJ5Zm6NjLHV","type":"message","role":"assistant","content":[{"type":"text","text":"```json\n{\"status\":\"questions\",\"questions\":[{\"id\":\"q1\",\"question\":\"What is your name?\"},{\"id\":\"q2\",\"question\":\"What is the deadline?\"}]}\n```"}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":5983,"cache_read_input_tokens":18026,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":5983},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","uuid":"29288f20-766c-4047-82f5-679024188f52"}
|
||||
{"type":"result","subtype":"success","is_error":false,"duration_ms":3213,"duration_api_ms":3203,"num_turns":1,"result":"```json\n{\"status\":\"questions\",\"questions\":[{\"id\":\"q1\",\"question\":\"What is your name?\"},{\"id\":\"q2\",\"question\":\"What is the deadline?\"}]}\n```","stop_reason":null,"session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","total_cost_usd":0.04754675,"usage":{"input_tokens":3,"cache_creation_input_tokens":5983,"cache_read_input_tokens":18026,"output_tokens":45,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":5983,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":3,"outputTokens":45,"cacheReadInputTokens":18026,"cacheCreationInputTokens":5983,"webSearchRequests":0,"costUSD":0.04754675,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"uuid":"08db08cd-0f12-47ae-8c21-c29e11a6d7df"}
|
||||
@@ -0,0 +1,3 @@
|
||||
{"type":"system","subtype":"init","cwd":"/Users/lukasmay/development/projects/codewalk-district","session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","slash_commands":["keybindings-help","debug","gsd:define-requirements","gsd:list-phase-assumptions","gsd:debug","gsd:remove-phase","gsd:complete-milestone","gsd:research-phase","gsd:plan-phase","gsd:check-todos","gsd:pause-work","gsd:execute-plan","gsd:research-project","gsd:add-todo","gsd:plan-fix","gsd:resume-work","gsd:progress","gsd:help","gsd:discuss-milestone","gsd:add-phase","gsd:create-roadmap","gsd:map-codebase","gsd:whats-new","gsd:insert-phase","gsd:new-milestone","gsd:new-project","gsd:execute-phase","gsd:verify-work","gsd:discuss-phase","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.33","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","claude-code-guide","jira-sw-assessment"],"skills":["keybindings-help","debug"],"plugins":[],"uuid":"c1d6dced-ca04-4335-a624-624660479b7b"}
|
||||
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RjSiQY1RUgT47j73Dom93j","type":"message","role":"assistant","content":[{"type":"text","text":"{\"status\":\"done\",\"result\":\"ok\"}"}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":5958,"cache_read_input_tokens":18026,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":5958},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","uuid":"f1c8695a-534e-4de2-a684-fa4a1ec03749"}
|
||||
{"type":"result","subtype":"success","is_error":false,"duration_ms":2465,"duration_api_ms":2453,"num_turns":1,"result":"{\"status\":\"done\",\"result\":\"ok\"}","stop_reason":null,"session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","total_cost_usd":0.046565499999999996,"usage":{"input_tokens":3,"cache_creation_input_tokens":5958,"cache_read_input_tokens":18026,"output_tokens":12,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":5958,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":3,"outputTokens":12,"cacheReadInputTokens":18026,"cacheCreationInputTokens":5958,"webSearchRequests":0,"costUSD":0.046565499999999996,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"uuid":"53139e08-b4f3-4f94-b129-82759f77fdca"}
|
||||
@@ -0,0 +1,5 @@
|
||||
{"type":"thread.started","thread_id":"019c3242-955e-7140-9978-517f0b5a22cb"}
|
||||
{"type":"turn.started"}
|
||||
{"type":"item.completed","item":{"id":"item_0","type":"reasoning","text":"**Confirming simple greeting task**"}}
|
||||
{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"Hello!"}}
|
||||
{"type":"turn.completed","usage":{"input_tokens":8458,"cached_input_tokens":6912,"output_tokens":32}}
|
||||
306
apps/server/test/integration/real-providers/schema-retry.test.ts
Normal file
306
apps/server/test/integration/real-providers/schema-retry.test.ts
Normal file
@@ -0,0 +1,306 @@
|
||||
/**
|
||||
* Schema Validation & Retry Integration Tests
|
||||
*
|
||||
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
|
||||
* They are SKIPPED by default and should only be run manually for validation.
|
||||
*
|
||||
* To run these tests:
|
||||
* ```bash
|
||||
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/schema-retry.test.ts --test-timeout=300000
|
||||
* ```
|
||||
*
|
||||
* Tests covered:
|
||||
* - Valid JSON output validation
|
||||
* - Questions status parsing
|
||||
* - Schema validation failure with retry
|
||||
* - Max retry limit handling
|
||||
*
|
||||
* Estimated cost: ~$0.20 per full run (includes retries)
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
||||
import {
|
||||
createRealProviderHarness,
|
||||
describeRealClaude,
|
||||
REAL_TEST_TIMEOUT,
|
||||
EXTENDED_TEST_TIMEOUT,
|
||||
type RealProviderHarness,
|
||||
} from './harness.js';
|
||||
import { MINIMAL_PROMPTS } from './prompts.js';
|
||||
import type { AgentResumedEvent, AgentCrashedEvent } from '../../../events/types.js';
|
||||
|
||||
describeRealClaude('Schema Validation & Retry', () => {
|
||||
let harness: RealProviderHarness;
|
||||
|
||||
beforeAll(async () => {
|
||||
console.log('\n=== Running Schema Validation & Retry Tests ===');
|
||||
console.log('These tests call the real Claude API and incur costs.');
|
||||
console.log('Retry tests may take longer and cost more.\n');
|
||||
harness = await createRealProviderHarness({ provider: 'claude' });
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await harness.cleanup();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
harness.clearEvents();
|
||||
});
|
||||
|
||||
describe('Valid Output', () => {
|
||||
it(
|
||||
'validates done status output',
|
||||
async () => {
|
||||
// Spawn agent with minimal done prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.done,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify completion
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
// No retry events should have been emitted
|
||||
const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
|
||||
expect(resumeEvents.length).toBe(0);
|
||||
|
||||
console.log(' Status: idle (valid done output)');
|
||||
console.log(' Result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates questions status output',
|
||||
async () => {
|
||||
// Spawn agent with questions prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.questions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify questions were validated
|
||||
expect(questions).toBeTruthy();
|
||||
expect(questions?.questions).toBeInstanceOf(Array);
|
||||
expect(questions?.questions.length).toBeGreaterThan(0);
|
||||
|
||||
// Each question should have id and question fields
|
||||
for (const q of questions?.questions ?? []) {
|
||||
expect(q.id).toBeTruthy();
|
||||
expect(q.question).toBeTruthy();
|
||||
}
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('waiting_for_input');
|
||||
|
||||
// No retry events
|
||||
const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
|
||||
expect(resumeEvents.length).toBe(0);
|
||||
|
||||
console.log(' Status: waiting_for_input (valid questions output)');
|
||||
console.log(' Questions:', questions?.questions.length);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates multiple questions',
|
||||
async () => {
|
||||
// Spawn agent with multiple questions prompt
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.multipleQuestions,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for waiting_for_input
|
||||
const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
// Verify multiple questions
|
||||
expect(questions?.questions.length).toBeGreaterThanOrEqual(2);
|
||||
|
||||
// Each question should have unique ID
|
||||
const ids = questions?.questions.map((q) => q.id) ?? [];
|
||||
const uniqueIds = new Set(ids);
|
||||
expect(uniqueIds.size).toBe(ids.length);
|
||||
|
||||
console.log(' Questions:', questions?.questions.map((q) => q.id).join(', '));
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Retry Logic', () => {
|
||||
it(
|
||||
'retries when output does not match schema',
|
||||
async () => {
|
||||
// Prompt that produces non-JSON first, then valid JSON
|
||||
// Note: Claude may or may not produce invalid output first
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.badThenGood,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion (may involve retries)
|
||||
const result = await harness.waitForAgentCompletion(agent.id, EXTENDED_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
|
||||
// Either succeeded with retry OR succeeded first time
|
||||
expect(['idle', 'crashed']).toContain(dbAgent?.status);
|
||||
|
||||
// Check for retry events
|
||||
const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
|
||||
console.log(' Retry attempts:', resumeEvents.length);
|
||||
console.log(' Final status:', dbAgent?.status);
|
||||
|
||||
if (dbAgent?.status === 'idle') {
|
||||
expect(result?.success).toBe(true);
|
||||
console.log(' Result:', result?.message);
|
||||
} else {
|
||||
// Crashed after max retries
|
||||
const crashedEvents = harness.getEventsByType<AgentCrashedEvent>('agent:crashed');
|
||||
expect(crashedEvents.length).toBeGreaterThan(0);
|
||||
console.log(' Crashed after retries');
|
||||
}
|
||||
},
|
||||
EXTENDED_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'extracts JSON from markdown code blocks',
|
||||
async () => {
|
||||
// Prompt that produces JSON wrapped in markdown
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `Output the result wrapped in a markdown code block like this:
|
||||
\`\`\`json
|
||||
{"status":"done","result":"extracted from markdown"}
|
||||
\`\`\``,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' Status:', dbAgent?.status);
|
||||
console.log(' Result:', result?.message);
|
||||
|
||||
// Should succeed (JSON extraction from code block)
|
||||
if (dbAgent?.status === 'idle') {
|
||||
expect(result?.success).toBe(true);
|
||||
}
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'extracts JSON from text with surrounding content',
|
||||
async () => {
|
||||
// Prompt that produces JSON with text before it
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: `First say "Here is my response:" then output the JSON:
|
||||
{"status":"done","result":"extracted from text"}`,
|
||||
mode: 'execute',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
// Wait for completion
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
console.log(' Status:', dbAgent?.status);
|
||||
console.log(' Result:', result?.message);
|
||||
|
||||
// Should succeed (JSON extraction from last {...} block)
|
||||
if (dbAgent?.status === 'idle') {
|
||||
expect(result?.success).toBe(true);
|
||||
}
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
|
||||
describe('Mode-Specific Schemas', () => {
|
||||
it(
|
||||
'validates discuss mode output',
|
||||
async () => {
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.discussComplete,
|
||||
mode: 'discuss',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
console.log(' Discuss mode result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates plan mode output',
|
||||
async () => {
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.planComplete,
|
||||
mode: 'plan',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
console.log(' Plan mode result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
|
||||
it(
|
||||
'validates detail mode output',
|
||||
async () => {
|
||||
const agent = await harness.agentManager.spawn({
|
||||
taskId: null,
|
||||
prompt: MINIMAL_PROMPTS.detailComplete,
|
||||
mode: 'detail',
|
||||
provider: 'claude',
|
||||
});
|
||||
|
||||
const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
|
||||
|
||||
const dbAgent = await harness.agentRepository.findById(agent.id);
|
||||
expect(dbAgent?.status).toBe('idle');
|
||||
expect(result?.success).toBe(true);
|
||||
|
||||
console.log(' Detail mode result:', result?.message);
|
||||
},
|
||||
REAL_TEST_TIMEOUT
|
||||
);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user