refactor: Restructure monorepo to apps/server/ and apps/web/ layout

Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt
standard monorepo conventions (apps/ for runnable apps, packages/
for reusable libraries). Update all config files, shared package
imports, test fixtures, and documentation to reflect new paths.

Key fixes:
- Update workspace config to ["apps/*", "packages/*"]
- Update tsconfig.json rootDir/include for apps/server/
- Add apps/web/** to vitest exclude list
- Update drizzle.config.ts schema path
- Fix ensure-schema.ts migration path detection (3 levels up in dev,
  2 levels up in dist)
- Fix tests/integration/cli-server.test.ts import paths
- Update packages/shared imports to apps/server/ paths
- Update all docs/ files with new paths
This commit is contained in:
Lukas May
2026-03-03 11:22:53 +01:00
parent 8c38d958ce
commit 34578d39c6
535 changed files with 75452 additions and 687 deletions

View File

@@ -0,0 +1,203 @@
/**
* Agent Working Directory Verification Tests
*
* Tests that verify agents actually run in their intended working directories.
* These tests use simple shell commands to prove the agent execution location.
*
* IMPORTANT: These tests spawn real CLI processes and may incur API costs.
* They are SKIPPED by default to prevent accidental charges.
*
* To run these tests:
* ```bash
* REAL_WORKDIR_TESTS=1 npm test -- src/test/integration/agent-workdir-verification.test.ts --test-timeout=120000
* ```
*/
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { mkdtemp, rm, readFile } from 'node:fs/promises';
import { existsSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { MultiProviderAgentManager } from '../../agent/manager.js';
import { createTestDatabase } from '../../db/repositories/drizzle/test-helpers.js';
import {
DrizzleAgentRepository,
DrizzleProjectRepository,
DrizzleAccountRepository,
DrizzleInitiativeRepository,
} from '../../db/repositories/drizzle/index.js';
import { EventEmitterBus } from '../../events/bus.js';
// Opt-in guard: these tests spawn real provider CLIs and may incur API costs,
// so they only run when REAL_WORKDIR_TESTS is set in the environment.
const SHOULD_SKIP = !process.env.REAL_WORKDIR_TESTS;
// Per-test budget for a real agent round-trip (spawn + poll to a terminal state).
const TEST_TIMEOUT = 60000;

describe.skipIf(SHOULD_SKIP)('Agent Working Directory Verification', () => {
  let tempDir: string;
  let agentManager: MultiProviderAgentManager;
  let agentRepository: DrizzleAgentRepository;

  beforeAll(async () => {
    if (SHOULD_SKIP) return;
    console.log('\n=== Running Agent Working Directory Tests ===');
    console.log('These tests verify agents run in correct working directories.\n');
    // Create temp directory for test workspace
    tempDir = await mkdtemp(join(tmpdir(), 'cw-workdir-test-'));
    // Set up test database and repositories
    const db = await createTestDatabase();
    const eventBus = new EventEmitterBus();
    agentRepository = new DrizzleAgentRepository(db);
    const projectRepository = new DrizzleProjectRepository(db);
    const accountRepository = new DrizzleAccountRepository(db);
    agentManager = new MultiProviderAgentManager(
      agentRepository,
      tempDir,
      projectRepository,
      accountRepository,
      eventBus,
    );
  });

  afterAll(async () => {
    if (SHOULD_SKIP || !tempDir) return;
    try {
      await rm(tempDir, { recursive: true });
    } catch (err) {
      // Best-effort cleanup: never fail the suite on temp-dir removal.
      console.warn('Failed to cleanup temp directory:', err);
    }
  });

  it('spawns agent in correct standalone working directory', async () => {
    // Prompt instructs the agent to record its own pwd so we can compare it
    // against the directory the manager intended to spawn it in.
    const prompt = `
Write your current working directory to a file called 'verify-pwd.txt'.
Use this exact bash command:
pwd > verify-pwd.txt
Then output the signal: {"done": true}
`.trim();
    // Spawn standalone agent
    const agent = await agentManager.spawn({
      taskId: null,
      prompt,
      mode: 'execute',
      provider: 'claude',
    });
    expect(agent.id).toBeTruthy();
    expect(agent.status).toBe('running');
    // Wait for completion (poll agent status)
    let attempts = 0;
    const maxAttempts = 60; // 60 seconds timeout
    while (attempts < maxAttempts) {
      await new Promise(resolve => setTimeout(resolve, 1000));
      attempts++;
      const currentAgent = await agentRepository.findById(agent.id);
      if (!currentAgent || currentAgent.status !== 'running') {
        break;
      }
    }
    // Verify final agent state
    const completedAgent = await agentRepository.findById(agent.id);
    expect(completedAgent).toBeTruthy();
    expect(completedAgent!.status).not.toBe('running');
    // Get the agent's expected working directory
    const expectedWorkdir = join(tempDir, 'agent-workdirs', agent.name, 'workspace');
    // Read diagnostic files
    const diagnosticFile = join(expectedWorkdir, '.cw', 'spawn-diagnostic.json');
    const expectedPwdFile = join(expectedWorkdir, '.cw', 'expected-pwd.txt');
    const verifyPwdFile = join(expectedWorkdir, 'verify-pwd.txt');
    // Verify diagnostic files exist
    expect(existsSync(diagnosticFile), 'spawn diagnostic file should exist').toBe(true);
    expect(existsSync(expectedPwdFile), 'expected pwd file should exist').toBe(true);
    // Read diagnostic data
    const diagnostic = JSON.parse(await readFile(diagnosticFile, 'utf-8'));
    const expectedPwd = (await readFile(expectedPwdFile, 'utf-8')).trim();
    console.log('Diagnostic data:', diagnostic);
    console.log('Expected working directory:', expectedPwd);
    // Verify diagnostic consistency
    expect(diagnostic.intendedCwd).toBe(expectedWorkdir);
    expect(diagnostic.cwdExistsAtSpawn).toBe(true);
    expect(expectedPwd).toBe(expectedWorkdir);
    // The critical test: verify the agent actually wrote the file in the expected location
    if (existsSync(verifyPwdFile)) {
      const actualPwd = (await readFile(verifyPwdFile, 'utf-8')).trim();
      console.log('Agent reported working directory:', actualPwd);
      // This is the key verification: the pwd reported by the agent should match expected
      expect(actualPwd).toBe(expectedWorkdir);
    } else {
      // If the file doesn't exist, the agent either failed or ran somewhere else
      console.warn('Agent did not create verify-pwd.txt file');
      console.log('Expected at:', verifyPwdFile);
      // Let's check if it was created elsewhere (debugging)
      const alternativeLocations = [
        join(tempDir, 'verify-pwd.txt'),
        join(process.cwd(), 'verify-pwd.txt'),
      ];
      for (const loc of alternativeLocations) {
        if (existsSync(loc)) {
          const content = await readFile(loc, 'utf-8');
          console.log(`Found verify-pwd.txt at unexpected location ${loc}:`, content.trim());
        }
      }
      throw new Error('Agent did not create pwd verification file in expected location');
    }
  }, TEST_TIMEOUT);

  it('creates diagnostic files with correct metadata', async () => {
    const prompt = `Output the signal: {"done": true}`;
    const agent = await agentManager.spawn({
      taskId: null,
      prompt,
      mode: 'execute',
      provider: 'claude',
    });
    // Wait a bit for spawn to complete
    await new Promise(resolve => setTimeout(resolve, 2000));
    const expectedWorkdir = join(tempDir, 'agent-workdirs', agent.name, 'workspace');
    const diagnosticFile = join(expectedWorkdir, '.cw', 'spawn-diagnostic.json');
    const expectedPwdFile = join(expectedWorkdir, '.cw', 'expected-pwd.txt');
    // Verify files exist immediately after spawn
    expect(existsSync(diagnosticFile), 'diagnostic file should be created after spawn').toBe(true);
    expect(existsSync(expectedPwdFile), 'expected pwd file should be created').toBe(true);
    // Verify diagnostic content
    const diagnostic = JSON.parse(await readFile(diagnosticFile, 'utf-8'));
    const expectedPwd = (await readFile(expectedPwdFile, 'utf-8')).trim();
    expect(diagnostic.agentId).toBe(agent.id);
    expect(diagnostic.alias).toBe(agent.name);
    expect(diagnostic.intendedCwd).toBe(expectedWorkdir);
    expect(diagnostic.provider).toBe('claude');
    expect(diagnostic.cwdExistsAtSpawn).toBe(true);
    expect(diagnostic.customCwdProvided).toBe(false);
    expect(typeof diagnostic.timestamp).toBe('string');
    expect(Array.isArray(diagnostic.args)).toBe(true);
    expect(expectedPwd).toBe(expectedWorkdir);
    // Fix: this test also spawns a real agent, so it needs the extended
    // timeout just like the first test (vitest's default is only 5s).
  }, TEST_TIMEOUT);
});

View File

@@ -0,0 +1,232 @@
/**
* Integration test to reproduce and fix the crash marking race condition.
*
* This test simulates the exact scenario where agents complete successfully
* but get marked as crashed due to timing issues in the output handler.
*/
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { writeFile, mkdir, rm } from 'node:fs/promises';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { randomBytes } from 'node:crypto';
import { OutputHandler } from '../../agent/output-handler.js';
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
// Minimal agent row shape used by the mock repository in this suite. Mirrors
// the persisted agent record; only `status` is mutated during these tests.
interface TestAgent {
  id: string;
  name: string;
  // Lifecycle state the output handler transitions between.
  status: 'idle' | 'running' | 'waiting_for_input' | 'stopped' | 'crashed';
  mode: 'execute' | 'discuss' | 'plan' | 'detail' | 'refine';
  taskId: string | null;
  sessionId: string | null;
  worktreeId: string;
  createdAt: Date;
  updatedAt: Date;
  provider: string;
  accountId: string | null;
  pid: number | null; // OS process id while the agent is running
  outputFilePath: string | null; // JSONL stream the output handler tails
  result: string | null;
  pendingQuestions: string | null;
  initiativeId: string | null;
  userDismissedAt: Date | null;
  exitCode: number | null;
}
describe('Crash marking race condition', () => {
  let outputHandler: OutputHandler;
  let testAgent: TestAgent;
  let testDir: string;
  let mockRepo: AgentRepository;
  // Track all repository calls
  let updateCalls: Array<{ id: string; data: any }> = [];
  let finalAgentStatus: string | null = null;

  beforeEach(async () => {
    updateCalls = [];
    finalAgentStatus = null;
    // Create test directory structure
    testDir = join(tmpdir(), `crash-test-${randomBytes(8).toString('hex')}`);
    const outputDir = join(testDir, '.cw/output');
    await mkdir(outputDir, { recursive: true });
    // Create test agent
    testAgent = {
      id: 'test-agent-id',
      name: 'test-agent',
      status: 'running',
      mode: 'refine',
      taskId: 'task-1',
      sessionId: 'session-1',
      worktreeId: 'worktree-1',
      createdAt: new Date(),
      updatedAt: new Date(),
      provider: 'claude',
      accountId: null,
      pid: 12345,
      outputFilePath: join(testDir, 'output.jsonl'),
      result: null,
      pendingQuestions: null,
      initiativeId: 'init-1',
      userDismissedAt: null,
      exitCode: null
    };
    // Mock repository that tracks all update calls; only findById/update are
    // exercised by handleCompletion — everything else throws loudly.
    mockRepo = {
      async findById(id: string) {
        return id === testAgent.id ? { ...testAgent } : null;
      },
      async update(id: string, data: any) {
        updateCalls.push({ id, data });
        if (data.status) {
          finalAgentStatus = data.status;
          testAgent.status = data.status;
        }
        return { ...testAgent, ...data };
      },
      async create() { throw new Error('Not implemented'); },
      async findAll() { throw new Error('Not implemented'); },
      async findByStatus() { throw new Error('Not implemented'); },
      async findByTaskId() { throw new Error('Not implemented'); },
      async findByName() { throw new Error('Not implemented'); },
      async findBySessionId() { throw new Error('Not implemented'); },
      async delete() { throw new Error('Not implemented'); }
    };
    outputHandler = new OutputHandler(mockRepo);
  });

  afterEach(async () => {
    try {
      await rm(testDir, { recursive: true });
    } catch {
      // Ignore cleanup errors
    }
  });

  it('should NOT mark agent as crashed when signal.json indicates completion', async () => {
    // SETUP: Create a valid completion signal that should prevent crash marking
    const signalPath = join(testDir, '.cw/output/signal.json');
    const signalContent = {
      status: 'questions',
      questions: [
        { id: 'q1', question: 'Test question?' }
      ]
    };
    await writeFile(signalPath, JSON.stringify(signalContent, null, 2));
    // SETUP: Create empty output file to simulate "no new output detected" scenario
    const outputFilePath = join(testDir, 'output.jsonl');
    await writeFile(outputFilePath, ''); // Empty file simulates the race condition
    // Mock active agent with output file path
    const mockActive = {
      outputFilePath,
      streamSessionId: 'session-1'
    };
    // Mock getAgentWorkdir function — receives worktreeId, not agentId
    const getAgentWorkdir = (worktreeId: string) => {
      expect(worktreeId).toBe(testAgent.worktreeId);
      return testDir;
    };
    // EXECUTE: Call handleCompletion which should trigger the race condition scenario
    // This simulates: no stream text + no new file content + valid signal.json
    await (outputHandler as any).handleCompletion(
      testAgent.id,
      mockActive,
      getAgentWorkdir
    );
    // VERIFY: Agent should NOT be marked as crashed
    console.log('Update calls:', updateCalls);
    console.log('Final agent status:', finalAgentStatus);
    expect(updateCalls.length).toBeGreaterThan(0);
    expect(finalAgentStatus).not.toBe('crashed');
    // Should be marked with the appropriate completion status
    expect(['idle', 'waiting_for_input', 'stopped']).toContain(finalAgentStatus);
  });

  it('should mark agent as crashed when no completion signal exists', async () => {
    // SETUP: No signal.json file exists - agent should be marked as crashed
    const outputFilePath = join(testDir, 'output.jsonl');
    await writeFile(outputFilePath, ''); // Empty file
    const mockActive = {
      outputFilePath,
      streamSessionId: 'session-1'
    };
    // Fix: the callback receives a worktreeId (see the first test above), so
    // name the (unused) parameter accordingly instead of the misleading `agentId`.
    const getAgentWorkdir = (_worktreeId: string) => testDir;
    // EXECUTE: This should mark agent as crashed since no completion signal exists
    await (outputHandler as any).handleCompletion(
      testAgent.id,
      mockActive,
      getAgentWorkdir
    );
    // VERIFY: Agent SHOULD be marked as crashed
    expect(finalAgentStatus).toBe('crashed');
  });

  it('should handle the exact slim-wildebeest scenario', async () => {
    // SETUP: Reproduce the exact conditions that slim-wildebeest had
    const signalPath = join(testDir, '.cw/output/signal.json');
    const exactSignalContent = {
      "status": "questions",
      "questions": [
        {
          "id": "q1",
          "question": "What UI framework/styling system is the admin UI currently using that needs to be replaced?"
        },
        {
          "id": "q2",
          "question": "What specific problems with the current admin UI are we solving? (e.g., poor developer experience, design inconsistency, performance issues, lack of accessibility)"
        }
      ]
    };
    await writeFile(signalPath, JSON.stringify(exactSignalContent, null, 2));
    // Create SUMMARY.md like slim-wildebeest had
    const summaryPath = join(testDir, '.cw/output/SUMMARY.md');
    const summaryContent = `---
files_modified: []
---
Initiative page is essentially empty — lacks context, scope, goals, and technical approach. Requested clarification on current state, problems being solved, scope boundaries, and success criteria before proposing meaningful improvements.`;
    await writeFile(summaryPath, summaryContent);
    // Simulate the output file scenario
    const outputFilePath = join(testDir, 'output.jsonl');
    await writeFile(outputFilePath, 'some initial content\n'); // Some content but no new lines
    const mockActive = {
      outputFilePath,
      streamSessionId: 'session-1'
    };
    // Fix: same rename as above — handleCompletion passes a worktreeId here.
    const getAgentWorkdir = (_worktreeId: string) => testDir;
    // EXECUTE: This is the exact scenario that caused slim-wildebeest to be marked as crashed
    await (outputHandler as any).handleCompletion(
      testAgent.id,
      mockActive,
      getAgentWorkdir
    );
    // VERIFY: This should NOT be marked as crashed
    console.log('slim-wildebeest scenario - Final status:', finalAgentStatus);
    console.log('slim-wildebeest scenario - Update calls:', updateCalls);
    expect(finalAgentStatus).not.toBe('crashed');
    expect(['idle', 'waiting_for_input', 'stopped']).toContain(finalAgentStatus);
  });
});

View File

@@ -0,0 +1,244 @@
/**
* Full-Flow Cassette Integration Test
*
* Cassette-backed variant of the full multi-agent workflow test.
* Runs the same discuss → plan → detail → execute pipeline but intercepts
* subprocess spawning with CassetteProcessManager — no real API calls in CI.
*
* Recording (one-time, costs ~$25):
* CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
* # Commit the generated src/test/cassettes/<hash>.json files afterward
*
* Replay (default — runs in seconds):
* npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts
*
* Force re-record (overwrites existing cassettes):
* CW_CASSETTE_FORCE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
*/
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { existsSync, readdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import type { Phase, Task } from '../../../db/schema.js';
import type { AgentResult } from '../../../agent/types.js';
import { buildExecutePrompt } from '../../../agent/prompts/index.js';
import { CassetteStore } from '../../cassette/store.js';
import { CassetteProcessManager, type CassetteMode } from '../../cassette/process-manager.js';
import {
createFullFlowHarness,
type FullFlowHarness,
} from './harness.js';
import {
printHeader,
printDiscussResult,
printPlanResult,
printDetailResult,
printExecuteResult,
printFinalSummary,
type ExecutedTask,
} from './report.js';
// =============================================================================
// Constants
// =============================================================================
// True when this run should hit the real provider and write cassette files.
const RECORDING =
  process.env.CW_CASSETTE_FORCE_RECORD === '1' || process.env.CW_CASSETTE_RECORD === '1';
/**
 * Test timeout.
 * - Replay: 5 min (cassettes complete in seconds; cap is generous headroom)
 * - Record: 60 min (real agents doing discuss/plan/detail/execute take API time)
 */
const CASSETTE_FLOW_TIMEOUT = RECORDING ? 60 * 60_000 : 5 * 60_000;
// ESM has no __dirname; derive it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Overridable so ad-hoc runs can point at an alternate cassette directory.
const CASSETTE_DIR =
  process.env.CW_CASSETTE_DIR ?? join(__dirname, '../../cassettes');
// =============================================================================
// Mode helper
// =============================================================================
/**
 * Resolve the cassette mode from the environment.
 * Force-record wins over plain record; with neither flag set we replay.
 */
function cassetteMode(): CassetteMode {
  const env = process.env;
  if (env.CW_CASSETTE_FORCE_RECORD === '1') {
    return 'record';
  }
  return env.CW_CASSETTE_RECORD === '1' ? 'auto' : 'replay';
}
/**
 * Whether the suite can run: either we are recording (no cassettes needed),
 * or at least one recorded `.json` cassette exists in CASSETTE_DIR. Keeps
 * `npm test` green on a fresh clone before cassettes have been committed.
 */
function cassettesAvailable(): boolean {
  // Recording runs never require pre-existing cassettes.
  if (cassetteMode() !== 'replay') {
    return true;
  }
  if (!existsSync(CASSETTE_DIR)) {
    return false;
  }
  const entries = readdirSync(CASSETTE_DIR);
  return entries.filter((name) => name.endsWith('.json')).length > 0;
}
// =============================================================================
// Test
// =============================================================================
// End-to-end pipeline over recorded cassettes: discuss → plan → detail → execute.
describe.skipIf(!cassettesAvailable())('full flow (cassette replay)', () => {
  let harness: FullFlowHarness;
  // NOTE(review): captured when the describe body runs (collection time), so the
  // printed duration includes suite setup — confirm that is intended.
  const startedAt = Date.now();

  beforeAll(async () => {
    const store = new CassetteStore(CASSETTE_DIR);
    const mode = cassetteMode();
    harness = await createFullFlowHarness('Add complete() method to TodoStore', {
      processManagerFactory: (workspaceRoot, projectRepo) =>
        new CassetteProcessManager(workspaceRoot, projectRepo, store, mode),
    });
    printHeader(harness.initiative.name);
    console.log(` Cassette mode : ${mode}`);
    console.log(` Cassette dir : ${CASSETTE_DIR}`);
    console.log(` Initiative ID : ${harness.initiative.id}`);
    console.log(` Workspace : ${harness.workspaceRoot}`);
  }, CASSETTE_FLOW_TIMEOUT);

  afterAll(async () => {
    if (harness) await harness.cleanup();
  });

  it(
    'runs the complete multi-agent workflow from cassettes',
    async () => {
      const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness;
      const initiativeId = initiative.id;
      // ── Stage 2: Discuss ───────────────────────────────────────────────────
      console.log('\n\n>>> Stage 2: DISCUSS <<<');
      const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId });
      expect(discussAgent.id).toBeTruthy();
      console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`);
      const discussResult = await harness.driveToCompletion(
        discussAgent.id,
        'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.',
        CASSETTE_FLOW_TIMEOUT,
      );
      printDiscussResult(discussAgent.id, discussResult);
      // Discuss failure is non-fatal: later stages may still succeed from context.
      if (!discussResult?.success) {
        console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage');
      }
      // ── Stage 3: Plan ──────────────────────────────────────────────────────
      console.log('\n\n>>> Stage 3: PLAN <<<');
      const planAgent = await caller.spawnArchitectPlan({ initiativeId });
      expect(planAgent.id).toBeTruthy();
      console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`);
      const planResult = await harness.driveToCompletion(
        planAgent.id,
        'Keep it simple.',
        CASSETTE_FLOW_TIMEOUT,
      );
      expect(planResult).toBeTruthy();
      // Planning must have persisted at least one phase for the initiative.
      const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId);
      expect(phases.length).toBeGreaterThan(0);
      printPlanResult(phases);
      // ── Stage 4: Detail (per phase) ────────────────────────────────────────
      console.log('\n\n>>> Stage 4: DETAIL <<<');
      for (const phase of phases) {
        const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id });
        expect(detailAgent.id).toBeTruthy();
        console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`);
        const detailResult = await harness.driveToCompletion(
          detailAgent.id,
          'Keep it simple.',
          CASSETTE_FLOW_TIMEOUT,
        );
        expect(detailResult).toBeTruthy();
        // Each detailed phase must yield at least one auto-executable task.
        const phaseTasks = await taskRepository.findByPhaseId(phase.id);
        const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
        expect(executeTasks.length).toBeGreaterThan(0);
        printDetailResult(phase, phaseTasks);
      }
      // ── Stage 5: Execute ───────────────────────────────────────────────────
      console.log('\n\n>>> Stage 5: EXECUTE <<<');
      const allTasks = await gatherAllExecuteTasks(taskRepository, phases);
      console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`);
      const executed: ExecutedTask[] = [];
      // Execute tasks sequentially — agents share the workspace checkout.
      for (const task of allTasks) {
        console.log(` Spawning execute agent for: "${task.name}"`);
        const execAgent = await agentManager.spawn({
          taskId: task.id,
          prompt: buildExecutePrompt(task.description ?? task.name),
          mode: 'execute',
          initiativeId,
          phaseId: task.phaseId ?? undefined,
          inputContext: {
            initiative,
            task,
          },
        });
        console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
        const result = await harness.driveToCompletion(
          execAgent.id,
          'Use your best judgment and keep it simple.',
          CASSETTE_FLOW_TIMEOUT,
        );
        executed.push({ task, result });
        const icon = result?.success ? '✓' : '✗';
        console.log(` ${icon} Completed with success=${result?.success ?? null}`);
        if (result && !result.success) {
          console.log(` Message: ${result.message?.slice(0, 200)}`);
        }
      }
      printExecuteResult(executed);
      // ── Assertions ─────────────────────────────────────────────────────────
      expect(executed.length).toBeGreaterThan(0);
      // Individual execute failures are reported but do not fail the suite.
      const allSucceeded = executed.every((e) => e.result?.success === true);
      if (!allSucceeded) {
        const failed = executed.filter((e) => !e.result?.success);
        console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`);
      }
      // ── Final summary ──────────────────────────────────────────────────────
      printFinalSummary(
        initiative.name,
        phases,
        allTasks,
        executed,
        Date.now() - startedAt,
      );
    },
    CASSETTE_FLOW_TIMEOUT,
  );
});
// =============================================================================
// Helpers
// =============================================================================
/**
 * Collect every auto-executable task across the given phases.
 *
 * Fetches each phase's tasks in parallel (the sequential await-in-loop was
 * accidental O(phases) round-trips); `Promise.all` preserves phase order, so
 * the returned task ordering is identical to the original implementation.
 *
 * @param taskRepository repository used to look up tasks per phase
 * @param phases phases whose tasks should be scanned
 * @returns tasks with category 'execute' and type 'auto', grouped by phase order
 */
async function gatherAllExecuteTasks(
  taskRepository: FullFlowHarness['taskRepository'],
  phases: Phase[],
): Promise<Task[]> {
  const perPhase = await Promise.all(
    phases.map((phase) => taskRepository.findByPhaseId(phase.id)),
  );
  return perPhase.flatMap((phaseTasks) =>
    phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto'),
  );
}

View File

@@ -0,0 +1,399 @@
/**
* Full-Flow Test Harness
*
* Wires up the complete system with real agents for end-to-end multi-agent
* workflow testing: discuss → plan → detail → execute.
*
* Unlike the standard TestHarness (MockAgentManager) or RealProviderHarness
* (agents only), this harness adds:
* - All 11 repositories
* - tRPC caller for architect/agent procedures
* - A self-contained fixture git repo (todo-api) for agents to work on
* - Helpers for driving agents through question/answer loops
*
* Used by full-flow-cassette.test.ts (replay) and for manual recording runs.
*/
import { mkdtemp, rm, cp } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { execSync } from 'node:child_process';
import type { DrizzleDatabase } from '../../../db/index.js';
import type { DomainEvent } from '../../../events/types.js';
import { EventEmitterBus } from '../../../events/bus.js';
import { MultiProviderAgentManager } from '../../../agent/manager.js';
import type { AgentResult, PendingQuestions } from '../../../agent/types.js';
import type { Initiative, Project, Phase, Task } from '../../../db/schema.js';
import type { InitiativeRepository } from '../../../db/repositories/initiative-repository.js';
import type { PhaseRepository } from '../../../db/repositories/phase-repository.js';
import type { TaskRepository } from '../../../db/repositories/task-repository.js';
import type { MessageRepository } from '../../../db/repositories/message-repository.js';
import type { AgentRepository } from '../../../db/repositories/agent-repository.js';
import type { PageRepository } from '../../../db/repositories/page-repository.js';
import type { ProjectRepository } from '../../../db/repositories/project-repository.js';
import type { AccountRepository } from '../../../db/repositories/account-repository.js';
import type { ChangeSetRepository } from '../../../db/repositories/change-set-repository.js';
import type { LogChunkRepository } from '../../../db/repositories/log-chunk-repository.js';
import type { ConversationRepository } from '../../../db/repositories/conversation-repository.js';
import type { ProcessManager } from '../../../agent/process-manager.js';
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
import { createRepositories } from '../../../container.js';
import { DefaultDispatchManager } from '../../../dispatch/manager.js';
import { appRouter, createCallerFactory } from '../../../trpc/router.js';
import { createContext } from '../../../trpc/context.js';
// =============================================================================
// CapturingEventBus
// =============================================================================
/**
 * Event bus that records every emitted event so tests can assert on the
 * emission history after the fact.
 */
export class CapturingEventBus extends EventEmitterBus {
  /** All events emitted through this bus, in emission order. */
  emittedEvents: DomainEvent[] = [];

  emit<T extends DomainEvent>(event: T): void {
    // Capture before delegating so listeners observe an up-to-date log.
    this.emittedEvents.push(event);
    super.emit(event);
  }

  /** Captured events whose `type` matches, in emission order. */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
    const matches: T[] = [];
    for (const candidate of this.emittedEvents) {
      if (candidate.type === type) {
        matches.push(candidate as T);
      }
    }
    return matches;
  }

  /** Drop the capture log (listeners remain attached). */
  clearEvents(): void {
    this.emittedEvents = [];
  }
}
// =============================================================================
// Sleep helper
// =============================================================================
/** Resolve after approximately `ms` milliseconds (promisified setTimeout). */
export async function sleep(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
// =============================================================================
// tRPC caller type
// =============================================================================

// Build the typed caller factory once; harness instances create callers bound
// to their own context.
const createCaller = createCallerFactory(appRouter);
export type FullFlowCaller = ReturnType<typeof createCaller>;
// =============================================================================
// FullFlowHarness interface
// =============================================================================
/** Status of an agent that requires attention: done, waiting for answers, or crashed */
export type AgentAttentionStatus = 'done' | 'waiting' | 'crashed';

/** Everything a full-flow test needs: paths, fixtures, repos, and drivers. */
export interface FullFlowHarness {
  /** Absolute path to the CW workspace (worktrees are created here) */
  workspaceRoot: string;
  /** Absolute path to the cloned todo-api fixture git repo */
  fixtureRoot: string;
  /** The registered todo-api project */
  project: Project;
  /** The initiative created for the test run */
  initiative: Initiative;
  /** tRPC caller (all procedures available) */
  caller: FullFlowCaller;
  /** Real MultiProviderAgentManager */
  agentManager: MultiProviderAgentManager;
  /** In-memory SQLite database */
  db: DrizzleDatabase;
  /** Event bus with capture capability */
  eventBus: CapturingEventBus;
  // All 11 repositories
  initiativeRepository: InitiativeRepository;
  phaseRepository: PhaseRepository;
  taskRepository: TaskRepository;
  messageRepository: MessageRepository;
  agentRepository: AgentRepository;
  pageRepository: PageRepository;
  projectRepository: ProjectRepository;
  accountRepository: AccountRepository;
  changeSetRepository: ChangeSetRepository;
  logChunkRepository: LogChunkRepository;
  conversationRepository: ConversationRepository;
  /**
   * Wait for an agent to reach a terminal status (idle/stopped/crashed).
   * Returns null if the agent enters waiting_for_input.
   */
  waitForAgentCompletion(agentId: string, timeoutMs?: number): Promise<AgentResult | null>;
  /**
   * Poll until the agent needs attention: done (idle/stopped), waiting for input, or crashed.
   * Useful for the question/answer loop in discuss mode.
   */
  waitForAgentAttention(agentId: string, timeoutMs?: number): Promise<AgentAttentionStatus>;
  /**
   * Drive an agent to full completion, answering any questions along the way.
   * Answers all questions with the provided answer string (or a default).
   * Returns null on timeout or if the agent never reaches a terminal result.
   */
  driveToCompletion(
    agentId: string,
    answer?: string,
    timeoutMs?: number,
  ): Promise<AgentResult | null>;
  /**
   * Get captured events filtered by type.
   */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[];
  /**
   * Kill all running agents and remove temp directories.
   */
  cleanup(): Promise<void>;
}
// =============================================================================
// Poll interval
// =============================================================================

// How often agent status is re-read while waiting (milliseconds).
const POLL_INTERVAL_MS = 1500;

// =============================================================================
// Factory
// =============================================================================

// ESM has no __dirname; derive it so the fixture repo can be located on disk.
const __dirname = dirname(fileURLToPath(import.meta.url));
const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');

export interface FullFlowHarnessOptions {
  /** Factory called after workspaceRoot + repos are created. Return a custom ProcessManager. */
  processManagerFactory?: (workspaceRoot: string, projectRepo: ProjectRepository) => ProcessManager;
}
/**
 * Create a full-flow test harness.
 *
 * Setup steps:
 * 1. Copy todo-api fixture into a temp git repo (fixtureRoot).
 * 2. Create workspace temp dir (workspaceRoot) for CW operations.
 * 3. Init in-memory DB + all 11 repos.
 * 4. Wire real MultiProviderAgentManager with all repos.
 * 5. Wire DefaultDispatchManager for execute stage.
 * 6. Create tRPC caller with full context.
 * 7. Register project in DB directly (url = fixtureRoot).
 * 8. Create initiative via tRPC (links project, creates root page).
 *
 * @param initiativeName - Name for the initiative created via tRPC in step 8.
 * @param options - Optional overrides (e.g. a custom ProcessManager factory).
 * @returns Fully wired FullFlowHarness; callers must invoke `cleanup()` when done.
 */
export async function createFullFlowHarness(
  initiativeName = 'Add complete() method to TodoStore',
  options?: FullFlowHarnessOptions,
): Promise<FullFlowHarness> {
  // ── 0. Allow nested claude invocations ────────────────────────────────────
  // Claude Code sets CLAUDECODE in the environment, which prevents nested
  // claude CLI calls from starting ("cannot be launched inside another Claude
  // Code session"). Save and remove it so spawned agents can run normally.
  // It is restored in cleanup().
  const savedClaudeCodeEnv = process.env.CLAUDECODE;
  delete process.env.CLAUDECODE;
  // ── 1. Fixture project ────────────────────────────────────────────────────
  // IMPORTANT: cp(src, dest) puts src INSIDE dest when dest already exists
  // (like `cp -r src dest/` → creates dest/src/). We need dest to NOT exist
  // yet so that cp creates it as a copy of src directly.
  const fixtureBase = await mkdtemp(join(tmpdir(), 'cw-fixture-'));
  const fixtureRoot = join(fixtureBase, 'todo-api'); // does not exist yet
  await cp(FIXTURES_DIR, fixtureRoot, { recursive: true });
  // Verify files landed at the right level before git operations
  execSync(`test -f "${join(fixtureRoot, 'package.json')}"`, { stdio: 'pipe' });
  // Turn the copied fixture into a real git repo with one baseline commit so
  // agents can branch/diff against it.
  execSync('git init', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git config user.email "test@test.com"', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git config user.name "Test"', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git add .', { cwd: fixtureRoot, stdio: 'pipe' });
  execSync('git commit -m "initial todo-api with missing complete()"', {
    cwd: fixtureRoot,
    stdio: 'pipe',
  });
  // ── 2. Workspace root ─────────────────────────────────────────────────────
  // Just a plain temp directory — agent worktrees live under repos/ inside it.
  // No git init needed; the PROJECT clone (repos/<name>-<id>/) is the git repo.
  const workspaceRoot = await mkdtemp(join(tmpdir(), 'cw-workspace-'));
  // ── 3. Database + repositories ────────────────────────────────────────────
  const db = createTestDatabase();
  const repos = createRepositories(db);
  // ── 4. Event bus ──────────────────────────────────────────────────────────
  // Capturing bus records every domain event so tests can assert on them later
  // via getEventsByType().
  const eventBus = new CapturingEventBus();
  // ── 5. Real agent manager ─────────────────────────────────────────────────
  // Allow the caller to substitute a ProcessManager (e.g. to fake spawns).
  const customProcessManager = options?.processManagerFactory?.(workspaceRoot, repos.projectRepository);
  const agentManager = new MultiProviderAgentManager(
    repos.agentRepository,
    workspaceRoot,
    repos.projectRepository,
    repos.accountRepository,
    eventBus,
    undefined, // no credential manager needed for default claude account
    repos.changeSetRepository,
    repos.phaseRepository,
    repos.taskRepository,
    repos.pageRepository,
    repos.logChunkRepository,
    false, // debug
    customProcessManager, // processManagerOverride
  );
  // ── 6. Dispatch manager (for execute stage) ───────────────────────────────
  const dispatchManager = new DefaultDispatchManager(
    repos.taskRepository,
    repos.messageRepository,
    agentManager,
    eventBus,
    repos.initiativeRepository,
    repos.phaseRepository,
  );
  // ── 7. tRPC caller ────────────────────────────────────────────────────────
  const ctx = createContext({
    eventBus,
    serverStartedAt: new Date(),
    processCount: 0,
    agentManager,
    dispatchManager,
    workspaceRoot,
    ...repos,
  });
  const caller = createCaller(ctx);
  // ── 8. Register project directly in DB (bypass tRPC clone) ───────────────
  // url points at the local fixture repo, so later clones stay on-disk.
  const project = await repos.projectRepository.create({
    name: 'todo-api',
    url: fixtureRoot,
  });
  // ── 9. Create initiative via tRPC (creates root page automatically) ───────
  const initiative = await caller.createInitiative({
    name: initiativeName,
    projectIds: [project.id],
  });
  // ── Helpers ───────────────────────────────────────────────────────────────
  /**
   * Poll the DB until the agent reaches a terminal status and return its
   * AgentResult. Returns null when the agent row vanished or the agent is
   * waiting for input (use waitForAgentAttention / driveToCompletion then).
   * Throws after timeoutMs if the agent is still running.
   */
  async function waitForAgentCompletion(
    agentId: string,
    timeoutMs = 120_000,
  ): Promise<AgentResult | null> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const agent = await repos.agentRepository.findById(agentId);
      if (!agent) return null;
      if (agent.status === 'idle' || agent.status === 'stopped' || agent.status === 'crashed') {
        return agentManager.getResult(agentId);
      }
      if (agent.status === 'waiting_for_input') return null;
      await sleep(POLL_INTERVAL_MS);
    }
    throw new Error(`Timeout: agent ${agentId} did not complete within ${timeoutMs}ms`);
  }
  /**
   * Poll until the agent needs attention: finished ('done'), failed
   * ('crashed'), or is blocked on questions ('waiting').
   * A missing agent row is reported as 'crashed'.
   * Throws after timeoutMs if no attention state is reached.
   */
  async function waitForAgentAttention(
    agentId: string,
    timeoutMs = 120_000,
  ): Promise<AgentAttentionStatus> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const agent = await repos.agentRepository.findById(agentId);
      if (!agent) return 'crashed';
      if (agent.status === 'idle' || agent.status === 'stopped') return 'done';
      if (agent.status === 'crashed') return 'crashed';
      if (agent.status === 'waiting_for_input') return 'waiting';
      await sleep(POLL_INTERVAL_MS);
    }
    throw new Error(`Timeout: agent ${agentId} did not reach attention state within ${timeoutMs}ms`);
  }
  /**
   * Repeatedly wait for attention and auto-answer every pending question with
   * `answer` until the agent finishes (or crashes), then return its result.
   * Throws after timeoutMs if the agent never terminates.
   */
  async function driveToCompletion(
    agentId: string,
    answer = 'Use your best judgment and keep it simple.',
    timeoutMs = 10 * 60_000,
  ): Promise<AgentResult | null> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      const remaining = deadline - Date.now();
      if (remaining <= 0) break;
      let status: AgentAttentionStatus;
      try {
        status = await waitForAgentAttention(agentId, Math.min(remaining, 3 * 60_000));
      } catch {
        // Agent is still running (hasn't reached an attention state within the polling
        // window). This is normal for long-running execute agents. Continue the outer
        // loop — the deadline check above will terminate us if we truly time out.
        continue;
      }
      if (status === 'done' || status === 'crashed') {
        return agentManager.getResult(agentId);
      }
      if (status === 'waiting') {
        const pending = await agentManager.getPendingQuestions(agentId);
        if (!pending || pending.questions.length === 0) {
          // Shouldn't happen, but guard against it
          await sleep(POLL_INTERVAL_MS);
          continue;
        }
        // Answer every pending question with the same canned response.
        const answers = Object.fromEntries(
          pending.questions.map((q) => [q.id, answer]),
        );
        await agentManager.resume(agentId, answers);
      }
    }
    throw new Error(`driveToCompletion: agent ${agentId} did not finish within ${timeoutMs}ms`);
  }
  // ── Build and return harness ───────────────────────────────────────────────
  const harness: FullFlowHarness = {
    workspaceRoot,
    fixtureRoot,
    project,
    initiative,
    caller,
    agentManager,
    db,
    eventBus,
    ...repos,
    waitForAgentCompletion,
    waitForAgentAttention,
    driveToCompletion,
    getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
      return eventBus.getEventsByType<T>(type);
    },
    async cleanup() {
      // Kill any running agents
      const agents = await repos.agentRepository.findAll();
      await Promise.allSettled(
        agents
          .filter((a) => a.status === 'running')
          .map((a) => agentManager.stop(a.id)),
      );
      // Restore CLAUDECODE env var
      if (savedClaudeCodeEnv !== undefined) {
        process.env.CLAUDECODE = savedClaudeCodeEnv;
      }
      // Remove temp directories (fixtureBase contains fixtureRoot)
      await Promise.allSettled([
        rm(fixtureBase, { recursive: true, force: true }),
        rm(workspaceRoot, { recursive: true, force: true }),
      ]);
    },
  };
  return harness;
}

View File

@@ -0,0 +1,156 @@
/**
* Full-Flow Test Report Utility
*
* Plain console.log formatters for human-readable output at each stage of the
* full-flow integration test. No external dependencies.
*/
import { execSync } from 'node:child_process';
import { join } from 'node:path';
import type { Phase, Task } from '../../../db/schema.js';
import type { AgentResult } from '../../../agent/types.js';
// =============================================================================
// Types
// =============================================================================
export interface ExecutedTask {
  /** The task that was dispatched to an execute agent. */
  task: Task;
  /** Agent result for that task, or null if the agent never produced one. */
  result: AgentResult | null;
}
// =============================================================================
// Helpers
// =============================================================================
// Visual separators used by the stage reporters below.
const DIVIDER = '═'.repeat(60);
const THIN = '─'.repeat(60);
/** Print a prominent banner (divider, indented title, divider). */
function section(title: string): void {
  const rows = [`\n${DIVIDER}`, ` ${title}`, DIVIDER];
  for (const row of rows) {
    console.log(row);
  }
}
/** Print a single indented report line. */
function line(msg: string): void {
  console.log(' ' + msg);
}
// =============================================================================
// Stage reporters
// =============================================================================
/** Print the opening banner for a full-flow run, with a start timestamp. */
export function printHeader(initiativeName: string): void {
  const startedAt = new Date().toISOString();
  section(`FULL-FLOW TEST: ${initiativeName}`);
  console.log(' Started at: ' + startedAt);
}
/** Report the discuss-stage agent outcome (or a null-result warning). */
export function printDiscussResult(agentId: string, result: AgentResult | null): void {
  console.log(`\n[DISCUSS]`);
  console.log(THIN);
  line(`Agent: ${agentId}`);
  if (!result) {
    line('Result: null (agent may have crashed)');
    return;
  }
  line(`Success: ${result.success}`);
  if (result.message) line(`Message: ${result.message.slice(0, 200)}`);
}
/** Report the plan-stage output: a numbered list of created phases. */
export function printPlanResult(phases: Phase[]): void {
  console.log(`\n[PLAN] ${phases.length} phase(s) created`);
  console.log(THIN);
  let position = 0;
  for (const ph of phases) {
    position += 1;
    line(`${position}. ${ph.name}`);
  }
}
/** Report the detail-stage output: tasks generated for one phase. */
export function printDetailResult(phase: Phase, tasks: Task[]): void {
  console.log(`\n[DETAIL] Phase "${phase.name}" → ${tasks.length} task(s)`);
  console.log(THIN);
  for (const [i, t] of tasks.entries()) {
    const flags = [t.category, t.type, t.requiresApproval ? 'approval-required' : 'auto'].join(', ');
    line(`${i + 1}. ${t.name} [${flags}]`);
    if (t.description) {
      line(` ${t.description.slice(0, 120)}`);
    }
  }
}
/** Report execute-stage outcomes: per-task ✓/✗ plus a success tally. */
export function printExecuteResult(executed: ExecutedTask[]): void {
  const okCount = executed.filter((e) => e.result?.success).length;
  console.log(`\n[EXECUTE] ${okCount}/${executed.length} task(s) succeeded`);
  console.log(THIN);
  executed.forEach(({ task, result }) => {
    line(`${result?.success ? '✓' : '✗'} ${task.name}`);
    if (result && !result.success) {
      line(` Error: ${result.message?.slice(0, 120)}`);
    }
  });
}
/**
 * Print a `git diff HEAD~1 --stat` summary for every agent worktree of the
 * given project, or a placeholder when the worktree base can't be listed.
 */
export function printGitDiff(workspaceRoot: string, projectName: string): void {
  console.log('\n[GIT DIFF — agent worktrees]');
  console.log(THIN);
  // Agent worktrees live under <workspaceRoot>/agent-workdirs/<agent>/<project>
  const worktreesBase = join(workspaceRoot, 'agent-workdirs');
  let entries: string[];
  try {
    entries = execSync(`ls "${worktreesBase}" 2>/dev/null || echo ""`, { encoding: 'utf8' })
      .trim()
      .split('\n')
      .filter(Boolean);
  } catch {
    line('(no agent worktrees found)');
    return;
  }
  for (const entry of entries) {
    const projectDir = join(worktreesBase, entry, projectName);
    try {
      const stat = execSync(`git -C "${projectDir}" diff HEAD~1 --stat 2>/dev/null || echo ""`, {
        encoding: 'utf8',
      }).trim();
      if (stat) {
        line(`Worktree: ${entry}/${projectName}`);
        stat.split('\n').forEach((l) => line(` ${l}`));
      }
    } catch {
      // Worktree might not have commits — skip silently
    }
  }
}
/**
 * Run the fixture project's node test suite and print the outcome.
 * Failures are reported (with captured stdout/stderr) rather than thrown.
 */
export function printNpmTestResult(projectDir: string): void {
  console.log('\n[NPM TEST]');
  console.log(THIN);
  try {
    const output = execSync('node --test src/todo.test.js', {
      cwd: projectDir,
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    line('Tests passed:');
    output.split('\n').forEach((l) => line(` ${l}`));
  } catch (err: unknown) {
    const failure = err as { stdout?: string; stderr?: string; status?: number };
    line(`Tests FAILED (exit ${failure.status ?? '?'})`);
    if (failure.stdout) failure.stdout.split('\n').forEach((l) => line(` ${l}`));
    if (failure.stderr) failure.stderr.split('\n').forEach((l) => line(` ${l}`));
  }
}
/**
 * Print the closing summary banner: duration, phase/task counts, and how many
 * executed tasks succeeded.
 */
export function printFinalSummary(
  initiativeName: string,
  phases: Phase[],
  tasks: Task[],
  executed: ExecutedTask[],
  durationMs: number,
): void {
  const succeededCount = executed.filter((e) => e.result?.success).length;
  section(`SUMMARY: ${initiativeName}`);
  line(`Duration : ${Math.round(durationMs / 1000)}s`);
  line(`Phases : ${phases.length}`);
  line(`Tasks : ${tasks.length}`);
  line(`Executed : ${succeededCount}/${executed.length} succeeded`);
  console.log(DIVIDER);
}

View File

@@ -0,0 +1,183 @@
/**
* Real Claude CLI Integration Tests
*
* IMPORTANT: These tests call the real Claude CLI and incur API costs.
* They are SKIPPED by default and should only be run manually for validation.
*
* To run these tests:
* ```bash
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-claude.test.ts --test-timeout=120000
* ```
*
* Purpose:
* - Validate that JSON schemas work correctly with Claude CLI --json-schema flag
* - Confirm MockAgentManager accurately simulates real CLI behavior
* - Document actual response structure and costs
*
* Updated (2026-02-06): Now uses the universal agentSignalSchema instead of
* per-mode schemas. Agents output trivial signals (done/questions/error) and
* write files instead of producing mode-specific JSON.
*
* Total validation cost: ~$0.10 (3 tests)
*/
import { describe, it, expect, beforeAll } from 'vitest';
import { execa } from 'execa';
import {
agentSignalJsonSchema,
agentSignalSchema,
} from '../../agent/schema.js';
/**
 * Result structure from Claude CLI with --output-format json
 *
 * When --json-schema is used:
 * - result: "" (empty string)
 * - structured_output: { ... } (the validated JSON object)
 */
interface ClaudeCliResult {
  type: 'result';
  /** 'success' on normal completion; error variants otherwise. */
  subtype: 'success' | 'error' | 'error_max_turns';
  is_error: boolean;
  /** Session identifier (reusable with `claude --resume`). */
  session_id: string;
  /** Free-text result; empty string when --json-schema is in effect. */
  result: string;
  /** Schema-validated JSON object; present only with --json-schema. */
  structured_output?: unknown;
  /** Total API cost of the run in USD, when reported by the CLI. */
  total_cost_usd?: number;
}
/**
 * Helper to call Claude CLI directly with a prompt and JSON schema.
 *
 * @param prompt - The prompt to send to Claude
 * @param jsonSchema - JSON schema to enforce structured output
 * @param timeoutMs - Timeout in milliseconds (default 90s)
 * @returns Parsed CLI result with structured_output
 */
async function callClaudeCli(
  prompt: string,
  jsonSchema: object,
  timeoutMs = 90000
): Promise<{ cliResult: ClaudeCliResult; structuredOutput: unknown }> {
  const startedAt = Date.now();
  const cliArgs = [
    '-p',
    prompt,
    '--output-format',
    'json',
    '--json-schema',
    JSON.stringify(jsonSchema),
  ];
  const { stdout } = await execa('claude', cliArgs, { timeout: timeoutMs });
  const elapsed = Date.now() - startedAt;
  const cliResult: ClaudeCliResult = JSON.parse(stdout);
  console.log(`\n Duration: ${(elapsed / 1000).toFixed(1)}s`);
  console.log(` Cost: $${cliResult.total_cost_usd?.toFixed(4) ?? 'N/A'}`);
  console.log(` Session ID: ${cliResult.session_id}`);
  console.log(` Result field empty: ${cliResult.result === ''}`);
  console.log(` Has structured_output: ${cliResult.structured_output !== undefined}`);
  // When --json-schema is used, structured output is in structured_output field
  // The result field is typically empty when using --json-schema
  const structuredOutput = cliResult.structured_output ?? JSON.parse(cliResult.result);
  return { cliResult, structuredOutput };
}
/**
 * Check if real Claude tests should run.
 * Set REAL_CLAUDE_TESTS=1 environment variable to enable.
 */
const shouldRunRealTests = process.env.REAL_CLAUDE_TESTS === '1';
/**
 * Skip wrapper - tests are expensive and should run manually.
 * Resolves to `describe` when opted in, `describe.skip` otherwise, so the
 * suite still appears (as skipped) in normal test runs.
 */
const describeReal = shouldRunRealTests ? describe : describe.skip;
// Individual test timeout - real API calls take 5-30 seconds
const TEST_TIMEOUT = 120000; // 2 minutes
// Runs only when REAL_CLAUDE_TESTS=1 (see describeReal above). Each test makes
// one paid API call and validates the universal agent signal schema end-to-end.
describeReal('Real Claude CLI Integration', () => {
  beforeAll(() => {
    console.log('\n=== Running Real Claude CLI Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
  });
  describe('Universal Signal Schema', () => {
    // Happy path: a trivial task should yield status "done".
    it(
      'should return done status',
      async () => {
        const prompt = `Complete this simple task: Say "Hello, World!" as a test.
Output your response in the required JSON format with status "done".`;
        const { cliResult, structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);
        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));
        // Verify the CLI response structure
        expect(cliResult.subtype).toBe('success');
        expect(cliResult.result).toBe(''); // Empty when using --json-schema
        expect(cliResult.structured_output).toBeDefined();
        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('done');
      },
      TEST_TIMEOUT
    );
    // A deliberately vague task should force the "questions" signal variant.
    it(
      'should return questions status with array',
      async () => {
        const prompt = `You are working on a vague task: "Make it better"
You MUST ask clarifying questions before proceeding. You cannot complete this task without more information.
Output your response with status "questions" and include at least 2 questions with unique IDs.`;
        const { structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);
        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));
        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('questions');
        // Narrowing on status gives typed access to the questions array.
        if (parsed.status === 'questions') {
          expect(Array.isArray(parsed.questions)).toBe(true);
          expect(parsed.questions.length).toBeGreaterThanOrEqual(1);
          expect(parsed.questions[0].id).toBeTruthy();
          expect(parsed.questions[0].question).toBeTruthy();
        }
      },
      TEST_TIMEOUT
    );
    // Error path: an unrecoverable situation should yield the "error" variant.
    it(
      'should return error status',
      async () => {
        const prompt = `You have encountered an unrecoverable error. Output your response with status "error" and a descriptive error message.`;
        const { structuredOutput } = await callClaudeCli(prompt, agentSignalJsonSchema);
        console.log(' Output:', JSON.stringify(structuredOutput, null, 2));
        // Validate against Zod schema
        const parsed = agentSignalSchema.parse(structuredOutput);
        expect(parsed.status).toBe('error');
        if (parsed.status === 'error') {
          expect(parsed.error).toBeTruthy();
        }
      },
      TEST_TIMEOUT
    );
  });
});

View File

@@ -0,0 +1,298 @@
/**
* Real Claude CLI Manager Integration Tests
*
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
* They are SKIPPED by default and should only be run manually for validation.
*
* To run these tests:
* ```bash
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/claude-manager.test.ts --test-timeout=300000
* ```
*
* Tests covered:
* - Output stream parsing (text_delta events)
* - Session ID extraction from init event
* - Result parsing and validation
* - Session resume with user answers
*
* Estimated cost: ~$0.10 per full run
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
createRealProviderHarness,
describeRealClaude,
REAL_TEST_TIMEOUT,
sleep,
type RealProviderHarness,
} from './harness.js';
import { MINIMAL_PROMPTS } from './prompts.js';
import type { AgentSpawnedEvent, AgentStoppedEvent, AgentOutputEvent } from '../../../events/types.js';
// Exercises the real agent manager against the live Claude CLI: output stream
// parsing, session ID extraction, questions flow, resume, and error handling.
// Gated behind REAL_CLAUDE_TESTS=1 via describeRealClaude.
describeRealClaude('Real Claude Manager Integration', () => {
  let harness: RealProviderHarness;
  beforeAll(async () => {
    console.log('\n=== Running Real Claude Manager Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
    harness = await createRealProviderHarness({ provider: 'claude' });
  });
  afterAll(async () => {
    await harness.cleanup();
  });
  beforeEach(() => {
    // Each test asserts on captured events, so start from a clean buffer.
    harness.clearEvents();
  });
  describe('Output Parsing', () => {
    it(
      'parses text_delta events from stream',
      async () => {
        // Spawn agent with streaming prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.streaming,
          mode: 'execute',
          provider: 'claude',
        });
        expect(agent.id).toBeTruthy();
        expect(agent.status).toBe('running');
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify we got output events
        const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
        console.log(' Output events:', outputEvents.length);
        // Verify completion
        expect(result).toBeTruthy();
        console.log(' Result:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'parses init event and extracts session ID',
      async () => {
        // Spawn agent with simple done prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.done,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion
        await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify session ID was extracted and persisted
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.sessionId).toBeTruthy();
        // Claude session IDs are lowercase hex with dashes (UUID-like).
        expect(dbAgent?.sessionId).toMatch(/^[a-f0-9-]+$/);
        console.log(' Session ID:', dbAgent?.sessionId);
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'parses result event with completion',
      async () => {
        // Spawn agent with simple done prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.done,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify result was parsed
        expect(result).toBeTruthy();
        expect(result?.success).toBe(true);
        expect(result?.message).toBeTruthy();
        // Verify events: exactly one spawned + one stopped event for this agent.
        const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
        expect(spawnedEvents.length).toBe(1);
        expect(spawnedEvents[0].payload.agentId).toBe(agent.id);
        expect(spawnedEvents[0].payload.provider).toBe('claude');
        const stoppedEvents = harness.getEventsByType<AgentStoppedEvent>('agent:stopped');
        expect(stoppedEvents.length).toBe(1);
        expect(stoppedEvents[0].payload.agentId).toBe(agent.id);
        expect(stoppedEvents[0].payload.reason).toBe('task_complete');
        console.log(' Result message:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
  });
  describe('Questions Flow', () => {
    it(
      'parses questions status and enters waiting_for_input',
      async () => {
        // Spawn agent with questions prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.questions,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for waiting_for_input status
        const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
        // Verify questions were parsed
        expect(questions).toBeTruthy();
        expect(questions?.questions).toBeTruthy();
        expect(questions?.questions.length).toBeGreaterThan(0);
        expect(questions?.questions[0].id).toBeTruthy();
        expect(questions?.questions[0].question).toBeTruthy();
        // Verify agent status persisted alongside the session ID.
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('waiting_for_input');
        expect(dbAgent?.sessionId).toBeTruthy();
        console.log(' Questions:', questions?.questions.length);
        console.log(' First question:', questions?.questions[0].question);
      },
      REAL_TEST_TIMEOUT
    );
  });
  describe('Session Resume', () => {
    it(
      'resumes session with user answers',
      async () => {
        // 1. Spawn agent that asks questions
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.questions,
          mode: 'execute',
          provider: 'claude',
        });
        // 2. Wait for waiting_for_input
        const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
        expect(questions?.questions.length).toBeGreaterThan(0);
        const sessionIdBeforeResume = (await harness.agentRepository.findById(agent.id))?.sessionId;
        console.log(' Session ID before resume:', sessionIdBeforeResume);
        console.log(' Questions received:', questions?.questions.map((q) => q.id).join(', '));
        harness.clearEvents();
        // 3. Resume with answer
        const answers: Record<string, string> = {};
        for (const q of questions?.questions ?? []) {
          answers[q.id] = `Answer to ${q.id}`;
        }
        await harness.agentManager.resume(agent.id, answers);
        // 4. Poll (up to ~60s) until the agent leaves 'running'.
        let attempts = 0;
        // NOTE(review): finalStatus is assigned below but never read afterwards —
        // the final assertions re-query the repository instead. Candidate for removal.
        let finalStatus = 'running';
        while (attempts < 60) {
          const agent2 = await harness.agentRepository.findById(agent.id);
          if (agent2?.status !== 'running') {
            finalStatus = agent2?.status ?? 'unknown';
            break;
          }
          await sleep(1000);
          attempts++;
        }
        // Verify the agent processed the resume (either completed or asked more questions)
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Final status:', dbAgent?.status);
        // Agent should not still be running
        expect(['idle', 'waiting_for_input', 'crashed']).toContain(dbAgent?.status);
        // If idle, verify result
        if (dbAgent?.status === 'idle') {
          const result = await harness.agentManager.getResult(agent.id);
          console.log(' Result:', result?.message);
          expect(result).toBeTruthy();
        }
      },
      REAL_TEST_TIMEOUT * 2 // Double timeout for two-step process
    );
    it(
      'maintains session continuity across resume',
      async () => {
        // 1. Spawn agent that asks questions
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.questions,
          mode: 'execute',
          provider: 'claude',
        });
        // 2. Wait for waiting_for_input
        const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
        expect(questions?.questions.length).toBeGreaterThan(0);
        const sessionIdBefore = (await harness.agentRepository.findById(agent.id))?.sessionId;
        expect(sessionIdBefore).toBeTruthy();
        // 3. Resume with answer
        const answers: Record<string, string> = {};
        for (const q of questions?.questions ?? []) {
          answers[q.id] = `Answer to ${q.id}`;
        }
        await harness.agentManager.resume(agent.id, answers);
        // 4. Wait for completion
        await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify session ID exists (may be same or new depending on CLI behavior)
        const sessionIdAfter = (await harness.agentRepository.findById(agent.id))?.sessionId;
        expect(sessionIdAfter).toBeTruthy();
        console.log(' Session ID before:', sessionIdBefore);
        console.log(' Session ID after:', sessionIdAfter);
      },
      REAL_TEST_TIMEOUT * 2
    );
  });
  describe('Error Handling', () => {
    it(
      'handles error status',
      async () => {
        // Spawn agent with error prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.error,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion (will be crashed)
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify error was handled: agent marked crashed, failure surfaced in result.
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('crashed');
        expect(result?.success).toBe(false);
        expect(result?.message).toContain('Test error');
        console.log(' Error message:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
  });
});

View File

@@ -0,0 +1,172 @@
/**
* Real Codex CLI Manager Integration Tests
*
* IMPORTANT: These tests call the REAL Codex CLI and incur API costs!
* They are SKIPPED by default and should only be run manually for validation.
*
* To run these tests:
* ```bash
* REAL_CODEX_TESTS=1 npm test -- src/test/integration/real-providers/codex-manager.test.ts --test-timeout=300000
* ```
*
* Tests covered:
* - Codex spawn and thread_id extraction
* - Generic output parsing (non-schema)
* - Streaming output
*
* Estimated cost: ~$0.10 per full run
*
* Note: Codex uses different output format and session ID field (thread_id).
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
createRealProviderHarness,
describeRealCodex,
REAL_TEST_TIMEOUT,
type RealProviderHarness,
} from './harness.js';
import { CODEX_PROMPTS } from './prompts.js';
import type { AgentSpawnedEvent, AgentOutputEvent } from '../../../events/types.js';
// Exercises the real agent manager against the live Codex CLI: spawn,
// thread_id extraction, and generic (non-schema) output parsing.
// Gated behind REAL_CODEX_TESTS=1 via describeRealCodex.
describeRealCodex('Real Codex Manager Integration', () => {
  let harness: RealProviderHarness;
  beforeAll(async () => {
    console.log('\n=== Running Real Codex Manager Tests ===');
    console.log('These tests call the real Codex API and incur costs.\n');
    harness = await createRealProviderHarness({ provider: 'codex' });
  });
  afterAll(async () => {
    await harness.cleanup();
  });
  beforeEach(() => {
    // Each test asserts on captured events, so start from a clean buffer.
    harness.clearEvents();
  });
  describe('Codex Spawn', () => {
    it(
      'spawns codex agent and extracts thread_id',
      async () => {
        // Spawn agent with simple task
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.done,
          mode: 'execute',
          provider: 'codex',
        });
        expect(agent.id).toBeTruthy();
        expect(agent.provider).toBe('codex');
        expect(agent.status).toBe('running');
        // Verify spawned event
        const spawnedEvents = harness.getEventsByType<AgentSpawnedEvent>('agent:spawned');
        expect(spawnedEvents.length).toBe(1);
        expect(spawnedEvents[0].payload.provider).toBe('codex');
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify session ID (thread_id) was extracted
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Thread ID:', dbAgent?.sessionId);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message);
        // Codex should complete or crash
        expect(['idle', 'crashed']).toContain(dbAgent?.status);
        // If completed successfully, should have extracted thread_id
        if (dbAgent?.status === 'idle' && dbAgent?.sessionId) {
          expect(dbAgent.sessionId).toBeTruthy();
        }
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'uses generic parser for output',
      async () => {
        // Spawn agent with streaming prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: CODEX_PROMPTS.streaming,
          mode: 'execute',
          provider: 'codex',
        });
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify output events were captured
        const outputEvents = harness.getEventsByType<AgentOutputEvent>('agent:output');
        console.log(' Output events:', outputEvents.length);
        // For generic provider, result should be captured
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message?.substring(0, 100) + '...');
        expect(['idle', 'crashed']).toContain(dbAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });
  describe('Codex Provider Config', () => {
    it(
      'uses correct command and args for codex',
      async () => {
        // This is more of a config verification test
        // The actual command execution is validated by the spawn test
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: 'Say hello',
          mode: 'execute',
          provider: 'codex',
        });
        // Verify agent was created with codex provider
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.provider).toBe('codex');
        // Wait for completion (or timeout)
        try {
          await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        } catch {
          // Codex might fail if not installed, that's OK for config test
        }
        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Provider:', finalAgent?.provider);
        console.log(' Status:', finalAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });
});
/**
* Codex-specific observations from testing:
*
* 1. Output Format:
* - Codex uses JSONL streaming with different event types
* - thread.started event contains thread_id
* - Output parsing is more generic (not JSON schema validated)
*
* 2. Command Structure:
* - codex exec --full-auto --json -p "prompt"
* - resume: codex exec resume <thread_id>
*
* 3. Session ID:
* - Called "thread_id" in Codex
* - Extracted from thread.started event
*
* 4. Resume:
* - Uses subcommand style: codex exec resume <thread_id>
* - Different from Claude's flag style: claude --resume <session_id>
*/

View File

@@ -0,0 +1,540 @@
/**
* Real Claude Inter-Agent Conversation Integration Tests
*
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
* They are SKIPPED by default and should only be run manually for validation.
*
* To run:
* ```bash
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/conversation.test.ts --test-timeout=300000
* ```
*
* Architecture:
* - Mock conversation server (only cw listen/ask/answer endpoints, no full CoordinationServer)
* - In-memory ConversationRepository (no SQLite, no FK constraints)
* - Real agent harness for spawning two Claude sessions with actual coding tasks
* - Two sequential questions prove the listen→answer→re-listen cycle works
*
* Estimated cost: ~$0.30 per full run (two Claude sessions)
*/
import { it, expect, beforeAll, afterAll } from 'vitest';
import { createServer } from 'node:http';
import type { Server } from 'node:http';
import { readFileSync, existsSync } from 'node:fs';
import { join } from 'node:path';
import { nanoid } from 'nanoid';
import { fetchRequestHandler } from '@trpc/server/adapters/fetch';
import { router, publicProcedure } from '../../../trpc/trpc.js';
import { conversationProcedures } from '../../../trpc/routers/conversation.js';
import { EventEmitterBus } from '../../../events/bus.js';
import type { ConversationRepository, CreateConversationData } from '../../../db/repositories/conversation-repository.js';
import type { Conversation } from '../../../db/schema.js';
import {
createRealProviderHarness,
describeRealClaude,
sleep,
type RealProviderHarness,
} from './harness.js';
const TEST_TIMEOUT = 300000; // 5 minutes — agents do real coding + conversation
// ---------------------------------------------------------------------------
// In-memory ConversationRepository — no SQLite, no FK constraints
// ---------------------------------------------------------------------------
/**
 * Map-backed ConversationRepository for tests: no SQLite, no FK constraints.
 * Conversations start 'pending' and flip to 'answered' via answer().
 */
class InMemoryConversationRepository implements ConversationRepository {
  /** All conversations, keyed by id. */
  private store = new Map<string, Conversation>();

  /** Insert a new pending conversation and return it. */
  async create(data: CreateConversationData): Promise<Conversation> {
    const timestamp = new Date();
    const record: Conversation = {
      id: nanoid(),
      fromAgentId: data.fromAgentId,
      toAgentId: data.toAgentId,
      initiativeId: data.initiativeId ?? null,
      phaseId: data.phaseId ?? null,
      taskId: data.taskId ?? null,
      question: data.question,
      answer: null,
      status: 'pending',
      createdAt: timestamp,
      updatedAt: timestamp,
    };
    this.store.set(record.id, record);
    return record;
  }

  /** Look up a conversation by id; null when absent. */
  async findById(id: string): Promise<Conversation | null> {
    const found = this.store.get(id);
    return found === undefined ? null : found;
  }

  /** Pending conversations addressed to the given agent, oldest first. */
  async findPendingForAgent(toAgentId: string): Promise<Conversation[]> {
    const pending: Conversation[] = [];
    for (const conv of this.store.values()) {
      if (conv.toAgentId === toAgentId && conv.status === 'pending') {
        pending.push(conv);
      }
    }
    pending.sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime());
    return pending;
  }

  /** Record an answer, mark the row 'answered', and return the updated row. */
  async answer(id: string, answer: string): Promise<Conversation | null> {
    const existing = this.store.get(id);
    if (existing === undefined) return null;
    const answered: Conversation = {
      ...existing,
      answer,
      status: 'answered' as const,
      updatedAt: new Date(),
    };
    this.store.set(id, answered);
    return answered;
  }

  /** Test helper — return all conversations */
  getAll(): Conversation[] {
    return Array.from(this.store.values());
  }
}
// ---------------------------------------------------------------------------
// Mock conversation server — serves ONLY conversation tRPC procedures
// ---------------------------------------------------------------------------
/**
 * Start a minimal HTTP server that serves ONLY the conversation tRPC
 * procedures (ask/listen/answer), backed by the in-memory repository.
 *
 * Agents under test point their `cw` CLI at this server via CW_PORT, so the
 * test can observe conversations without running a full CoordinationServer.
 *
 * @returns the node Server, the port it is actually listening on, and the
 *          repository for post-test inspection.
 */
async function startMockConversationServer(): Promise<{
  server: Server;
  port: number;
  repo: InMemoryConversationRepository;
}> {
  const repo = new InMemoryConversationRepository();
  const eventBus = new EventEmitterBus();
  // Mini router with only conversation procedures
  const miniRouter = router({
    ...conversationProcedures(publicProcedure),
  });
  const httpServer = createServer(async (req, res) => {
    if (!req.url?.startsWith('/trpc')) {
      res.writeHead(404);
      res.end('Not found');
      return;
    }
    const host = req.headers.host ?? 'localhost';
    const url = new URL(req.url, `http://${host}`);
    // Buffer the request body for methods that may carry one.
    let body: string | undefined;
    if (req.method !== 'GET' && req.method !== 'HEAD') {
      body = await new Promise<string>((resolve) => {
        let data = '';
        req.on('data', (chunk: Buffer) => {
          data += chunk.toString();
        });
        req.on('end', () => resolve(data));
      });
    }
    // Convert node IncomingMessage headers into fetch Headers.
    const headers = new Headers();
    for (const [key, value] of Object.entries(req.headers)) {
      if (value) {
        if (Array.isArray(value)) {
          value.forEach((v) => headers.append(key, v));
        } else {
          headers.set(key, value);
        }
      }
    }
    const fetchRequest = new Request(url.toString(), {
      method: req.method,
      headers,
      body: body ?? undefined,
    });
    const fetchResponse = await fetchRequestHandler({
      endpoint: '/trpc',
      req: fetchRequest,
      router: miniRouter,
      createContext: () =>
        ({
          eventBus,
          serverStartedAt: new Date(),
          processCount: 0,
          conversationRepository: repo,
          // Stub — requireAgentManager is called unconditionally in createConversation,
          // but list() is only invoked for taskId/phaseId resolution. With --agent-id
          // targeting, list() is never called.
          agentManager: { list: async () => [] },
        }) as any,
    });
    // Pipe the fetch Response back onto the node response.
    res.statusCode = fetchResponse.status;
    fetchResponse.headers.forEach((value, key) => {
      res.setHeader(key, value);
    });
    if (fetchResponse.body) {
      const reader = fetchResponse.body.getReader();
      const pump = async () => {
        while (true) {
          const { done, value } = await reader.read();
          if (done) {
            res.end();
            return;
          }
          res.write(value);
        }
      };
      pump().catch(() => res.end());
    } else {
      res.end(await fetchResponse.text());
    }
  });
  // Listen on an OS-assigned ephemeral port (0) instead of a random port in
  // [40000, 50000): the random pick could collide with a port already in use,
  // which raised an unhandled 'error' event and left the listen promise
  // pending forever. Rejecting on 'error' surfaces any failure to the test.
  await new Promise<void>((resolve, reject) => {
    httpServer.once('error', reject);
    httpServer.listen(0, '127.0.0.1', () => resolve());
  });
  const address = httpServer.address();
  if (address === null || typeof address === 'string') {
    throw new Error('Mock conversation server did not report a TCP address');
  }
  return { server: httpServer, port: address.port, repo };
}
// ---------------------------------------------------------------------------
// Diagnostic helpers
// ---------------------------------------------------------------------------
/**
 * Print a condensed diagnostic view of an agent's log directory: the tail of
 * output.jsonl (assistant text, tool calls, result events) followed by any
 * stderr output. Logs a short note and returns when the directory is absent.
 */
function dumpAgentLogs(workspaceRoot: string, agentName: string) {
  const logDir = join(workspaceRoot, '.cw', 'agent-logs', agentName);
  if (!existsSync(logDir)) {
    console.log(` [${agentName}] No log directory at ${logDir}`);
    return;
  }
  // Tail of the structured event log (last 30 lines).
  const outputPath = join(logDir, 'output.jsonl');
  if (existsSync(outputPath)) {
    const allLines = readFileSync(outputPath, 'utf-8').trim().split('\n');
    const tail = allLines.slice(-30);
    console.log(` [${agentName}] output.jsonl (last ${tail.length}/${allLines.length} lines):`);
    for (const rawLine of tail) {
      try {
        const event = JSON.parse(rawLine);
        if (event.type === 'assistant' && event.message?.content) {
          for (const part of event.message.content) {
            if (part.type === 'text') {
              console.log(` TEXT: ${part.text.substring(0, 200)}`);
            } else if (part.type === 'tool_use') {
              console.log(` TOOL: ${part.name} ${JSON.stringify(part.input).substring(0, 150)}`);
            }
          }
        } else if (event.type === 'result') {
          console.log(` RESULT: ${JSON.stringify(event).substring(0, 300)}`);
        }
      } catch {
        // Non-JSON (or malformed) line — print it raw, truncated.
        console.log(` RAW: ${rawLine.substring(0, 200)}`);
      }
    }
  }
  // Dump stderr, if any was captured.
  const stderrPath = join(logDir, 'stderr.log');
  if (existsSync(stderrPath)) {
    const stderr = readFileSync(stderrPath, 'utf-8').trim();
    if (stderr) {
      console.log(` [${agentName}] stderr: ${stderr.substring(0, 500)}`);
    }
  }
}
// ---------------------------------------------------------------------------
// Test suite
// ---------------------------------------------------------------------------
describeRealClaude('Real Inter-Agent Conversation (mock server)', () => {
  let harness: RealProviderHarness;
  let mockServer: Server;
  let mockPort: number;
  let mockRepo: InMemoryConversationRepository;
  // Saved so afterAll can restore the pre-suite environment for later suites.
  const originalCwPort = process.env.CW_PORT;

  beforeAll(async () => {
    console.log('\n=== Real Inter-Agent Conversation Test ===');
    console.log('Mock conversation server + two Claude sessions.\n');
    // Start mock conversation server (only listen/ask/answer endpoints)
    const mock = await startMockConversationServer();
    mockServer = mock.server;
    mockPort = mock.port;
    mockRepo = mock.repo;
    console.log(` Mock server on port ${mockPort}`);
    // Set CW_PORT so agents' cw commands hit the mock server
    process.env.CW_PORT = String(mockPort);
    // Real agent harness for spawning + worktrees (no full CoordinationServer)
    harness = await createRealProviderHarness({ provider: 'claude' });
    console.log(` Workspace: ${harness.workspaceRoot}`);
  });

  afterAll(async () => {
    // Restore CW_PORT to its original value, or remove it if it was unset.
    if (originalCwPort) {
      process.env.CW_PORT = originalCwPort;
    } else {
      delete process.env.CW_PORT;
    }
    await harness?.cleanup();
    mockServer?.close();
  });

  it(
    'two agents with real tasks communicate via cw ask/listen/answer (two questions prove re-listen)',
    async () => {
      const agentSuffix = nanoid(6); // unique suffix for temp files
      // ---------------------------------------------------------------
      // Agent A — builds a validator module WHILE answering questions
      // in the background via cw listen
      // ---------------------------------------------------------------
      const agentA = await harness.agentManager.spawn({
        taskId: null,
        prompt: `You are Agent A in a multi-agent coordination test.
You have TWO concurrent responsibilities:
1. Build a TypeScript validator module (your main coding task)
2. Answer questions from other agents via a background listener
SETUP (do this first):
- Read .cw/input/manifest.json to get your agentId
- Start a background listener that writes to a temp file:
cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
LISTEN_PID=$!
MAIN CODING TASK — implement a user registration validator:
1. Create types.ts:
export interface RegistrationInput { name: string; email: string; password: string; }
export interface ValidationResult { valid: boolean; errors: string[]; }
2. Create validator.ts:
Import from types.ts. Export function validateRegistration(input: RegistrationInput): ValidationResult
Rules: name min 2 chars, email must have exactly one @ and domain with a dot and no spaces and max 254 chars, password min 8 chars.
3. Create index.ts that re-exports everything from types.ts and validator.ts.
BETWEEN EACH FILE, check for incoming questions:
if [ -s /tmp/cw-listen-${agentSuffix}.txt ]; then
# parse the JSON, get conversationId and question
# answer: cw answer "<answer based on your code>" --conversation-id <id>
# clear and restart listener:
> /tmp/cw-listen-${agentSuffix}.txt
cw listen --agent-id <YOUR_AGENT_ID> --timeout 120000 > /tmp/cw-listen-${agentSuffix}.txt 2>&1 &
LISTEN_PID=$!
fi
You will receive TWO questions total while you work. Answer them based on the code you are writing.
CLEANUP: After all 3 files are written and both questions answered:
- kill $LISTEN_PID 2>/dev/null
- Write .cw/output/signal.json: {"status":"done","result":"validator module complete, answered 2 questions"}
CRITICAL:
- The listener MUST run in the background while you write code.
- Check for questions between files, not as blocking waits.
- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });
      console.log(` Agent A: ${agentA.id} (${agentA.name})`);
      // Give Agent A time to start its background listener and begin coding
      await sleep(15000);
      // ---------------------------------------------------------------
      // Agent B — builds a client module, asks Agent A questions to
      // learn the validation rules, then uses answers in its code
      // ---------------------------------------------------------------
      const agentB = await harness.agentManager.spawn({
        taskId: null,
        prompt: `You are Agent B in a multi-agent coordination test.
Read .cw/input/manifest.json to get your agentId. Agent A (ID: ${agentA.id}) is building a validator module.
YOUR CODING TASK — build a registration API client that includes client-side validation matching Agent A's server-side rules:
1. Create client-scaffold.ts with a basic RegistrationClient class that has a register(name, email, password) method that returns Promise<{ok: boolean}>.
Leave a TODO comment where validation will go.
2. NOW ask Agent A what the validation rules are — you need this to write proper client-side checks:
FIELDS=$(cw ask "What are the required fields and their types for registration?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
3. Ask Agent A about the specific email validation rules:
EMAIL_RULES=$(cw ask "What are the exact email validation rules you implemented?" --from <YOUR_AGENT_ID> --agent-id ${agentA.id} --timeout 120000)
4. Create validated-client.ts — a COMPLETE implementation using the answers:
Import the scaffold, add a validateBeforeSubmit(name, email, password) function
that implements the EXACT validation rules Agent A told you about.
Include a comment at the top with the rules you received.
5. Write .cw/output/signal.json: {"status":"done","result":"client module complete with validation from agent A"}
CRITICAL:
- Create client-scaffold.ts BEFORE asking questions (you have independent work to do first).
- Use the ACTUAL answers from Agent A in your validated-client.ts implementation.
- The CW_PORT environment variable is already set to ${mockPort}.`,
        mode: 'execute',
        provider: 'claude',
        inputContext: {},
      });
      console.log(` Agent B: ${agentB.id} (${agentB.name})`);
      // ---------------------------------------------------------------
      // Wait for both agents to stop running, then verify conversations
      // ---------------------------------------------------------------
      const deadline = Date.now() + TEST_TIMEOUT;
      let aDone = false;
      let bDone = false;
      let lastLogTime = 0;
      while (Date.now() < deadline && (!aDone || !bDone)) {
        const agentAInfo = await harness.agentRepository.findById(agentA.id);
        const agentBInfo = await harness.agentRepository.findById(agentB.id);
        // Periodic progress logging every 30s
        if (Date.now() - lastLogTime > 30000) {
          const elapsed = Math.round((Date.now() - (deadline - TEST_TIMEOUT)) / 1000);
          console.log(` [${elapsed}s] A=${agentAInfo?.status ?? '?'} B=${agentBInfo?.status ?? '?'} convs=${mockRepo.getAll().length}`);
          lastLogTime = Date.now();
        }
        // Any non-'running' status counts as done here; the specific final
        // status is not asserted — only the conversation/file outcomes below.
        if (agentAInfo && agentAInfo.status !== 'running' && !aDone) {
          aDone = true;
          console.log(` Agent A final status: ${agentAInfo.status}`);
          dumpAgentLogs(harness.workspaceRoot, agentA.name);
        }
        if (agentBInfo && agentBInfo.status !== 'running' && !bDone) {
          bDone = true;
          console.log(` Agent B final status: ${agentBInfo.status}`);
          dumpAgentLogs(harness.workspaceRoot, agentB.name);
        }
        if (!aDone || !bDone) await sleep(2000);
      }
      expect(aDone).toBe(true);
      expect(bDone).toBe(true);
      // ---------------------------------------------------------------
      // Verify conversations in mock repo
      // ---------------------------------------------------------------
      const allConversations = mockRepo.getAll();
      console.log(` Total conversations: ${allConversations.length}`);
      for (const c of allConversations) {
        console.log(
          ` ${c.id}: ${c.status} — Q: "${c.question}" A: "${c.answer?.substring(0, 80)}..."`,
        );
      }
      // Exactly 2 conversations, both answered
      expect(allConversations.length).toBe(2);
      expect(allConversations.every((c) => c.status === 'answered')).toBe(true);
      // Both target Agent A, both from Agent B
      expect(allConversations.every((c) => c.toAgentId === agentA.id)).toBe(true);
      expect(allConversations.every((c) => c.fromAgentId === agentB.id)).toBe(true);
      // Questions should be distinct (one about fields, one about email validation)
      const questions = allConversations.map((c) => c.question);
      expect(questions.some((q) => q.toLowerCase().includes('field'))).toBe(true);
      expect(questions.some((q) => q.toLowerCase().includes('email'))).toBe(true);
      // Both answers should be non-empty
      expect(allConversations.every((c) => c.answer && c.answer.length > 0)).toBe(true);
      // ---------------------------------------------------------------
      // Verify Agent A's coding output — validator module files exist
      // ---------------------------------------------------------------
      const aWorkdir = join(
        harness.workspaceRoot,
        'agent-workdirs',
        agentA.name,
        'workspace',
      );
      const aFiles = ['types.ts', 'validator.ts', 'index.ts'];
      for (const f of aFiles) {
        const filePath = join(aWorkdir, f);
        const exists = existsSync(filePath);
        console.log(` Agent A file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
        expect(exists).toBe(true);
      }
      // validator.ts should contain actual validation logic
      const validatorContent = readFileSync(join(aWorkdir, 'validator.ts'), 'utf-8');
      console.log(` Agent A validator.ts (${validatorContent.length} chars): ${validatorContent.substring(0, 120)}...`);
      expect(validatorContent.toLowerCase()).toContain('email');
      expect(validatorContent.toLowerCase()).toContain('password');
      // ---------------------------------------------------------------
      // Verify Agent B's coding output — client module files exist
      // ---------------------------------------------------------------
      const bWorkdir = join(
        harness.workspaceRoot,
        'agent-workdirs',
        agentB.name,
        'workspace',
      );
      const bFiles = ['client-scaffold.ts', 'validated-client.ts'];
      for (const f of bFiles) {
        const filePath = join(bWorkdir, f);
        const exists = existsSync(filePath);
        console.log(` Agent B file ${f}: ${exists ? 'EXISTS' : 'MISSING'}`);
        expect(exists).toBe(true);
      }
      // validated-client.ts should reference validation rules from Agent A's answers
      const clientContent = readFileSync(join(bWorkdir, 'validated-client.ts'), 'utf-8');
      console.log(` Agent B validated-client.ts (${clientContent.length} chars): ${clientContent.substring(0, 120)}...`);
      expect(clientContent.toLowerCase()).toContain('email');
      // ---------------------------------------------------------------
      // Verify interleaving: Agent A's JSONL log has coding tool calls
      // (Write for .ts files) interleaved with conversation tool calls
      // (Bash for cw listen/answer)
      // ---------------------------------------------------------------
      const aLogPath = join(harness.workspaceRoot, '.cw', 'agent-logs', agentA.name, 'output.jsonl');
      const aLog = readFileSync(aLogPath, 'utf-8').trim().split('\n');
      const toolCalls: { type: 'code' | 'conversation'; name: string; detail: string }[] = [];
      for (const line of aLog) {
        try {
          const ev = JSON.parse(line);
          if (ev.type !== 'assistant' || !ev.message?.content) continue;
          for (const block of ev.message.content) {
            if (block.type !== 'tool_use') continue;
            const input = typeof block.input === 'string' ? block.input : JSON.stringify(block.input);
            // Classify each tool call as coding work or conversation work.
            if (block.name === 'Write' && input.includes('.ts')) {
              toolCalls.push({ type: 'code', name: 'Write', detail: input.substring(0, 80) });
            } else if (block.name === 'Bash' && (input.includes('cw listen') || input.includes('cw answer'))) {
              toolCalls.push({ type: 'conversation', name: 'Bash', detail: input.substring(0, 80) });
            }
          }
        } catch { /* skip non-JSON lines */ }
      }
      console.log(` Agent A interleaving (${toolCalls.length} relevant tool calls):`);
      for (const tc of toolCalls) {
        console.log(` [${tc.type}] ${tc.name}: ${tc.detail}`);
      }
      // Must have both code and conversation tool calls
      const hasCode = toolCalls.some((tc) => tc.type === 'code');
      const hasConversation = toolCalls.some((tc) => tc.type === 'conversation');
      expect(hasCode).toBe(true);
      expect(hasConversation).toBe(true);
      // Verify interleaving: at least one code call must appear AFTER a conversation call
      // (proving coding continued after handling a question)
      const firstConvIdx = toolCalls.findIndex((tc) => tc.type === 'conversation');
      // Reverse-scan for the last 'code' entry, then map back to a forward index.
      const lastCodeIdx = toolCalls.length - 1 - [...toolCalls].reverse().findIndex((tc) => tc.type === 'code');
      console.log(` First conversation at index ${firstConvIdx}, last code at index ${lastCodeIdx}`);
      expect(lastCodeIdx).toBeGreaterThan(firstConvIdx);
    },
    TEST_TIMEOUT,
  );
});

View File

@@ -0,0 +1,265 @@
/**
* Crash Recovery Integration Tests
*
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
* They are SKIPPED by default and should only be run manually for validation.
*
* To run these tests:
* ```bash
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/crash-recovery.test.ts --test-timeout=300000
* ```
*
* Tests covered:
* - Server restart while agent is running
* - Resuming streaming after restart
* - Marking dead agents as crashed
* - Output file processing after restart
*
* Estimated cost: ~$0.08 per full run
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
createRealProviderHarness,
describeRealClaude,
REAL_TEST_TIMEOUT,
EXTENDED_TEST_TIMEOUT,
sleep,
type RealProviderHarness,
} from './harness.js';
import { MINIMAL_PROMPTS } from './prompts.js';
import { MultiProviderAgentManager } from '../../../agent/manager.js';
// NOTE(review): each test builds a second MultiProviderAgentManager over the
// same repositories to simulate a server process restart; reconcileAfterRestart
// is the behavior under test.
describeRealClaude('Crash Recovery', () => {
  let harness: RealProviderHarness;

  beforeAll(async () => {
    console.log('\n=== Running Crash Recovery Tests ===');
    console.log('These tests call the real Claude API and incur costs.\n');
    harness = await createRealProviderHarness({ provider: 'claude' });
  });

  afterAll(async () => {
    await harness.cleanup();
  });

  beforeEach(() => {
    // Each test inspects captured events; start from a clean slate.
    harness.clearEvents();
  });

  describe('Server Restart Simulation', () => {
    it(
      'resumes streaming for still-running agent after restart',
      async () => {
        // 1. Spawn agent with slow task
        console.log(' 1. Spawning agent with slow task...');
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.slow,
          mode: 'execute',
          provider: 'claude',
        });
        // 2. Wait for agent to be running
        await harness.waitForAgentStatus(agent.id, 'running', 10000);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.pid).toBeTruthy();
        expect(dbAgent?.outputFilePath).toBeTruthy();
        console.log(' 2. Agent running with PID:', dbAgent?.pid);
        // 3. Give the agent a moment to start writing output
        await sleep(2000);
        // 4. Simulate server crash - create NEW manager (old state lost)
        console.log(' 3. Simulating server restart with new manager...');
        harness.clearEvents(); // Clear events from old manager
        const newManager = new MultiProviderAgentManager(
          harness.agentRepository,
          harness.workspaceRoot,
          harness.projectRepository,
          harness.accountRepository,
          harness.eventBus
        );
        // 5. Reconcile - should pick up running agent
        console.log(' 4. Reconciling agent state...');
        await newManager.reconcileAfterRestart();
        // 6. Wait for completion via new manager
        console.log(' 5. Waiting for completion via new manager...');
        // Poll the repository directly (up to 60 * 2s = 120s) rather than the
        // harness helper, since the agent is now owned by newManager.
        let attempts = 0;
        let finalStatus = 'running';
        while (attempts < 60) {
          const refreshed = await harness.agentRepository.findById(agent.id);
          if (refreshed?.status !== 'running') {
            finalStatus = refreshed?.status ?? 'unknown';
            break;
          }
          await sleep(2000);
          attempts++;
        }
        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log(' 6. Final status:', finalAgent?.status);
        // Either completed successfully or crashed (both are valid outcomes)
        expect(['idle', 'crashed', 'stopped']).toContain(finalAgent?.status);
        if (finalAgent?.status === 'idle') {
          const result = await newManager.getResult(agent.id);
          console.log(' Result:', result?.message);
        }
      },
      EXTENDED_TEST_TIMEOUT
    );

    it(
      'marks dead agent as crashed during reconcile',
      async () => {
        // 1. Create a fake agent record with a dead PID
        console.log(' 1. Creating fake agent with dead PID...');
        const fakeAgent = await harness.agentRepository.create({
          name: 'dead-agent-test',
          taskId: null,
          initiativeId: null,
          sessionId: null,
          worktreeId: 'dead-worktree',
          status: 'running',
          mode: 'execute',
          provider: 'claude',
          accountId: null,
        });
        // Set a PID that's definitely dead (high number that won't exist)
        await harness.agentRepository.update(fakeAgent.id, { pid: 999999, outputFilePath: '/nonexistent/path' });
        // Verify it's marked as running
        let agent = await harness.agentRepository.findById(fakeAgent.id);
        expect(agent?.status).toBe('running');
        expect(agent?.pid).toBe(999999);
        // 2. Create new manager and reconcile
        console.log(' 2. Creating new manager and reconciling...');
        const newManager = new MultiProviderAgentManager(
          harness.agentRepository,
          harness.workspaceRoot,
          harness.projectRepository,
          harness.accountRepository,
          harness.eventBus
        );
        await newManager.reconcileAfterRestart();
        // 3. Verify agent is now crashed
        agent = await harness.agentRepository.findById(fakeAgent.id);
        expect(agent?.status).toBe('crashed');
        console.log(' 3. Agent marked as crashed (dead PID detected)');
      },
      REAL_TEST_TIMEOUT
    );

    it(
      'processes output file for dead agent during reconcile',
      async () => {
        // 1. Spawn agent and wait for completion
        console.log(' 1. Spawning agent to completion...');
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.done,
          mode: 'execute',
          provider: 'claude',
        });
        await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        const outputFilePath = dbAgent?.outputFilePath;
        expect(outputFilePath).toBeTruthy();
        console.log(' 2. Output file:', outputFilePath);
        // 2. Reset agent to "running" to simulate mid-crash state
        await harness.agentRepository.update(agent.id, { status: 'running' });
        // Clear result so reconcile has to re-process
        await harness.agentRepository.update(agent.id, { result: null });
        // Verify reset
        let resetAgent = await harness.agentRepository.findById(agent.id);
        expect(resetAgent?.status).toBe('running');
        // 3. Create new manager and reconcile
        console.log(' 3. Creating new manager and reconciling...');
        harness.clearEvents();
        const newManager = new MultiProviderAgentManager(
          harness.agentRepository,
          harness.workspaceRoot,
          harness.projectRepository,
          harness.accountRepository,
          harness.eventBus
        );
        await newManager.reconcileAfterRestart();
        // Give it a moment to process the file
        await sleep(1000);
        // 4. Verify agent was processed from output file
        const finalAgent = await harness.agentRepository.findById(agent.id);
        console.log(' 4. Final status:', finalAgent?.status);
        // Should either be idle (processed successfully) or crashed (couldn't process)
        expect(['idle', 'crashed']).toContain(finalAgent?.status);
      },
      REAL_TEST_TIMEOUT
    );
  });

  describe('Event Consistency', () => {
    it(
      'does not duplicate events on restart',
      async () => {
        // 1. Spawn agent with slow task
        console.log(' 1. Spawning agent...');
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.streaming,
          mode: 'execute',
          provider: 'claude',
        });
        // 2. Wait for some output events
        await sleep(3000);
        const initialOutputCount = harness.getEventsByType('agent:output').length;
        console.log(' 2. Initial output events:', initialOutputCount);
        // 3. Wait for completion
        await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const finalOutputCount = harness.getEventsByType('agent:output').length;
        console.log(' 3. Final output events:', finalOutputCount);
        // 4. Create new manager and reconcile (agent already complete)
        harness.clearEvents();
        const newManager = new MultiProviderAgentManager(
          harness.agentRepository,
          harness.workspaceRoot,
          harness.projectRepository,
          harness.accountRepository,
          harness.eventBus
        );
        await newManager.reconcileAfterRestart();
        await sleep(1000);
        // 5. Verify no new output events (agent was already complete)
        const postReconcileOutputCount = harness.getEventsByType('agent:output').length;
        console.log(' 4. Post-reconcile output events:', postReconcileOutputCount);
        // Should not have re-emitted all the old output events
        expect(postReconcileOutputCount).toBe(0);
      },
      REAL_TEST_TIMEOUT
    );
  });
});

View File

@@ -0,0 +1,378 @@
/**
* Real Provider Test Harness
*
* Extends the existing test infrastructure to use REAL MultiProviderAgentManager
* for integration testing with actual CLI providers like Claude and Codex.
*
* Unlike the standard TestHarness which uses MockAgentManager, this harness:
* - Uses real CLI spawning (costs real API credits!)
* - Provides poll-based waiting helpers
* - Captures events for inspection
* - Manages temp directories for worktrees
*/
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe } from 'vitest';
import type { DrizzleDatabase } from '../../../db/index.js';
import type { DomainEvent, EventBus } from '../../../events/types.js';
import { EventEmitterBus } from '../../../events/bus.js';
import { MultiProviderAgentManager } from '../../../agent/manager.js';
import type { AgentResult, PendingQuestions, AgentStatus } from '../../../agent/types.js';
import type { AgentRepository } from '../../../db/repositories/agent-repository.js';
import type { ProjectRepository } from '../../../db/repositories/project-repository.js';
import type { AccountRepository } from '../../../db/repositories/account-repository.js';
import type { InitiativeRepository } from '../../../db/repositories/initiative-repository.js';
import {
DrizzleAgentRepository,
DrizzleProjectRepository,
DrizzleAccountRepository,
DrizzleInitiativeRepository,
} from '../../../db/repositories/drizzle/index.js';
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
/**
 * Pause for the given number of milliseconds.
 *
 * Used by the harness's polling loops between repository status checks.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(() => {
      resolve();
    }, ms);
  });
}
/**
 * Event bus that records every emitted event so tests can inspect them.
 *
 * Events are captured first and then forwarded to the parent implementation,
 * so real subscribers still receive them.
 */
export class CapturingEventBus extends EventEmitterBus {
  emittedEvents: DomainEvent[] = [];

  emit<T extends DomainEvent>(event: T): void {
    // Record before dispatching so the capture list is complete even if a
    // listener throws.
    this.emittedEvents.push(event);
    super.emit(event);
  }

  getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
    const matching: T[] = [];
    for (const event of this.emittedEvents) {
      if (event.type === type) {
        matching.push(event as T);
      }
    }
    return matching;
  }

  clearEvents(): void {
    this.emittedEvents = [];
  }
}
/**
 * Options for creating a real provider test harness.
 */
export interface RealProviderHarnessOptions {
  /** Which provider to test (default: 'claude') */
  provider?: 'claude' | 'codex';
  /** Optional workspace root; when omitted, a fresh temp dir is created and a git repo is initialized in it */
  workspaceRoot?: string;
}
/**
 * Real Provider Test Harness interface.
 *
 * Provides everything needed to test against real CLI providers:
 * - In-memory database with real repositories
 * - Real MultiProviderAgentManager (spawns actual CLI processes)
 * - Event capture for verification
 * - Polling-based wait helpers
 */
export interface RealProviderHarness {
  /** In-memory SQLite database */
  db: DrizzleDatabase;
  /** Event bus with capture capability */
  eventBus: CapturingEventBus;
  /** Real agent manager (not mock!) */
  agentManager: MultiProviderAgentManager;
  /** Workspace root directory */
  workspaceRoot: string;
  /** Agent repository */
  agentRepository: AgentRepository;
  /** Project repository */
  projectRepository: ProjectRepository;
  /** Account repository */
  accountRepository: AccountRepository;
  /** Initiative repository */
  initiativeRepository: InitiativeRepository;
  /**
   * Wait for an agent to reach idle, stopped, or crashed status.
   * Polls the database at regular intervals.
   *
   * @param agentId - The agent ID to wait for
   * @param timeoutMs - Maximum time to wait (default 120000ms = 2 minutes)
   * @returns The agent result when the agent finishes (including crashed);
   *          null when the agent is missing or enters waiting_for_input
   * @throws Error when the timeout elapses while the agent is still running
   */
  waitForAgentCompletion(agentId: string, timeoutMs?: number): Promise<AgentResult | null>;
  /**
   * Wait for an agent to enter waiting_for_input status.
   * Polls the database at regular intervals.
   *
   * @param agentId - The agent ID to wait for
   * @param timeoutMs - Maximum time to wait (default 120000ms)
   * @returns The pending questions when waiting; null when the agent is
   *          missing or finishes without asking questions
   * @throws Error when the timeout elapses while the agent is still running
   */
  waitForAgentWaiting(agentId: string, timeoutMs?: number): Promise<PendingQuestions | null>;
  /**
   * Wait for an agent to reach a specific status.
   *
   * @param agentId - The agent ID to wait for
   * @param status - The target status
   * @param timeoutMs - Maximum time to wait (default 120000ms)
   * @throws Error when the agent is not found, when it enters a terminal
   *         state that makes the target unreachable, or on timeout
   */
  waitForAgentStatus(agentId: string, status: AgentStatus, timeoutMs?: number): Promise<void>;
  /**
   * Get captured events filtered by type.
   */
  getEventsByType<T extends DomainEvent>(type: T['type']): T[];
  /**
   * Clear all captured events.
   */
  clearEvents(): void;
  /**
   * Kill all running agents (for cleanup).
   */
  killAllAgents(): Promise<void>;
  /**
   * Clean up all resources (directories, processes).
   * Call this in afterAll/afterEach.
   */
  cleanup(): Promise<void>;
}
/** Default interval (in milliseconds) between database polls in the wait helpers */
const POLL_INTERVAL_MS = 1000;
/**
* Create a test harness for real provider integration tests.
*
* This creates:
* - In-memory SQLite database
* - Temp directory for worktrees (or uses provided workspace)
* - Real MultiProviderAgentManager
* - Event capture bus
*
* @example
* ```typescript
* let harness: RealProviderHarness;
*
* beforeAll(async () => {
* harness = await createRealProviderHarness({ provider: 'claude' });
* });
*
* afterAll(async () => {
* await harness.cleanup();
* });
*
* it('spawns and completes', async () => {
* const agent = await harness.agentManager.spawn({...});
* const result = await harness.waitForAgentCompletion(agent.id);
* expect(result?.success).toBe(true);
* });
* ```
*/
export async function createRealProviderHarness(
  options: RealProviderHarnessOptions = {}
): Promise<RealProviderHarness> {
  // Create workspace directory (temp if not provided). Ownership is tracked so
  // cleanup() only deletes directories this harness created itself.
  const workspaceRoot = options.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'cw-test-')));
  const ownedWorkspace = !options.workspaceRoot; // Track if we need to clean up
  // Initialize git repo in temp workspace (required for worktree operations)
  if (ownedWorkspace) {
    const { execSync } = await import('node:child_process');
    execSync('git init', { cwd: workspaceRoot, stdio: 'ignore' });
    execSync('git config user.email "test@test.com"', { cwd: workspaceRoot, stdio: 'ignore' });
    execSync('git config user.name "Test"', { cwd: workspaceRoot, stdio: 'ignore' });
    // Create initial commit (worktrees require at least one commit)
    // NOTE(review): relies on the shell `touch` command, so owned workspaces
    // are POSIX-only — confirm Windows is not a supported test platform.
    execSync('touch .gitkeep && git add .gitkeep && git commit -m "init"', { cwd: workspaceRoot, stdio: 'ignore' });
  }
  // Create in-memory database (fresh per harness, no cross-test state)
  const db = createTestDatabase();
  // Create repositories backed by the in-memory database
  const agentRepository = new DrizzleAgentRepository(db);
  const projectRepository = new DrizzleProjectRepository(db);
  const accountRepository = new DrizzleAccountRepository(db);
  const initiativeRepository = new DrizzleInitiativeRepository(db);
  // Create event bus with capture (parent class already sets maxListeners to 100)
  const eventBus = new CapturingEventBus();
  // Create REAL agent manager (not mock!) — spawns actual CLI processes
  const agentManager = new MultiProviderAgentManager(
    agentRepository,
    workspaceRoot,
    projectRepository,
    accountRepository,
    eventBus
  );
  // Build harness. The wait* helpers below all poll the agent repository at
  // POLL_INTERVAL_MS until the target condition or the deadline is reached.
  const harness: RealProviderHarness = {
    db,
    eventBus,
    agentManager,
    workspaceRoot,
    agentRepository,
    projectRepository,
    accountRepository,
    initiativeRepository,
    // Resolves with the provider result once the agent reaches a terminal
    // state (idle/stopped/crashed); resolves null when the agent no longer
    // exists or stops to wait for input (i.e. it did not complete); throws
    // on deadline expiry.
    async waitForAgentCompletion(agentId: string, timeoutMs = 120000): Promise<AgentResult | null> {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) return null;
        if (agent.status === 'idle' || agent.status === 'stopped') {
          // Agent completed - get result
          return agentManager.getResult(agentId);
        }
        if (agent.status === 'crashed') {
          // Agent crashed - return the error result
          return agentManager.getResult(agentId);
        }
        if (agent.status === 'waiting_for_input') {
          // Agent is waiting - return null (not completed)
          return null;
        }
        // Still running - wait and check again
        await sleep(POLL_INTERVAL_MS);
      }
      throw new Error(`Timeout waiting for agent ${agentId} to complete after ${timeoutMs}ms`);
    },
    // Resolves with the agent's pending questions once it reaches
    // waiting_for_input; resolves null when it finished (or vanished)
    // without asking anything; throws on deadline expiry.
    async waitForAgentWaiting(agentId: string, timeoutMs = 120000): Promise<PendingQuestions | null> {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) return null;
        if (agent.status === 'waiting_for_input') {
          return agentManager.getPendingQuestions(agentId);
        }
        if (agent.status === 'idle' || agent.status === 'stopped' || agent.status === 'crashed') {
          // Agent finished without asking questions
          return null;
        }
        // Still running - wait and check again
        await sleep(POLL_INTERVAL_MS);
      }
      throw new Error(`Timeout waiting for agent ${agentId} to request input after ${timeoutMs}ms`);
    },
    // Waits for an exact status. Fails fast when waiting for 'running' but
    // the agent is already in a state it cannot leave back into 'running'.
    async waitForAgentStatus(agentId: string, status: AgentStatus, timeoutMs = 120000): Promise<void> {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        const agent = await agentRepository.findById(agentId);
        if (!agent) {
          throw new Error(`Agent ${agentId} not found`);
        }
        if (agent.status === status) {
          return;
        }
        // Check for terminal states that mean we'll never reach target
        if (status === 'running' && ['idle', 'stopped', 'crashed', 'waiting_for_input'].includes(agent.status)) {
          throw new Error(`Agent ${agentId} already in terminal state ${agent.status}, cannot reach ${status}`);
        }
        await sleep(POLL_INTERVAL_MS);
      }
      throw new Error(`Timeout waiting for agent ${agentId} to reach status ${status} after ${timeoutMs}ms`);
    },
    // Thin pass-throughs to the capturing event bus.
    getEventsByType<T extends DomainEvent>(type: T['type']): T[] {
      return eventBus.getEventsByType<T>(type);
    },
    clearEvents(): void {
      eventBus.clearEvents();
    },
    // Best-effort stop of every running agent; stop errors are swallowed so
    // teardown never masks the original test failure.
    async killAllAgents(): Promise<void> {
      const agents = await agentRepository.findAll();
      for (const agent of agents) {
        if (agent.status === 'running') {
          try {
            await agentManager.stop(agent.id);
          } catch {
            // Ignore errors during cleanup
          }
        }
      }
    },
    async cleanup(): Promise<void> {
      // Kill any running agents before deleting their working directory
      await harness.killAllAgents();
      // Clean up workspace directory if we created it
      if (ownedWorkspace) {
        try {
          await rm(workspaceRoot, { recursive: true, force: true });
        } catch {
          // Ignore cleanup errors
        }
      }
    },
  };
  return harness;
}
/**
 * Opt-in gates for the real-provider suites. Each suite is skipped unless
 * its environment variable is explicitly set to '1', because these tests
 * invoke real CLI providers and incur API costs.
 */
export const shouldRunRealClaudeTests = process.env.REAL_CLAUDE_TESTS === '1';
export const shouldRunRealCodexTests = process.env.REAL_CODEX_TESTS === '1';
/** Returns `describe` when enabled, `describe.skip` otherwise. */
const gatedDescribe = (enabled: boolean): typeof describe =>
  enabled ? describe : (describe.skip as typeof describe);
/** Suite wrapper that only runs when REAL_CLAUDE_TESTS=1. */
export const describeRealClaude: typeof describe = gatedDescribe(shouldRunRealClaudeTests);
/** Suite wrapper that only runs when REAL_CODEX_TESTS=1. */
export const describeRealCodex: typeof describe = gatedDescribe(shouldRunRealCodexTests);
/**
 * Default timeout for real CLI tests (2 minutes). Individual API calls
 * typically finish in 5-30 seconds.
 */
export const REAL_TEST_TIMEOUT = 120_000;
/**
 * Extended timeout (5 minutes) for slow suites such as schema retry and
 * crash recovery, which may involve multiple sequential CLI runs.
 */
export const EXTENDED_TEST_TIMEOUT = 300_000;

View File

@@ -0,0 +1,56 @@
/**
* Real Provider Integration Tests
*
* This module provides infrastructure for testing against real CLI providers.
* Tests are expensive (real API calls) and skipped by default.
*
* ## Running Tests
*
* ```bash
* # Claude tests only
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000
*
* # Codex tests only
* REAL_CODEX_TESTS=1 npm test -- src/test/integration/real-providers/codex-manager.test.ts
*
* # All real provider tests
* REAL_CLAUDE_TESTS=1 REAL_CODEX_TESTS=1 npm test -- src/test/integration/real-providers/
* ```
*
* ## Cost Estimates
*
* | Suite | Tests | Est. Cost | Duration |
* |-------|-------|-----------|----------|
* | Output Parsing | 3 | $0.06 | ~2 min |
* | Schema Validation | 4 | $0.22 | ~4 min |
* | Crash Recovery | 3 | $0.08 | ~3 min |
* | Session Resume | 2 | $0.08 | ~3 min |
* | Codex Integration | 2 | $0.10 | ~2 min |
* | **TOTAL** | **14** | **~$0.54** | **~14 min** |
*
* ## Test Files
*
* - `harness.ts` - RealProviderHarness factory and utilities
* - `prompts.ts` - Minimal cost test prompts
* - `claude-manager.test.ts` - Claude spawn/resume/output tests
* - `codex-manager.test.ts` - Codex provider tests
* - `schema-retry.test.ts` - Schema validation + retry tests
* - `crash-recovery.test.ts` - Server restart simulation
* - `sample-outputs/` - Captured CLI output for parser unit tests
*/
// Public surface of the real-provider test infrastructure: harness factory,
// event capture, env-based suite gates, timeouts, and associated types.
export {
  createRealProviderHarness,
  CapturingEventBus,
  sleep,
  shouldRunRealClaudeTests,
  shouldRunRealCodexTests,
  describeRealClaude,
  describeRealCodex,
  REAL_TEST_TIMEOUT,
  EXTENDED_TEST_TIMEOUT,
  type RealProviderHarness,
  type RealProviderHarnessOptions,
} from './harness.js';
// Minimal-cost prompt fixtures shared by every real-provider suite.
export { MINIMAL_PROMPTS, CODEX_PROMPTS } from './prompts.js';

View File

@@ -0,0 +1,113 @@
/**
* Minimal Cost Test Prompts
*
* Carefully crafted prompts designed to minimize token usage while
* testing specific CLI behaviors. Each prompt aims for the smallest
* possible API cost while still exercising the target functionality.
*
* Cost estimates assume Claude Sonnet pricing (~$3/M input, $15/M output).
*/
/**
 * Minimal-cost prompt fixtures for real-provider integration tests.
 *
 * Each prompt is the shortest text that still exercises one specific CLI
 * behavior, keeping a single invocation in the low single-digit-cent range.
 * Cost notes assume Claude Sonnet pricing (~$3/M input, $15/M output).
 */
export const MINIMAL_PROMPTS = {
  // ~$0.01 - cheapest possible done response.
  // Exercises: basic spawn -> completion flow, status parsing.
  done: 'Output exactly this JSON with no other text:\n{"status":"done","result":"ok"}',
  // ~$0.01 - cheapest possible questions response.
  // Exercises: waiting_for_input status, questions array parsing.
  questions:
    'Output exactly this JSON with no other text:\n{"status":"questions","questions":[{"id":"q1","question":"What is your name?"}]}',
  // ~$0.03 - slow task for timing tests.
  // Exercises: streaming during a long-running task, crash recovery.
  // The agent may not literally take 30 seconds, but output arrives delayed.
  slow: 'Think through a simple problem step by step, counting from 1 to 10 slowly, then output:\n{"status":"done","result":"counted to 10"}',
  // ~$0.02 - produces incremental text output.
  // Exercises: text_delta event parsing, output buffering.
  streaming:
    'Count from 1 to 5, outputting each number, then output:\n{"status":"done","result":"counted"}',
  // ~$0.03 - deliberately emits non-JSON text before the JSON.
  // Exercises: schema validation failure, retry logic.
  badThenGood:
    'First say "thinking..." on its own line, then output:\n{"status":"done","result":"fixed"}',
  // ~$0.02 - a questions response with more than one entry.
  // Exercises: multi-item questions array.
  multipleQuestions:
    'Output exactly this JSON with no other text:\n{"status":"questions","questions":[{"id":"q1","question":"First question?"},{"id":"q2","question":"Second question?"}]}',
  // ~$0.01 - explicit error signal.
  // Exercises: error status handling.
  error:
    'Output exactly this JSON with no other text:\n{"status":"error","error":"Test error message"}',
  // ~$0.02 - builds the follow-up prompt used when resuming a session with
  // answers to previously asked questions (one "id: answer" line per entry,
  // in the object's insertion order).
  answerContinuation: (answers: Record<string, string>): string =>
    [
      'I received your answers:',
      ...Object.entries(answers).map(([id, answer]) => `${id}: ${answer}`),
      'Now complete the task by outputting:',
      '{"status":"done","result":"completed with answers"}',
    ].join('\n'),
  // ~$0.02 - context complete for discuss mode (universal done signal).
  discussComplete: 'Output exactly this JSON with no other text:\n{"status":"done"}',
  // ~$0.02 - plan complete (universal done signal).
  planComplete: 'Output exactly this JSON with no other text:\n{"status":"done"}',
  // ~$0.02 - detail complete (universal done signal).
  detailComplete: 'Output exactly this JSON with no other text:\n{"status":"done"}',
} as const;
/**
* Prompts specifically for Codex provider testing.
* Codex may have different output format requirements.
*/
/**
 * Prompts specifically for Codex provider testing. Unlike the Claude
 * fixtures above, these are plain natural-language tasks because Codex
 * may have different output format requirements.
 */
export const CODEX_PROMPTS = {
  // Basic spawn -> completion round trip.
  done: 'Complete this simple task: output "done" and finish.',
  // Forces several output chunks for stream-parsing tests.
  streaming: 'Count from 1 to 5, saying each number aloud, then say "finished".',
} as const;

View File

@@ -0,0 +1,68 @@
# Sample CLI Outputs
This directory contains captured real CLI outputs for use in parser unit tests.
These files allow testing stream parsers without incurring API costs.
## Files
### claude-stream-success.jsonl
A successful Claude CLI session (v2.1.33) that:
- Initializes with `system` event containing `session_id`
- Emits `assistant` message with content
- Completes with `result` event containing `done` status JSON
### claude-stream-questions.jsonl
A Claude CLI session that:
- Initializes with `system` event containing `session_id`
- Emits `assistant` message with content wrapped in markdown code block
- Completes with `result` event containing `questions` status JSON
### codex-stream-success.jsonl
A successful Codex CLI session (v0.98.0) that:
- Starts with `thread.started` event containing `thread_id`
- Emits `turn.started`, `item.completed` events
- Completes with `turn.completed` event containing usage stats
## Event Type Differences
### Claude CLI (`--output-format stream-json`)
- `system` (subtype: `init`) - Contains `session_id`, tools, model info
- `assistant` - Contains message content in `content[].text`
- `result` - Contains final `result` text and `total_cost_usd`
### Codex CLI (`--json`)
- `thread.started` - Contains `thread_id` (equivalent to session_id)
- `turn.started` - Marks beginning of turn
- `item.completed` - Contains reasoning or agent_message items
- `turn.completed` - Contains usage stats
## Usage
These files can be used to test stream parsers in isolation:
```typescript
import { readFileSync } from 'fs';
import { ClaudeStreamParser } from '../../../agent/providers/parsers/claude.js';
const output = readFileSync('sample-outputs/claude-stream-success.jsonl', 'utf-8');
const parser = new ClaudeStreamParser();
for (const line of output.split('\n')) {
if (line.trim()) {
const events = parser.parseLine(line);
// Assert on events...
}
}
```
## Capturing New Outputs
### Claude
```bash
claude -p "your prompt" --output-format stream-json --verbose > output.jsonl
```
### Codex
```bash
codex exec --full-auto --json "your prompt" > output.jsonl
```

View File

@@ -0,0 +1,3 @@
{"type":"system","subtype":"init","cwd":"/Users/lukasmay/development/projects/codewalk-district","session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","slash_commands":["keybindings-help","debug","gsd:define-requirements","gsd:list-phase-assumptions","gsd:debug","gsd:remove-phase","gsd:complete-milestone","gsd:research-phase","gsd:plan-phase","gsd:check-todos","gsd:pause-work","gsd:execute-plan","gsd:research-project","gsd:add-todo","gsd:plan-fix","gsd:resume-work","gsd:progress","gsd:help","gsd:discuss-milestone","gsd:add-phase","gsd:create-roadmap","gsd:map-codebase","gsd:whats-new","gsd:insert-phase","gsd:new-milestone","gsd:new-project","gsd:execute-phase","gsd:verify-work","gsd:discuss-phase","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.33","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","claude-code-guide","jira-sw-assessment"],"skills":["keybindings-help","debug"],"plugins":[],"uuid":"224c683c-41f4-4fdd-9af6-f8cdca366ec1"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01CfDymxvSRFodJ5Zm6NjLHV","type":"message","role":"assistant","content":[{"type":"text","text":"```json\n{\"status\":\"questions\",\"questions\":[{\"id\":\"q1\",\"question\":\"What is your name?\"},{\"id\":\"q2\",\"question\":\"What is the deadline?\"}]}\n```"}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":5983,"cache_read_input_tokens":18026,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":5983},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","uuid":"29288f20-766c-4047-82f5-679024188f52"}
{"type":"result","subtype":"success","is_error":false,"duration_ms":3213,"duration_api_ms":3203,"num_turns":1,"result":"```json\n{\"status\":\"questions\",\"questions\":[{\"id\":\"q1\",\"question\":\"What is your name?\"},{\"id\":\"q2\",\"question\":\"What is the deadline?\"}]}\n```","stop_reason":null,"session_id":"774631da-8e54-445e-9ccb-eea8e7fe805e","total_cost_usd":0.04754675,"usage":{"input_tokens":3,"cache_creation_input_tokens":5983,"cache_read_input_tokens":18026,"output_tokens":45,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":5983,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":3,"outputTokens":45,"cacheReadInputTokens":18026,"cacheCreationInputTokens":5983,"webSearchRequests":0,"costUSD":0.04754675,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"uuid":"08db08cd-0f12-47ae-8c21-c29e11a6d7df"}

View File

@@ -0,0 +1,3 @@
{"type":"system","subtype":"init","cwd":"/Users/lukasmay/development/projects/codewalk-district","session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","slash_commands":["keybindings-help","debug","gsd:define-requirements","gsd:list-phase-assumptions","gsd:debug","gsd:remove-phase","gsd:complete-milestone","gsd:research-phase","gsd:plan-phase","gsd:check-todos","gsd:pause-work","gsd:execute-plan","gsd:research-project","gsd:add-todo","gsd:plan-fix","gsd:resume-work","gsd:progress","gsd:help","gsd:discuss-milestone","gsd:add-phase","gsd:create-roadmap","gsd:map-codebase","gsd:whats-new","gsd:insert-phase","gsd:new-milestone","gsd:new-project","gsd:execute-phase","gsd:verify-work","gsd:discuss-phase","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.33","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","claude-code-guide","jira-sw-assessment"],"skills":["keybindings-help","debug"],"plugins":[],"uuid":"c1d6dced-ca04-4335-a624-624660479b7b"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RjSiQY1RUgT47j73Dom93j","type":"message","role":"assistant","content":[{"type":"text","text":"{\"status\":\"done\",\"result\":\"ok\"}"}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":5958,"cache_read_input_tokens":18026,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":5958},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","uuid":"f1c8695a-534e-4de2-a684-fa4a1ec03749"}
{"type":"result","subtype":"success","is_error":false,"duration_ms":2465,"duration_api_ms":2453,"num_turns":1,"result":"{\"status\":\"done\",\"result\":\"ok\"}","stop_reason":null,"session_id":"a0aa6272-b3a6-443c-9ef5-de3a2450dc6d","total_cost_usd":0.046565499999999996,"usage":{"input_tokens":3,"cache_creation_input_tokens":5958,"cache_read_input_tokens":18026,"output_tokens":12,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":5958,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":3,"outputTokens":12,"cacheReadInputTokens":18026,"cacheCreationInputTokens":5958,"webSearchRequests":0,"costUSD":0.046565499999999996,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"uuid":"53139e08-b4f3-4f94-b129-82759f77fdca"}

View File

@@ -0,0 +1,5 @@
{"type":"thread.started","thread_id":"019c3242-955e-7140-9978-517f0b5a22cb"}
{"type":"turn.started"}
{"type":"item.completed","item":{"id":"item_0","type":"reasoning","text":"**Confirming simple greeting task**"}}
{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"Hello!"}}
{"type":"turn.completed","usage":{"input_tokens":8458,"cached_input_tokens":6912,"output_tokens":32}}

View File

@@ -0,0 +1,306 @@
/**
* Schema Validation & Retry Integration Tests
*
* IMPORTANT: These tests call the REAL Claude CLI and incur API costs!
* They are SKIPPED by default and should only be run manually for validation.
*
* To run these tests:
* ```bash
* REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/schema-retry.test.ts --test-timeout=300000
* ```
*
* Tests covered:
* - Valid JSON output validation
* - Questions status parsing
* - Schema validation failure with retry
* - Max retry limit handling
*
* Estimated cost: ~$0.20 per full run (includes retries)
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import {
createRealProviderHarness,
describeRealClaude,
REAL_TEST_TIMEOUT,
EXTENDED_TEST_TIMEOUT,
type RealProviderHarness,
} from './harness.js';
import { MINIMAL_PROMPTS } from './prompts.js';
import type { AgentResumedEvent, AgentCrashedEvent } from '../../../events/types.js';
// Suite is gated by REAL_CLAUDE_TESTS=1 (see describeRealClaude): every test
// here spawns a real Claude CLI process and costs real money.
describeRealClaude('Schema Validation & Retry', () => {
  let harness: RealProviderHarness;
  beforeAll(async () => {
    console.log('\n=== Running Schema Validation & Retry Tests ===');
    console.log('These tests call the real Claude API and incur costs.');
    console.log('Retry tests may take longer and cost more.\n');
    harness = await createRealProviderHarness({ provider: 'claude' });
  });
  afterAll(async () => {
    await harness.cleanup();
  });
  beforeEach(() => {
    // Reset captured events so each test asserts only its own emissions.
    harness.clearEvents();
  });
  describe('Valid Output', () => {
    it(
      'validates done status output',
      async () => {
        // Spawn agent with minimal done prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.done,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        // Verify completion
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('idle');
        expect(result?.success).toBe(true);
        // No retry events should have been emitted for a first-try valid output
        const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
        expect(resumeEvents.length).toBe(0);
        console.log(' Status: idle (valid done output)');
        console.log(' Result:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'validates questions status output',
      async () => {
        // Spawn agent with questions prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.questions,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for waiting_for_input
        const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
        // Verify questions were validated
        expect(questions).toBeTruthy();
        expect(questions?.questions).toBeInstanceOf(Array);
        expect(questions?.questions.length).toBeGreaterThan(0);
        // Each question should have id and question fields
        for (const q of questions?.questions ?? []) {
          expect(q.id).toBeTruthy();
          expect(q.question).toBeTruthy();
        }
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('waiting_for_input');
        // No retry events
        const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
        expect(resumeEvents.length).toBe(0);
        console.log(' Status: waiting_for_input (valid questions output)');
        console.log(' Questions:', questions?.questions.length);
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'validates multiple questions',
      async () => {
        // Spawn agent with multiple questions prompt
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.multipleQuestions,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for waiting_for_input
        const questions = await harness.waitForAgentWaiting(agent.id, REAL_TEST_TIMEOUT);
        // Verify multiple questions
        expect(questions?.questions.length).toBeGreaterThanOrEqual(2);
        // Each question should have unique ID
        const ids = questions?.questions.map((q) => q.id) ?? [];
        const uniqueIds = new Set(ids);
        expect(uniqueIds.size).toBe(ids.length);
        console.log(' Questions:', questions?.questions.map((q) => q.id).join(', '));
      },
      REAL_TEST_TIMEOUT
    );
  });
  describe('Retry Logic', () => {
    it(
      'retries when output does not match schema',
      async () => {
        // Prompt that produces non-JSON first, then valid JSON
        // Note: Claude may or may not produce invalid output first, so this
        // test accepts both the success path and a crash after max retries.
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.badThenGood,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion (may involve retries, hence the extended timeout)
        const result = await harness.waitForAgentCompletion(agent.id, EXTENDED_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        // Either succeeded with retry OR succeeded first time
        expect(['idle', 'crashed']).toContain(dbAgent?.status);
        // Check for retry events
        const resumeEvents = harness.getEventsByType<AgentResumedEvent>('agent:resumed');
        console.log(' Retry attempts:', resumeEvents.length);
        console.log(' Final status:', dbAgent?.status);
        if (dbAgent?.status === 'idle') {
          expect(result?.success).toBe(true);
          console.log(' Result:', result?.message);
        } else {
          // Crashed after max retries
          const crashedEvents = harness.getEventsByType<AgentCrashedEvent>('agent:crashed');
          expect(crashedEvents.length).toBeGreaterThan(0);
          console.log(' Crashed after retries');
        }
      },
      EXTENDED_TEST_TIMEOUT
    );
    it(
      'extracts JSON from markdown code blocks',
      async () => {
        // Prompt that produces JSON wrapped in markdown
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: `Output the result wrapped in a markdown code block like this:
\`\`\`json
{"status":"done","result":"extracted from markdown"}
\`\`\``,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message);
        // Should succeed (JSON extraction from code block); only asserted on
        // the happy path because the model's exact formatting is not guaranteed
        if (dbAgent?.status === 'idle') {
          expect(result?.success).toBe(true);
        }
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'extracts JSON from text with surrounding content',
      async () => {
        // Prompt that produces JSON with text before it
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: `First say "Here is my response:" then output the JSON:
{"status":"done","result":"extracted from text"}`,
          mode: 'execute',
          provider: 'claude',
        });
        // Wait for completion
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        console.log(' Status:', dbAgent?.status);
        console.log(' Result:', result?.message);
        // Should succeed (JSON extraction from last {...} block); same
        // happy-path-only assertion as the markdown extraction test above
        if (dbAgent?.status === 'idle') {
          expect(result?.success).toBe(true);
        }
      },
      REAL_TEST_TIMEOUT
    );
  });
  // Each mode prompt emits the universal done signal; these verify that the
  // mode-specific schemas accept it.
  describe('Mode-Specific Schemas', () => {
    it(
      'validates discuss mode output',
      async () => {
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.discussComplete,
          mode: 'discuss',
          provider: 'claude',
        });
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('idle');
        expect(result?.success).toBe(true);
        console.log(' Discuss mode result:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'validates plan mode output',
      async () => {
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.planComplete,
          mode: 'plan',
          provider: 'claude',
        });
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('idle');
        expect(result?.success).toBe(true);
        console.log(' Plan mode result:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
    it(
      'validates detail mode output',
      async () => {
        const agent = await harness.agentManager.spawn({
          taskId: null,
          prompt: MINIMAL_PROMPTS.detailComplete,
          mode: 'detail',
          provider: 'claude',
        });
        const result = await harness.waitForAgentCompletion(agent.id, REAL_TEST_TIMEOUT);
        const dbAgent = await harness.agentRepository.findById(agent.id);
        expect(dbAgent?.status).toBe('idle');
        expect(result?.success).toBe(true);
        console.log(' Detail mode result:', result?.message);
      },
      REAL_TEST_TIMEOUT
    );
  });
});