diff --git a/CLAUDE.md b/CLAUDE.md index 6e8dde4..87e8b6e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -48,6 +48,11 @@ npm test CW_CASSETTE_RECORD=1 npm test -- # Record new cassettes locally REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50) FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000 # Full end-to-end test (~$2-5) + +# Record full-flow cassettes (one-time, costs ~$2–5 in API credits): +CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000 +# Commit the generated src/test/cassettes/*.json files afterward. +# Subsequent runs replay from cassettes at no cost: npm test ``` See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs. diff --git a/src/test/cassette/cassette.test.ts b/src/test/cassette/cassette.test.ts index 643eb8a..f782a90 100644 --- a/src/test/cassette/cassette.test.ts +++ b/src/test/cassette/cassette.test.ts @@ -56,6 +56,14 @@ describe('normalizePrompt', () => { expect(result).toBe(prompt); }); + it('strips nanoid strings (21-char alphanumeric)', () => { + const nanoid = 'V1StGXR8_Z5jdHi6B-myT'; + const prompt = `Agent worktree: /tmp/cw-preview-${nanoid}/app`; + const result = normalizePrompt(prompt, ''); + expect(result).not.toContain(nanoid); + expect(result).toContain('__ID__'); + }); + it('strips workspace root before UUID replacement to avoid double-normalizing', () => { const workspaceRoot = '/tmp/cw-test-abc123'; const uuid = '550e8400-e29b-41d4-a716-446655440000'; diff --git a/src/test/cassette/normalizer.ts b/src/test/cassette/normalizer.ts index 56c7c76..9bd3e1c 100644 --- a/src/test/cassette/normalizer.ts +++ b/src/test/cassette/normalizer.ts @@ -8,6 +8,7 @@ */ const UUID_RE = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi; +const NANOID_RE = /(?.json files afterward + * + 
* Replay (default — runs in seconds): + * npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts + * + * Force re-record (overwrites existing cassettes): + * CW_CASSETTE_FORCE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000 + */ + +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { existsSync, readdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import type { Phase, Task } from '../../../db/schema.js'; +import type { AgentResult } from '../../../agent/types.js'; +import { buildExecutePrompt } from '../../../agent/prompts/index.js'; +import { CassetteStore } from '../../cassette/store.js'; +import { CassetteProcessManager, type CassetteMode } from '../../cassette/process-manager.js'; +import { + createFullFlowHarness, + type FullFlowHarness, +} from './harness.js'; +import { + printHeader, + printDiscussResult, + printPlanResult, + printDetailResult, + printExecuteResult, + printFinalSummary, + type ExecutedTask, +} from './report.js'; + +// ============================================================================= +// Constants +// ============================================================================= + +/** Total test timeout: 5 minutes (replay=seconds; 5min covers accidental record) */ +const CASSETTE_FLOW_TIMEOUT = 5 * 60_000; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const CASSETTE_DIR = + process.env.CW_CASSETTE_DIR ?? 
join(__dirname, '../../cassettes'); + +// ============================================================================= +// Mode helper +// ============================================================================= + +/** + * Resolve the cassette mode from environment flags. + * CW_CASSETTE_FORCE_RECORD=1 yields 'record' and takes precedence over + * CW_CASSETTE_RECORD=1, which yields 'auto'; with neither set the mode is 'replay'. + */ +function cassetteMode(): CassetteMode { + if (process.env.CW_CASSETTE_FORCE_RECORD === '1') return 'record'; + if (process.env.CW_CASSETTE_RECORD === '1') return 'auto'; + return 'replay'; +} + +/** + * True when the mode is not 'replay', or when CASSETTE_DIR exists and directly + * contains at least one entry whose name ends in .json. The describe block below + * uses the negated result with skipIf, so that `npm test` doesn't fail on a + * fresh clone before cassettes are committed. + */ +function cassettesAvailable(): boolean { + const mode = cassetteMode(); + if (mode !== 'replay') return true; // recording runs always proceed + if (!existsSync(CASSETTE_DIR)) return false; + return readdirSync(CASSETTE_DIR).some((f) => f.endsWith('.json')); +} + +// ============================================================================= +// Test +// ============================================================================= + +describe.skipIf(!cassettesAvailable())('full flow (cassette replay)', () => { + let harness: FullFlowHarness; + // Captured when the describe callback runs, i.e. before beforeAll; the elapsed time reported at the end therefore includes harness setup. + const startedAt = Date.now(); + + beforeAll(async () => { + const store = new CassetteStore(CASSETTE_DIR); + const mode = cassetteMode(); + + harness = await createFullFlowHarness('Add complete() method to TodoStore', { + processManagerFactory: (workspaceRoot, projectRepo) => + new CassetteProcessManager(workspaceRoot, projectRepo, store, mode), + }); + + printHeader(harness.initiative.name); + console.log(` Cassette mode : ${mode}`); + console.log(` Cassette dir : ${CASSETTE_DIR}`); + console.log(` Initiative ID : ${harness.initiative.id}`); + console.log(` Workspace : ${harness.workspaceRoot}`); + }, CASSETTE_FLOW_TIMEOUT); + + afterAll(async () => { + // harness stays unassigned if createFullFlowHarness threw, hence the guard. + if (harness) await harness.cleanup(); + }); + + it( + 'runs the complete multi-agent workflow from cassettes', + async () 
=> { + const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness; + const initiativeId = initiative.id; + + // ── Stage 2: Discuss ─────────────────────────────────────────────────── + console.log('\n\n>>> Stage 2: DISCUSS <<<'); + const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId }); + expect(discussAgent.id).toBeTruthy(); + console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`); + + const discussResult = await harness.driveToCompletion( + discussAgent.id, + 'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.', + CASSETTE_FLOW_TIMEOUT, + ); + printDiscussResult(discussAgent.id, discussResult); + + // A failed or missing discuss result is deliberately non-fatal: it is only logged and the flow moves on to planning. + if (!discussResult?.success) { + console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage'); + } + + // ── Stage 3: Plan ────────────────────────────────────────────────────── + console.log('\n\n>>> Stage 3: PLAN <<<'); + const planAgent = await caller.spawnArchitectPlan({ initiativeId }); + expect(planAgent.id).toBeTruthy(); + console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`); + + const planResult = await harness.driveToCompletion( + planAgent.id, + 'Keep it simple.', + CASSETTE_FLOW_TIMEOUT, + ); + expect(planResult).toBeTruthy(); + + // Planning must have produced at least one phase for this initiative; the later stages iterate over them. + const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId); + expect(phases.length).toBeGreaterThan(0); + printPlanResult(phases); + + // ── Stage 4: Detail (per phase) ──────────────────────────────────────── + console.log('\n\n>>> Stage 4: DETAIL <<<'); + for (const phase of phases) { + const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id }); + expect(detailAgent.id).toBeTruthy(); + console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`); + + const detailResult = await harness.driveToCompletion( + detailAgent.id, + 'Keep it simple.', + CASSETTE_FLOW_TIMEOUT, + ); + expect(detailResult).toBeTruthy(); + + const 
phaseTasks = await taskRepository.findByPhaseId(phase.id); + // Each phase must end up with at least one task of category 'execute' and type 'auto'; the same filter selects the tasks run in Stage 5. + const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto'); + expect(executeTasks.length).toBeGreaterThan(0); + printDetailResult(phase, phaseTasks); + } + + // ── Stage 5: Execute ─────────────────────────────────────────────────── + console.log('\n\n>>> Stage 5: EXECUTE <<<'); + const allTasks = await gatherAllExecuteTasks(taskRepository, phases); + console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`); + + const executed: ExecutedTask[] = []; + // Tasks are executed one after another; each agent is driven to completion before the next one is spawned. + for (const task of allTasks) { + console.log(` Spawning execute agent for: "${task.name}"`); + // Unlike the architect stages, execute agents are spawned directly through agentManager; the prompt falls back to the task name when there is no description. + const execAgent = await agentManager.spawn({ + taskId: task.id, + prompt: buildExecutePrompt(task.description ?? task.name), + mode: 'execute', + initiativeId, + phaseId: task.phaseId ?? undefined, + inputContext: { + initiative, + task, + }, + }); + console.log(` Agent: ${execAgent.name} (${execAgent.id})`); + + const result = await harness.driveToCompletion( + execAgent.id, + 'Use your best judgment and keep it simple.', + CASSETTE_FLOW_TIMEOUT, + ); + executed.push({ task, result }); + + const icon = result?.success ? '✓' : '✗'; + console.log(` ${icon} Completed with success=${result?.success ?? 
null}`); + if (result && !result.success) { + console.log(` Message: ${result.message?.slice(0, 200)}`); + } + } + + printExecuteResult(executed); + + // ── Assertions ───────────────────────────────────────────────────────── + expect(executed.length).toBeGreaterThan(0); + + const allSucceeded = executed.every((e) => e.result?.success === true); + if (!allSucceeded) { + const failed = executed.filter((e) => !e.result?.success); + console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`); + } + + // ── Final summary ────────────────────────────────────────────────────── + printFinalSummary( + initiative.name, + phases, + allTasks, + executed, + Date.now() - startedAt, + ); + }, + CASSETTE_FLOW_TIMEOUT, + ); +}); + +// ============================================================================= +// Helpers +// ============================================================================= + +/** + * Collect the automatic execute tasks of the given phases, in phase order. + * Phases are queried one at a time through taskRepository.findByPhaseId and only + * tasks with category 'execute' and type 'auto' are kept. + * + * @param taskRepository - repository taken from the harness + * @param phases - phases whose tasks should be gathered + * @returns the matching tasks; empty when no phase has one + */ +async function gatherAllExecuteTasks( + taskRepository: FullFlowHarness['taskRepository'], + phases: Phase[], +): Promise<Task[]> { + const result: Task[] = []; + for (const phase of phases) { + const phaseTasks = await taskRepository.findByPhaseId(phase.id); + const execTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto'); + result.push(...execTasks); + } + return result; +} diff --git a/src/test/integration/full-flow/harness.ts b/src/test/integration/full-flow/harness.ts index 56c31c1..f214a63 100644 --- a/src/test/integration/full-flow/harness.ts +++ b/src/test/integration/full-flow/harness.ts @@ -36,6 +36,7 @@ import type { AccountRepository } from '../../../db/repositories/account-reposit import type { ChangeSetRepository } from '../../../db/repositories/change-set-repository.js'; import type { LogChunkRepository } from '../../../db/repositories/log-chunk-repository.js'; import type { ConversationRepository } from '../../../db/repositories/conversation-repository.js'; +import type { ProcessManager } from '../../../agent/process-manager.js'; 
import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js'; import { createRepositories } from '../../../container.js'; import { DefaultDispatchManager } from '../../../dispatch/manager.js'; @@ -162,6 +163,11 @@ const POLL_INTERVAL_MS = 1500; const __dirname = dirname(fileURLToPath(import.meta.url)); const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api'); +export interface FullFlowHarnessOptions { + /** Optional factory, called with the workspace root and project repository; its result is handed to the agent manager as the process-manager override. */ + processManagerFactory?: (workspaceRoot: string, projectRepo: ProjectRepository) => ProcessManager; +} + /** * Create a full-flow test harness. * @@ -177,6 +183,7 @@ const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api'); */ export async function createFullFlowHarness( initiativeName = 'Add complete() method to TodoStore', + options?: FullFlowHarnessOptions, ): Promise<FullFlowHarness> { // ── 0. Allow nested claude invocations ──────────────────────────────────── // Claude Code sets CLAUDECODE in the environment, which prevents nested @@ -219,6 +226,7 @@ export async function createFullFlowHarness( const eventBus = new CapturingEventBus(); // ── 5. Real agent manager ───────────────────────────────────────────────── + const customProcessManager = options?.processManagerFactory?.(workspaceRoot, repos.projectRepository); const agentManager = new MultiProviderAgentManager( repos.agentRepository, workspaceRoot, @@ -231,6 +239,8 @@ export async function createFullFlowHarness( repos.taskRepository, repos.pageRepository, repos.logChunkRepository, + false, // debug + customProcessManager, // processManagerOverride ); // ── 6. Dispatch manager (for execute stage) ───────────────────────────────