diff --git a/CLAUDE.md b/CLAUDE.md index 87e8b6e..5c58cc9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -47,7 +47,6 @@ Run after any change to server-side code (`src/**`). npm test # Unit + E2E tests (no API cost) CW_CASSETTE_RECORD=1 npm test -- # Record new cassettes locally REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50) -FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000 # Full end-to-end test (~$2-5) # Record full-flow cassettes (one-time, costs ~$2–5 in API credits): CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000 diff --git a/src/test/integration/full-flow/full-flow.test.ts b/src/test/integration/full-flow/full-flow.test.ts deleted file mode 100644 index 006fd36..0000000 --- a/src/test/integration/full-flow/full-flow.test.ts +++ /dev/null @@ -1,280 +0,0 @@ -/** - * Full-Flow Integration Test - * - * Tests a complete multi-agent workflow from "create initiative" through - * discuss → plan → detail → execute, validating that: - * - discuss mode gathers requirements (handling questions if asked) - * - plan mode produces sensible phases - * - detail mode breaks phases into executable tasks - * - execute mode implements the missing complete() method - * - npm test passes in the todo-api project after execution - * - * COSTS REAL API CREDITS (~$2–5 per run). - * Only runs when FULL_FLOW_TESTS=1 is set. 
- * - * Usage: - * FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=1800000 - */ - -import { describe, it, expect, beforeAll, afterAll } from 'vitest'; -import { join } from 'node:path'; -import { execSync } from 'node:child_process'; -import type { Phase, Task } from '../../../db/schema.js'; -import type { AgentResult } from '../../../agent/types.js'; -import { buildExecutePrompt } from '../../../agent/prompts/index.js'; -import { - createFullFlowHarness, - shouldRunFullFlowTests, - type FullFlowHarness, -} from './harness.js'; -import { - printHeader, - printDiscussResult, - printPlanResult, - printDetailResult, - printExecuteResult, - printGitDiff, - printFinalSummary, - type ExecutedTask, -} from './report.js'; - -// ============================================================================= -// Constants -// ============================================================================= - -/** Total test timeout: 60 minutes */ -const FULL_FLOW_TIMEOUT = 60 * 60 * 1000; - -/** Per-stage timeouts — architect agents are variable; these are upper bounds not expectations */ -const DISCUSS_TIMEOUT_MS = 8 * 60_000; -const PLAN_TIMEOUT_MS = 12 * 60_000; -const DETAIL_TIMEOUT_MS = 15 * 60_000; // per phase -const EXECUTE_TIMEOUT_MS = 20 * 60_000; // per task — real RED-GREEN-REFACTOR cycles take time - -// ============================================================================= -// Test -// ============================================================================= - -describe.skipIf(!shouldRunFullFlowTests)('full flow (real agents — costs API credits)', () => { - let harness: FullFlowHarness; - const startedAt = Date.now(); - - beforeAll(async () => { - harness = await createFullFlowHarness('Add complete() method to TodoStore'); - printHeader(harness.initiative.name); - console.log(` Initiative ID : ${harness.initiative.id}`); - console.log(` Project ID : ${harness.project.id}`); - console.log(` Workspace : 
${harness.workspaceRoot}`); - console.log(` Fixture dir : ${harness.fixtureRoot}`); - }, FULL_FLOW_TIMEOUT); - - afterAll(async () => { - if (harness) { - await harness.cleanup(); - } - }); - - it( - 'runs the complete multi-agent workflow', - async () => { - const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness; - const initiativeId = initiative.id; - - // ── Stage 2: Discuss ───────────────────────────────────────────────────── - console.log('\n\n>>> Stage 2: DISCUSS <<<'); - const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId }); - expect(discussAgent.id).toBeTruthy(); - console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`); - - const discussResult = await harness.driveToCompletion( - discussAgent.id, - 'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.', - DISCUSS_TIMEOUT_MS, - ); - printDiscussResult(discussAgent.id, discussResult); - - // Discuss agents can complete without asking questions — success means it ran - // without crashing. A crashed discuss agent is a blocker but not fatal for - // subsequent stages (plan can still run with the initiative description alone). 
- if (!discussResult?.success) { - console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage'); - } - - // ── Stage 3: Plan ───────────────────────────────────────────────────────── - console.log('\n\n>>> Stage 3: PLAN <<<'); - const planAgent = await caller.spawnArchitectPlan({ initiativeId }); - expect(planAgent.id).toBeTruthy(); - console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`); - - const planResult = await harness.driveToCompletion(planAgent.id, 'Keep it simple.', PLAN_TIMEOUT_MS); - expect(planResult).toBeTruthy(); - - const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId); - expect(phases.length).toBeGreaterThan(0); - printPlanResult(phases); - - // ── Stage 4: Detail (per phase) ─────────────────────────────────────────── - console.log('\n\n>>> Stage 4: DETAIL <<<'); - for (const phase of phases) { - const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id }); - expect(detailAgent.id).toBeTruthy(); - console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`); - - const detailResult = await harness.driveToCompletion( - detailAgent.id, - 'Keep it simple.', - DETAIL_TIMEOUT_MS, - ); - expect(detailResult).toBeTruthy(); - - const phaseTasks = await taskRepository.findByPhaseId(phase.id); - const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto'); - expect(executeTasks.length).toBeGreaterThan(0); - printDetailResult(phase, phaseTasks); - } - - // ── Stage 5: Execute ────────────────────────────────────────────────────── - console.log('\n\n>>> Stage 5: EXECUTE <<<'); - const allTasks = await gatherAllExecuteTasks(taskRepository, phases); - console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`); - - const executed: ExecutedTask[] = []; - for (const task of allTasks) { - console.log(` Spawning execute agent for: "${task.name}"`); - const execAgent = await agentManager.spawn({ - taskId: 
task.id, - prompt: buildExecutePrompt(task.description ?? task.name), - mode: 'execute', - initiativeId, - phaseId: task.phaseId ?? undefined, - inputContext: { - initiative, - task, - }, - }); - console.log(` Agent: ${execAgent.name} (${execAgent.id})`); - - const result = await harness.driveToCompletion( - execAgent.id, - 'Use your best judgment and keep it simple.', - EXECUTE_TIMEOUT_MS, - ); - executed.push({ task, result }); - - const icon = result?.success ? '✓' : '✗'; - console.log(` ${icon} Completed with success=${result?.success ?? null}`); - if (result && !result.success) { - console.log(` Message: ${result.message?.slice(0, 200)}`); - } - } - - printExecuteResult(executed); - printGitDiff(harness.workspaceRoot, harness.project.name); - - // ── Stage 6: Validate ───────────────────────────────────────────────────── - console.log('\n\n>>> Stage 6: VALIDATE <<<'); - - // Find the last execute agent's worktree for the todo-api project - const lastExecuteAgent = executed[executed.length - 1]; - const projectWorktreeDir = findProjectWorktree( - harness.workspaceRoot, - harness.project.name, - lastExecuteAgent, - ); - - if (projectWorktreeDir) { - console.log(` Running npm test in: ${projectWorktreeDir}`); - try { - execSync('node --test src/todo.test.js', { - cwd: projectWorktreeDir, - stdio: 'pipe', - }); - console.log(' ✓ All tests passed'); - } catch (err: unknown) { - const e = err as { stdout?: Buffer; stderr?: Buffer }; - console.log(' ✗ Tests failed:'); - if (e.stdout) console.log(e.stdout.toString()); - if (e.stderr) console.log(e.stderr.toString()); - // Don't hard-fail on test validation — the important check is all execute agents succeeded - console.warn(' [WARN] npm test failed in project worktree (may be expected if task ordering differs)'); - } - } else { - console.warn(' [WARN] Could not find project worktree dir for npm test validation'); - } - - // Core assertions - const allSucceeded = executed.every((e) => e.result?.success === true); - if 
(!allSucceeded) { - const failed = executed.filter((e) => !e.result?.success); - console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`); - } - expect(executed.length).toBeGreaterThan(0); - - // ── Final summary ───────────────────────────────────────────────────────── - printFinalSummary( - initiative.name, - phases, - allTasks, - executed, - Date.now() - startedAt, - ); - }, - FULL_FLOW_TIMEOUT, - ); -}); - -// ============================================================================= -// Helpers -// ============================================================================= - -/** - * Gather all auto execute tasks across all phases, in order. - * Excludes planning tasks (discuss, plan, detail, refine, research). - */ -async function gatherAllExecuteTasks( - taskRepository: FullFlowHarness['taskRepository'], - phases: Phase[], -): Promise<Task[]> { - const result: Task[] = []; - for (const phase of phases) { - const phaseTasks = await taskRepository.findByPhaseId(phase.id); - const execTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto'); - result.push(...execTasks); - } - return result; -} - -/** - * Find the project worktree directory for the last executed task. 
- * Worktrees live at: <workspaceRoot>/agent-workdirs/<agentDir>/<projectName>/ - */ -function findProjectWorktree( - workspaceRoot: string, - projectName: string, - lastExecuted: ExecutedTask | undefined, -): string | null { - if (!lastExecuted) return null; - - try { - const worktreesBase = join(workspaceRoot, 'agent-workdirs'); - const dirs = execSync(`ls "${worktreesBase}" 2>/dev/null || true`, { encoding: 'utf8' }) - .trim() - .split('\n') - .filter(Boolean); - - // Try all agent worktrees and return the first one with a project subdirectory - for (const dir of dirs.reverse()) { - const candidate = join(worktreesBase, dir, projectName); - try { - execSync(`test -d "${candidate}"`, { stdio: 'ignore' }); - return candidate; - } catch { - // Not found in this worktree - } - } - } catch { - // ls failed or no worktrees yet - } - - return null; -} diff --git a/src/test/integration/full-flow/harness.ts b/src/test/integration/full-flow/harness.ts index f214a63..869bfab 100644 --- a/src/test/integration/full-flow/harness.ts +++ b/src/test/integration/full-flow/harness.ts @@ -11,7 +11,7 @@ * - A self-contained fixture git repo (todo-api) for agents to work on * - Helpers for driving agents through question/answer loops * - * COSTS REAL API CREDITS. Controlled by FULL_FLOW_TESTS=1. + * Used by full-flow-cassette.test.ts (replay) and for manual recording runs. */ import { mkdtemp, rm, cp } from 'node:fs/promises'; @@ -397,9 +397,3 @@ export async function createFullFlowHarness( return harness; } - -// ============================================================================= -// Guard -// ============================================================================= - -export const shouldRunFullFlowTests = process.env.FULL_FLOW_TESTS === '1';