refactor: Remove full-flow.test.ts in favour of cassette variant

The cassette-backed test (full-flow-cassette.test.ts) covers the same
discuss→plan→detail→execute pipeline without API cost. The real-agent
test added no unique value once cassettes were committed, and the
Stage 6 npm-test validation it included was soft (warn, not fail).

Also removes the now-unused shouldRunFullFlowTests export and the
FULL_FLOW_TESTS=1 entry from CLAUDE.md.
This commit is contained in:
Lukas May
2026-03-03 10:53:41 +01:00
parent 25360e1711
commit 8c38d958ce
3 changed files with 1 additions and 288 deletions

View File

@@ -1,280 +0,0 @@
/**
* Full-Flow Integration Test
*
* Tests a complete multi-agent workflow from "create initiative" through
* discuss → plan → detail → execute, validating that:
* - discuss mode gathers requirements (handling questions if asked)
* - plan mode produces sensible phases
* - detail mode breaks phases into executable tasks
* - execute mode implements the missing complete() method
* - the todo-api test suite (`node --test src/todo.test.js`) is run after execution; a failure there is logged as a warning and does not fail this test
*
* COSTS REAL API CREDITS (~$25 per run).
* Only runs when FULL_FLOW_TESTS=1 is set.
*
* Usage:
* FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=1800000
*/
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { execSync } from 'node:child_process';
import { readdirSync, statSync } from 'node:fs';
import { join } from 'node:path';
import type { Phase, Task } from '../../../db/schema.js';
import type { AgentResult } from '../../../agent/types.js';
import { buildExecutePrompt } from '../../../agent/prompts/index.js';
import {
  createFullFlowHarness,
  shouldRunFullFlowTests,
  type FullFlowHarness,
} from './harness.js';
import {
  printHeader,
  printDiscussResult,
  printPlanResult,
  printDetailResult,
  printExecuteResult,
  printGitDiff,
  printFinalSummary,
  type ExecutedTask,
} from './report.js';
// =============================================================================
// Constants
// =============================================================================
/** Overall ceiling for the hooks and the single test case: 60 minutes, in milliseconds. */
const FULL_FLOW_TIMEOUT = 3_600_000;
/**
 * Upper bounds (in milliseconds) for each stage. Architect agents vary a lot
 * in how long they take, so these are limits rather than expected durations.
 */
const DISCUSS_TIMEOUT_MS = 480_000; // 8 minutes
const PLAN_TIMEOUT_MS = 720_000; // 12 minutes
const DETAIL_TIMEOUT_MS = 900_000; // 15 minutes, applied per phase
const EXECUTE_TIMEOUT_MS = 1_200_000; // 20 minutes, applied per task — real RED-GREEN-REFACTOR cycles take time
// =============================================================================
// Test
// =============================================================================
// End-to-end suite that drives real agents through discuss → plan → detail → execute
// against the harness fixture. The whole suite is skipped unless
// `shouldRunFullFlowTests` is truthy.
describe.skipIf(!shouldRunFullFlowTests)('full flow (real agents — costs API credits)', () => {
let harness: FullFlowHarness;
// Captured when the describe callback runs, so the elapsed time passed to
// printFinalSummary presumably also covers harness setup in beforeAll.
const startedAt = Date.now();
// Build the harness once for the suite and log the identifiers/paths it exposes.
beforeAll(async () => {
harness = await createFullFlowHarness('Add complete() method to TodoStore');
printHeader(harness.initiative.name);
console.log(` Initiative ID : ${harness.initiative.id}`);
console.log(` Project ID : ${harness.project.id}`);
console.log(` Workspace : ${harness.workspaceRoot}`);
console.log(` Fixture dir : ${harness.fixtureRoot}`);
}, FULL_FLOW_TIMEOUT);
// Guarded: `harness` stays unassigned if createFullFlowHarness threw in beforeAll.
afterAll(async () => {
if (harness) {
await harness.cleanup();
}
});
it(
'runs the complete multi-agent workflow',
async () => {
const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness;
const initiativeId = initiative.id;
// ── Stage 2: Discuss ─────────────────────────────────────────────────────
console.log('\n\n>>> Stage 2: DISCUSS <<<');
const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId });
expect(discussAgent.id).toBeTruthy();
console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`);
// The string argument is handed to driveToCompletion together with the agent id
// and a timeout; presumably it is the canned answer used if the agent asks
// questions — confirm against the harness.
const discussResult = await harness.driveToCompletion(
discussAgent.id,
'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.',
DISCUSS_TIMEOUT_MS,
);
printDiscussResult(discussAgent.id, discussResult);
// Discuss agents can complete without asking questions — success means it ran
// without crashing. A crashed discuss agent is a blocker but not fatal for
// subsequent stages (plan can still run with the initiative description alone).
// Consequently a missing or unsuccessful result only produces a warning here.
if (!discussResult?.success) {
console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage');
}
// ── Stage 3: Plan ─────────────────────────────────────────────────────────
console.log('\n\n>>> Stage 3: PLAN <<<');
const planAgent = await caller.spawnArchitectPlan({ initiativeId });
expect(planAgent.id).toBeTruthy();
console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`);
const planResult = await harness.driveToCompletion(planAgent.id, 'Keep it simple.', PLAN_TIMEOUT_MS);
// Hard requirements for this stage: a truthy result and at least one stored phase.
expect(planResult).toBeTruthy();
const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId);
expect(phases.length).toBeGreaterThan(0);
printPlanResult(phases);
// ── Stage 4: Detail (per phase) ───────────────────────────────────────────
console.log('\n\n>>> Stage 4: DETAIL <<<');
// Phases are detailed one at a time; each iteration awaits its agent before the next starts.
for (const phase of phases) {
const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id });
expect(detailAgent.id).toBeTruthy();
console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`);
const detailResult = await harness.driveToCompletion(
detailAgent.id,
'Keep it simple.',
DETAIL_TIMEOUT_MS,
);
expect(detailResult).toBeTruthy();
const phaseTasks = await taskRepository.findByPhaseId(phase.id);
// Every phase must yield at least one task with category 'execute' and type 'auto'.
const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
expect(executeTasks.length).toBeGreaterThan(0);
printDetailResult(phase, phaseTasks);
}
// ── Stage 5: Execute ──────────────────────────────────────────────────────
console.log('\n\n>>> Stage 5: EXECUTE <<<');
const allTasks = await gatherAllExecuteTasks(taskRepository, phases);
console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`);
// One agent per task, run strictly in sequence; each outcome is recorded whether
// or not it succeeded.
const executed: ExecutedTask[] = [];
for (const task of allTasks) {
console.log(` Spawning execute agent for: "${task.name}"`);
const execAgent = await agentManager.spawn({
taskId: task.id,
// Falls back to the task name when the task has no description.
prompt: buildExecutePrompt(task.description ?? task.name),
mode: 'execute',
initiativeId,
phaseId: task.phaseId ?? undefined,
inputContext: {
initiative,
task,
},
});
console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
const result = await harness.driveToCompletion(
execAgent.id,
'Use your best judgment and keep it simple.',
EXECUTE_TIMEOUT_MS,
);
executed.push({ task, result });
const icon = result?.success ? '✓' : '✗';
console.log(` ${icon} Completed with success=${result?.success ?? null}`);
if (result && !result.success) {
// Only the first 200 characters of the failure message are logged.
console.log(` Message: ${result.message?.slice(0, 200)}`);
}
}
printExecuteResult(executed);
printGitDiff(harness.workspaceRoot, harness.project.name);
// ── Stage 6: Validate ─────────────────────────────────────────────────────
console.log('\n\n>>> Stage 6: VALIDATE <<<');
// Find the last execute agent's worktree for the todo-api project
const lastExecuteAgent = executed[executed.length - 1];
const projectWorktreeDir = findProjectWorktree(
harness.workspaceRoot,
harness.project.name,
lastExecuteAgent,
);
if (projectWorktreeDir) {
// NOTE(review): the log messages say "npm test", but the command actually run
// is `node --test src/todo.test.js`.
console.log(` Running npm test in: ${projectWorktreeDir}`);
try {
execSync('node --test src/todo.test.js', {
cwd: projectWorktreeDir,
stdio: 'pipe',
});
console.log(' ✓ All tests passed');
} catch (err: unknown) {
// execSync throws when the command fails; print whatever output was captured.
const e = err as { stdout?: Buffer; stderr?: Buffer };
console.log(' ✗ Tests failed:');
if (e.stdout) console.log(e.stdout.toString());
if (e.stderr) console.log(e.stderr.toString());
// Don't hard-fail on test validation — the important check is all execute agents succeeded
console.warn(' [WARN] npm test failed in project worktree (may be expected if task ordering differs)');
}
} else {
console.warn(' [WARN] Could not find project worktree dir for npm test validation');
}
// Core assertions
// NOTE(review): unsuccessful execute tasks are only reported with a warning; the
// sole hard assertion below is that at least one execute task was attempted.
const allSucceeded = executed.every((e) => e.result?.success === true);
if (!allSucceeded) {
const failed = executed.filter((e) => !e.result?.success);
console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`);
}
expect(executed.length).toBeGreaterThan(0);
// ── Final summary ─────────────────────────────────────────────────────────
printFinalSummary(
initiative.name,
phases,
allTasks,
executed,
Date.now() - startedAt,
);
},
FULL_FLOW_TIMEOUT,
);
});
// =============================================================================
// Helpers
// =============================================================================
/**
 * Collect the tasks that the execute stage should run, preserving phase order
 * and, within a phase, the order returned by the repository.
 *
 * A task is kept only when its category is 'execute' and its type is 'auto';
 * every other task (for example planning-category tasks) is dropped.
 * Phases are queried one after another, never concurrently.
 *
 * @param taskRepository repository used to look up the tasks of each phase
 * @param phases phases to scan, in the order their tasks should appear
 * @returns the matching tasks of all phases, concatenated
 */
async function gatherAllExecuteTasks(
  taskRepository: FullFlowHarness['taskRepository'],
  phases: Phase[],
): Promise<Task[]> {
  const collected: Task[] = [];
  for (const phase of phases) {
    const tasksInPhase = await taskRepository.findByPhaseId(phase.id);
    for (const candidate of tasksInPhase) {
      const isAutoExecute = candidate.category === 'execute' && candidate.type === 'auto';
      if (isAutoExecute) {
        collected.push(candidate);
      }
    }
  }
  return collected;
}
/**
 * Locate a checkout of the project inside the agent worktrees.
 * Worktrees live at: <workspaceRoot>/agent-workdirs/<alias>/<projectName>/
 *
 * `lastExecuted` acts purely as a guard: when no task was executed there is
 * nothing to validate and the function returns null without touching the
 * filesystem. The search itself is not tied to that task's agent — aliases
 * are scanned in reverse name order and the first alias that contains a
 * `<projectName>` directory wins.
 *
 * The lookup uses the filesystem API instead of shelling out to `ls` and
 * `test -d`, so paths containing shell metacharacters (`$`, backticks,
 * quotes) are treated literally and no POSIX shell is required.
 *
 * @param workspaceRoot root directory that contains `agent-workdirs`
 * @param projectName name of the project directory expected inside a worktree
 * @param lastExecuted last executed task, or undefined when none ran
 * @returns the path of the first matching project directory, or null when
 *          there is no executed task, no worktree directory, or no match
 */
function findProjectWorktree(
  workspaceRoot: string,
  projectName: string,
  lastExecuted: ExecutedTask | undefined,
): string | null {
  if (!lastExecuted) return null;

  const worktreesBase = join(workspaceRoot, 'agent-workdirs');

  let entries: string[];
  try {
    entries = readdirSync(worktreesBase);
  } catch {
    // Base directory is missing or unreadable: no worktrees to inspect.
    return null;
  }

  // Mirror a plain `ls` listing: hidden entries are skipped and names are
  // sorted before the order is reversed.
  const aliases = entries
    .filter((name) => !name.startsWith('.'))
    .sort()
    .reverse();

  for (const alias of aliases) {
    const candidate = join(worktreesBase, alias, projectName);
    try {
      if (statSync(candidate).isDirectory()) {
        return candidate;
      }
    } catch {
      // Candidate does not exist (or the alias is not a directory); try the next one.
    }
  }
  return null;
}

View File

@@ -11,7 +11,7 @@
* - A self-contained fixture git repo (todo-api) for agents to work on
* - Helpers for driving agents through question/answer loops
*
* COSTS REAL API CREDITS. Controlled by FULL_FLOW_TESTS=1.
* Used by full-flow-cassette.test.ts (replay) and for manual recording runs.
*/
import { mkdtemp, rm, cp } from 'node:fs/promises';
@@ -397,9 +397,3 @@ export async function createFullFlowHarness(
return harness;
}
// =============================================================================
// Guard
// =============================================================================
/**
 * Opt-in switch for the full-flow tests: true only when the FULL_FLOW_TESTS
 * environment variable is exactly the string "1" at module load time.
 */
export const shouldRunFullFlowTests = '1' === process.env['FULL_FLOW_TESTS'];