refactor: Remove full-flow.test.ts in favour of cassette variant
The cassette-backed test (full-flow-cassette.test.ts) covers the same discuss→plan→detail→execute pipeline without API cost. The real-agent test added no unique value once cassettes were committed, and the Stage 6 npm-test validation it included was soft (warn, not fail). Also removes the now-unused shouldRunFullFlowTests export and the FULL_FLOW_TESTS=1 entry from CLAUDE.md.
This commit is contained in:
@@ -47,7 +47,6 @@ Run after any change to server-side code (`src/**`).
|
|||||||
npm test # Unit + E2E tests (no API cost)
|
npm test # Unit + E2E tests (no API cost)
|
||||||
CW_CASSETTE_RECORD=1 npm test -- <test-file> # Record new cassettes locally
|
CW_CASSETTE_RECORD=1 npm test -- <test-file> # Record new cassettes locally
|
||||||
REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50)
|
REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50)
|
||||||
FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000 # Full end-to-end test (~$2-5)
|
|
||||||
|
|
||||||
# Record full-flow cassettes (one-time, costs ~$2–5 in API credits):
|
# Record full-flow cassettes (one-time, costs ~$2–5 in API credits):
|
||||||
CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
|
CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
|
||||||
|
|||||||
@@ -1,280 +0,0 @@
|
|||||||
/**
|
|
||||||
* Full-Flow Integration Test
|
|
||||||
*
|
|
||||||
* Tests a complete multi-agent workflow from "create initiative" through
|
|
||||||
* discuss → plan → detail → execute, validating that:
|
|
||||||
* - discuss mode gathers requirements (handling questions if asked)
|
|
||||||
* - plan mode produces sensible phases
|
|
||||||
* - detail mode breaks phases into executable tasks
|
|
||||||
* - execute mode implements the missing complete() method
|
|
||||||
* - npm test passes in the todo-api project after execution
|
|
||||||
*
|
|
||||||
* COSTS REAL API CREDITS (~$2–5 per run).
|
|
||||||
* Only runs when FULL_FLOW_TESTS=1 is set.
|
|
||||||
*
|
|
||||||
* Usage:
|
|
||||||
* FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000
|
|
||||||
*/
|
|
||||||
|
|
||||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { execSync } from 'node:child_process';
import { readdirSync, statSync } from 'node:fs';
import { join } from 'node:path';
import type { Phase, Task } from '../../../db/schema.js';
import type { AgentResult } from '../../../agent/types.js';
import { buildExecutePrompt } from '../../../agent/prompts/index.js';
import {
  createFullFlowHarness,
  shouldRunFullFlowTests,
  type FullFlowHarness,
} from './harness.js';
import {
  printHeader,
  printDiscussResult,
  printPlanResult,
  printDetailResult,
  printExecuteResult,
  printGitDiff,
  printFinalSummary,
  type ExecutedTask,
} from './report.js';
|
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// Constants
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
/** Ceiling for the whole suite, in milliseconds (60 minutes). */
const FULL_FLOW_TIMEOUT = 60 * 60_000;

/**
 * Per-stage ceilings, in milliseconds. Architect agents vary a lot in run time,
 * so these are upper bounds rather than expected durations.
 */
const DISCUSS_TIMEOUT_MS = 60_000 * 8;
const PLAN_TIMEOUT_MS = 60_000 * 12;
/** Applied to each phase separately. */
const DETAIL_TIMEOUT_MS = 60_000 * 15;
/** Applied to each task separately — real RED-GREEN-REFACTOR cycles take time. */
const EXECUTE_TIMEOUT_MS = 60_000 * 20;
|
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// Test
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
// End-to-end suite that drives one initiative through discuss → plan → detail → execute.
// vitest skips the whole suite unless shouldRunFullFlowTests is truthy.
describe.skipIf(!shouldRunFullFlowTests)('full flow (real agents — costs API credits)', () => {
|
|
||||||
let harness: FullFlowHarness;
|
|
||||||
// Captured when the suite body is evaluated; used for the elapsed time passed to printFinalSummary.
const startedAt = Date.now();
|
|
||||||
|
|
||||||
// Build the harness once for the suite and print its identifying details.
beforeAll(async () => {
|
|
||||||
harness = await createFullFlowHarness('Add complete() method to TodoStore');
|
|
||||||
printHeader(harness.initiative.name);
|
|
||||||
console.log(` Initiative ID : ${harness.initiative.id}`);
|
|
||||||
console.log(` Project ID : ${harness.project.id}`);
|
|
||||||
console.log(` Workspace : ${harness.workspaceRoot}`);
|
|
||||||
console.log(` Fixture dir : ${harness.fixtureRoot}`);
|
|
||||||
}, FULL_FLOW_TIMEOUT);
|
|
||||||
|
|
||||||
// harness is still unassigned if createFullFlowHarness rejected, hence the guard.
afterAll(async () => {
|
|
||||||
if (harness) {
|
|
||||||
await harness.cleanup();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
it(
|
|
||||||
'runs the complete multi-agent workflow',
|
|
||||||
async () => {
|
|
||||||
const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness;
|
|
||||||
const initiativeId = initiative.id;
|
|
||||||
|
|
||||||
// ── Stage 2: Discuss ─────────────────────────────────────────────────────
|
|
||||||
console.log('\n\n>>> Stage 2: DISCUSS <<<');
|
|
||||||
const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId });
|
|
||||||
expect(discussAgent.id).toBeTruthy();
|
|
||||||
console.log(` Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`);
|
|
||||||
|
|
||||||
// The string argument is handed to driveToCompletion together with the stage timeout;
// NOTE(review): presumably it is the canned answer given when the agent asks a question — confirm in harness.
const discussResult = await harness.driveToCompletion(
|
|
||||||
discussAgent.id,
|
|
||||||
'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.',
|
|
||||||
DISCUSS_TIMEOUT_MS,
|
|
||||||
);
|
|
||||||
printDiscussResult(discussAgent.id, discussResult);
|
|
||||||
|
|
||||||
// Discuss agents can complete without asking questions — success means it ran
|
|
||||||
// without crashing. A crashed discuss agent is a blocker but not fatal for
|
|
||||||
// subsequent stages (plan can still run with the initiative description alone).
|
|
||||||
if (!discussResult?.success) {
|
|
||||||
console.warn(' [WARN] discuss agent did not succeed; continuing to plan stage');
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Stage 3: Plan ─────────────────────────────────────────────────────────
|
|
||||||
console.log('\n\n>>> Stage 3: PLAN <<<');
|
|
||||||
const planAgent = await caller.spawnArchitectPlan({ initiativeId });
|
|
||||||
expect(planAgent.id).toBeTruthy();
|
|
||||||
console.log(` Spawned plan agent: ${planAgent.name} (${planAgent.id})`);
|
|
||||||
|
|
||||||
// Unlike discuss, a falsy plan result or an initiative with zero phases fails the test.
const planResult = await harness.driveToCompletion(planAgent.id, 'Keep it simple.', PLAN_TIMEOUT_MS);
|
|
||||||
expect(planResult).toBeTruthy();
|
|
||||||
|
|
||||||
const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId);
|
|
||||||
expect(phases.length).toBeGreaterThan(0);
|
|
||||||
printPlanResult(phases);
|
|
||||||
|
|
||||||
// ── Stage 4: Detail (per phase) ───────────────────────────────────────────
|
|
||||||
console.log('\n\n>>> Stage 4: DETAIL <<<');
|
|
||||||
// One detail agent per phase, run sequentially; each gets its own DETAIL_TIMEOUT_MS budget.
for (const phase of phases) {
|
|
||||||
const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id });
|
|
||||||
expect(detailAgent.id).toBeTruthy();
|
|
||||||
console.log(` Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`);
|
|
||||||
|
|
||||||
const detailResult = await harness.driveToCompletion(
|
|
||||||
detailAgent.id,
|
|
||||||
'Keep it simple.',
|
|
||||||
DETAIL_TIMEOUT_MS,
|
|
||||||
);
|
|
||||||
expect(detailResult).toBeTruthy();
|
|
||||||
|
|
||||||
// Every phase must yield at least one task with category 'execute' and type 'auto'.
const phaseTasks = await taskRepository.findByPhaseId(phase.id);
|
|
||||||
const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
|
|
||||||
expect(executeTasks.length).toBeGreaterThan(0);
|
|
||||||
printDetailResult(phase, phaseTasks);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Stage 5: Execute ──────────────────────────────────────────────────────
|
|
||||||
console.log('\n\n>>> Stage 5: EXECUTE <<<');
|
|
||||||
const allTasks = await gatherAllExecuteTasks(taskRepository, phases);
|
|
||||||
console.log(` Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`);
|
|
||||||
|
|
||||||
// Tasks run one at a time, in the order gatherAllExecuteTasks returned them.
// Each outcome is recorded in `executed` whether or not the agent succeeded.
const executed: ExecutedTask[] = [];
|
|
||||||
for (const task of allTasks) {
|
|
||||||
console.log(` Spawning execute agent for: "${task.name}"`);
|
|
||||||
const execAgent = await agentManager.spawn({
|
|
||||||
taskId: task.id,
|
|
||||||
// Falls back to the task name when the task has no description.
prompt: buildExecutePrompt(task.description ?? task.name),
|
|
||||||
mode: 'execute',
|
|
||||||
initiativeId,
|
|
||||||
phaseId: task.phaseId ?? undefined,
|
|
||||||
inputContext: {
|
|
||||||
initiative,
|
|
||||||
task,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
console.log(` Agent: ${execAgent.name} (${execAgent.id})`);
|
|
||||||
|
|
||||||
const result = await harness.driveToCompletion(
|
|
||||||
execAgent.id,
|
|
||||||
'Use your best judgment and keep it simple.',
|
|
||||||
EXECUTE_TIMEOUT_MS,
|
|
||||||
);
|
|
||||||
executed.push({ task, result });
|
|
||||||
|
|
||||||
const icon = result?.success ? '✓' : '✗';
|
|
||||||
console.log(` ${icon} Completed with success=${result?.success ?? null}`);
|
|
||||||
if (result && !result.success) {
|
|
||||||
// Only the first 200 characters of the failure message are logged.
console.log(` Message: ${result.message?.slice(0, 200)}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
printExecuteResult(executed);
|
|
||||||
printGitDiff(harness.workspaceRoot, harness.project.name);
|
|
||||||
|
|
||||||
// ── Stage 6: Validate ─────────────────────────────────────────────────────
|
|
||||||
console.log('\n\n>>> Stage 6: VALIDATE <<<');
|
|
||||||
|
|
||||||
// Find the last execute agent's worktree for the todo-api project
|
|
||||||
const lastExecuteAgent = executed[executed.length - 1];
|
|
||||||
const projectWorktreeDir = findProjectWorktree(
|
|
||||||
harness.workspaceRoot,
|
|
||||||
harness.project.name,
|
|
||||||
lastExecuteAgent,
|
|
||||||
);
|
|
||||||
|
|
||||||
// Validation is soft: a failing or missing test run is logged as a warning and never fails the test.
if (projectWorktreeDir) {
|
|
||||||
console.log(` Running npm test in: ${projectWorktreeDir}`);
|
|
||||||
try {
|
|
||||||
// The command actually run is `node --test src/todo.test.js`, executed inside the worktree;
// stdio is piped so output is only printed from the catch block on failure.
execSync('node --test src/todo.test.js', {
|
|
||||||
cwd: projectWorktreeDir,
|
|
||||||
stdio: 'pipe',
|
|
||||||
});
|
|
||||||
console.log(' ✓ All tests passed');
|
|
||||||
} catch (err: unknown) {
|
|
||||||
const e = err as { stdout?: Buffer; stderr?: Buffer };
|
|
||||||
console.log(' ✗ Tests failed:');
|
|
||||||
if (e.stdout) console.log(e.stdout.toString());
|
|
||||||
if (e.stderr) console.log(e.stderr.toString());
|
|
||||||
// Don't hard-fail on test validation — the important check is all execute agents succeeded
|
|
||||||
console.warn(' [WARN] npm test failed in project worktree (may be expected if task ordering differs)');
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.warn(' [WARN] Could not find project worktree dir for npm test validation');
|
|
||||||
}
|
|
||||||
|
|
||||||
// Core assertions
|
|
||||||
// Failed execute tasks only produce a warning here; the single hard assertion
// is that at least one execute task was run.
const allSucceeded = executed.every((e) => e.result?.success === true);
|
|
||||||
if (!allSucceeded) {
|
|
||||||
const failed = executed.filter((e) => !e.result?.success);
|
|
||||||
console.warn(` [WARN] ${failed.length} execute task(s) did not succeed`);
|
|
||||||
}
|
|
||||||
expect(executed.length).toBeGreaterThan(0);
|
|
||||||
|
|
||||||
// ── Final summary ─────────────────────────────────────────────────────────
|
|
||||||
printFinalSummary(
|
|
||||||
initiative.name,
|
|
||||||
phases,
|
|
||||||
allTasks,
|
|
||||||
executed,
|
|
||||||
Date.now() - startedAt,
|
|
||||||
);
|
|
||||||
},
|
|
||||||
FULL_FLOW_TIMEOUT,
|
|
||||||
);
|
|
||||||
});
|
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// Helpers
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
/**
 * Collect the automatically executable tasks of every phase.
 *
 * Phases are visited in the order given and queried one after another, so the
 * result keeps phase order first and repository order within each phase. Only
 * tasks with category `'execute'` and type `'auto'` are kept; everything else
 * (planning-style tasks such as discuss, plan, detail, refine, research) is dropped.
 *
 * @param taskRepository - Repository used to load the tasks of a phase.
 * @param phases - Phases whose tasks should be collected.
 * @returns The matching tasks, flattened into a single list.
 */
async function gatherAllExecuteTasks(
  taskRepository: FullFlowHarness['taskRepository'],
  phases: Phase[],
): Promise<Task[]> {
  const collected: Task[] = [];
  for (const { id: phaseId } of phases) {
    const candidates = await taskRepository.findByPhaseId(phaseId);
    for (const candidate of candidates) {
      if (candidate.category !== 'execute' || candidate.type !== 'auto') continue;
      collected.push(candidate);
    }
  }
  return collected;
}
|
|
||||||
|
|
||||||
/**
 * Locate a checkout of the project inside the agent worktrees.
 *
 * Worktrees live at: <workspaceRoot>/agent-workdirs/<alias>/<projectName>/
 *
 * `lastExecuted` only gates the search: when no task was executed there is
 * nothing to validate and `null` is returned immediately. Otherwise the agent
 * directories are scanned in reverse name order and the first one containing a
 * `<projectName>` directory wins. The result is therefore not guaranteed to be
 * the worktree of the last executed task.
 *
 * The lookup uses the filesystem API directly rather than shelling out, so
 * paths containing quotes, `$` or spaces are handled and no POSIX shell is
 * required.
 *
 * @param workspaceRoot - Directory that contains `agent-workdirs`.
 * @param projectName - Name of the project sub-directory to look for.
 * @param lastExecuted - Last executed task, or `undefined` when none ran.
 * @returns Path of the first matching project directory, or `null` if none exists.
 */
function findProjectWorktree(
  workspaceRoot: string,
  projectName: string,
  lastExecuted: ExecutedTask | undefined,
): string | null {
  if (!lastExecuted) return null;

  const worktreesBase = join(workspaceRoot, 'agent-workdirs');

  let entries: string[];
  try {
    entries = readdirSync(worktreesBase);
  } catch {
    // Base directory is missing or unreadable: no worktrees yet.
    return null;
  }

  // Hidden entries are skipped and names are sorted before reversing, which
  // keeps the scan order of the previous `ls`-based implementation.
  const aliases = entries
    .filter((name) => !name.startsWith('.'))
    .sort()
    .reverse();

  for (const alias of aliases) {
    const candidate = join(worktreesBase, alias, projectName);
    try {
      if (statSync(candidate).isDirectory()) return candidate;
    } catch {
      // Not found in this worktree
    }
  }

  return null;
}
|
|
||||||
@@ -11,7 +11,7 @@
|
|||||||
* - A self-contained fixture git repo (todo-api) for agents to work on
|
* - A self-contained fixture git repo (todo-api) for agents to work on
|
||||||
* - Helpers for driving agents through question/answer loops
|
* - Helpers for driving agents through question/answer loops
|
||||||
*
|
*
|
||||||
* COSTS REAL API CREDITS. Controlled by FULL_FLOW_TESTS=1.
|
* Used by full-flow-cassette.test.ts (replay) and for manual recording runs.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { mkdtemp, rm, cp } from 'node:fs/promises';
|
import { mkdtemp, rm, cp } from 'node:fs/promises';
|
||||||
@@ -397,9 +397,3 @@ export async function createFullFlowHarness(
|
|||||||
|
|
||||||
return harness;
|
return harness;
|
||||||
}
|
}
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// Guard
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
export const shouldRunFullFlowTests = process.env.FULL_FLOW_TESTS === '1';
|
|
||||||
|
|||||||
Reference in New Issue
Block a user