feat: Add cassette support for full-flow integration test

- normalizer.ts: Add NANOID_RE (21 chars from A-Za-z0-9, "_" and "-") → __ID__ as step 2.5, fixing cassette key instability caused by nanoid agent IDs in prompts
- harness.ts: Add FullFlowHarnessOptions.processManagerFactory for injecting CassetteProcessManager without duplicating harness setup
- full-flow-cassette.test.ts: New cassette-backed variant of the full-flow test; skips automatically when no cassettes exist (fresh clone), runs in seconds once cassettes are recorded and committed
- CLAUDE.md: Document the cassette recording command for the full-flow test
2026-03-02 17:42:43 +09:00
parent 89db580ca4
commit 41b1d0e986
5 changed files with 263 additions and 0 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -48,6 +48,11 @@ npm test
 CW_CASSETTE_RECORD=1 npm test -- <test-file>                                    # Record new cassettes locally
 REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000  # Real provider tests (~$0.50)
 FULL_FLOW_TESTS=1 npm test -- src/test/integration/full-flow/ --test-timeout=3600000        # Full end-to-end test (~$2-5)
+
+# Record full-flow cassettes (one-time, costs ~$2–5 in API credits):
+CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
+# Commit the generated src/test/cassettes/<hash>.json files afterward.
+# Subsequent runs replay from cassettes at no cost: npm test
 ```

 See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs.
--- a/src/test/cassette/cassette.test.ts
+++ b/src/test/cassette/cassette.test.ts
@@ -56,6 +56,14 @@ describe('normalizePrompt', () => {
    expect(result).toBe(prompt);
  });

+  it('strips nanoid strings (21-char alphanumeric)', () => {
+    // The sample ID contains '_' and '-' as well as letters and digits.
+    const sampleId = 'V1StGXR8_Z5jdHi6B-myT';
+    const normalized = normalizePrompt(`Agent worktree: /tmp/cw-preview-${sampleId}/app`, '');
+
+    // The raw ID must be gone and the placeholder must be present.
+    expect(normalized).not.toContain(sampleId);
+    expect(normalized).toContain('__ID__');
+  });
+
  it('strips workspace root before UUID replacement to avoid double-normalizing', () => {
    const workspaceRoot = '/tmp/cw-test-abc123';
    const uuid = '550e8400-e29b-41d4-a716-446655440000';
--- a/src/test/cassette/normalizer.ts
+++ b/src/test/cassette/normalizer.ts
@@ -8,6 +8,7 @@
 */

 const UUID_RE = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi;
+// Nanoid-style ID: exactly 21 characters from [A-Za-z0-9_-]. The lookbehind rejects only a
+// preceding letter or digit, so an ID that directly follows "-" or "_" (for example
+// "cw-preview-<id>") is still matched; the lookahead rejects any following ID character.
+const NANOID_RE = /(?<![A-Za-z0-9])[A-Za-z0-9_-]{21}(?![A-Za-z0-9_-])/g;
 const ISO_TIMESTAMP_RE = /\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})?/g;
 const UNIX_EPOCH_MS_RE = /\b1[0-9]{12}\b/g;
 const SESSION_NUM_RE = /\bsession[_\s-]?\d+\b/gi;
@@ -18,6 +19,7 @@ const SESSION_NUM_RE = /\bsession[_\s-]?\d+\b/gi;
 * Replacements applied in order (most-specific first to avoid partial matches):
 * 1. Absolute workspace root path → __WORKSPACE__
 * 2. UUIDs → __UUID__
+ * 2.5. Nanoid IDs (21 chars from A-Za-z0-9, "_" and "-") → __ID__
 * 3. ISO 8601 timestamps → __TIMESTAMP__
 * 4. Unix epoch milliseconds → __EPOCH__
 * 5. Session numbers → session__N__
@@ -30,6 +32,7 @@ export function normalizePrompt(prompt: string, workspaceRoot: string): string {
  }

  normalized = normalized.replace(UUID_RE, '__UUID__');
+  normalized = normalized.replace(NANOID_RE, '__ID__');
  normalized = normalized.replace(ISO_TIMESTAMP_RE, '__TIMESTAMP__');
  normalized = normalized.replace(UNIX_EPOCH_MS_RE, '__EPOCH__');
  normalized = normalized.replace(SESSION_NUM_RE, 'session__N__');
--- a/src/test/integration/full-flow/full-flow-cassette.test.ts
+++ b/src/test/integration/full-flow/full-flow-cassette.test.ts
@@ -0,0 +1,237 @@
+/**
+ * Full-Flow Cassette Integration Test
+ *
+ * Cassette-backed variant of the full multi-agent workflow test.
+ * Runs the same discuss → plan → detail → execute pipeline but intercepts
+ * subprocess spawning with CassetteProcessManager — no real API calls in CI.
+ *
+ * Recording (one-time, costs ~$2–5):
+ *   CW_CASSETTE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
+ *   # Commit the generated src/test/cassettes/<hash>.json files afterward
+ *
+ * Replay (default — runs in seconds):
+ *   npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts
+ *
+ * Force re-record (overwrites existing cassettes):
+ *   CW_CASSETTE_FORCE_RECORD=1 npm test -- src/test/integration/full-flow/full-flow-cassette.test.ts --test-timeout=3600000
+ */
+
+import { describe, it, expect, beforeAll, afterAll } from 'vitest';
+import { existsSync, readdirSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import type { Phase, Task } from '../../../db/schema.js';
+import type { AgentResult } from '../../../agent/types.js';
+import { buildExecutePrompt } from '../../../agent/prompts/index.js';
+import { CassetteStore } from '../../cassette/store.js';
+import { CassetteProcessManager, type CassetteMode } from '../../cassette/process-manager.js';
+import {
+  createFullFlowHarness,
+  type FullFlowHarness,
+} from './harness.js';
+import {
+  printHeader,
+  printDiscussResult,
+  printPlanResult,
+  printDetailResult,
+  printExecuteResult,
+  printFinalSummary,
+  type ExecutedTask,
+} from './report.js';
+
+// =============================================================================
+// Constants
+// =============================================================================
+
+/**
+ * Timeout used for the beforeAll hook, for each driveToCompletion stage, and for the test itself.
+ *
+ * Replay finishes in seconds, so 5 minutes is ample. Recording runs are documented with
+ * --test-timeout=3600000, but a timeout passed explicitly to it()/beforeAll() takes precedence
+ * over that CLI flag, so recording modes need the 60-minute budget set here instead.
+ *
+ * cassetteMode() is a hoisted function declaration that only reads process.env, so calling it
+ * before its position in the file is safe.
+ */
+const CASSETTE_FLOW_TIMEOUT = cassetteMode() === 'replay' ? 5 * 60_000 : 60 * 60_000;
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const CASSETTE_DIR =
+  process.env.CW_CASSETTE_DIR ?? join(__dirname, '../../cassettes');
+
+// =============================================================================
+// Mode helper
+// =============================================================================
+
+/**
+ * Derive the cassette mode from the environment.
+ *
+ * CW_CASSETTE_FORCE_RECORD=1 selects 'record' and takes precedence over
+ * CW_CASSETTE_RECORD=1, which selects 'auto'. Anything else means 'replay'.
+ */
+function cassetteMode(): CassetteMode {
+  const { CW_CASSETTE_FORCE_RECORD: forceRecord, CW_CASSETTE_RECORD: record } = process.env;
+  if (forceRecord === '1') return 'record';
+  return record === '1' ? 'auto' : 'replay';
+}
+
+/**
+ * Decide whether the suite should run.
+ *
+ * Returns true for any non-replay (recording) mode. In replay mode it returns
+ * true only when CASSETTE_DIR exists and holds at least one file ending in
+ * `.json`; any such file counts, whether or not this test recorded it. This
+ * keeps `npm test` from failing on a fresh clone before cassettes are committed.
+ */
+function cassettesAvailable(): boolean {
+  // Recording runs always proceed.
+  if (cassetteMode() !== 'replay') return true;
+  if (!existsSync(CASSETTE_DIR)) return false;
+
+  for (const entry of readdirSync(CASSETTE_DIR)) {
+    if (entry.endsWith('.json')) return true;
+  }
+  return false;
+}
+
+// =============================================================================
+// Test
+// =============================================================================
+
+// The whole suite is skipped when cassettesAvailable() returns false, i.e. in replay mode
+// with no recorded cassette files.
+describe.skipIf(!cassettesAvailable())('full flow (cassette replay)', () => {
+  let harness: FullFlowHarness;
+  // Captured when the suite body is evaluated; Date.now() - startedAt is passed to printFinalSummary.
+  const startedAt = Date.now();
+
+  beforeAll(async () => {
+    const store = new CassetteStore(CASSETTE_DIR);
+    const mode = cassetteMode();
+
+    // Inject the cassette-backed process manager through the harness factory option.
+    harness = await createFullFlowHarness('Add complete() method to TodoStore', {
+      processManagerFactory: (workspaceRoot, projectRepo) =>
+        new CassetteProcessManager(workspaceRoot, projectRepo, store, mode),
+    });
+
+    printHeader(harness.initiative.name);
+    console.log(`  Cassette mode : ${mode}`);
+    console.log(`  Cassette dir  : ${CASSETTE_DIR}`);
+    console.log(`  Initiative ID : ${harness.initiative.id}`);
+    console.log(`  Workspace     : ${harness.workspaceRoot}`);
+  }, CASSETTE_FLOW_TIMEOUT);
+
+  afterAll(async () => {
+    // harness stays unassigned if beforeAll threw, hence the guard.
+    if (harness) await harness.cleanup();
+  });
+
+  it(
+    'runs the complete multi-agent workflow from cassettes',
+    async () => {
+      const { initiative, caller, agentManager, phaseRepository, taskRepository } = harness;
+      const initiativeId = initiative.id;
+
+      // ── Stage 2: Discuss ───────────────────────────────────────────────────
+      console.log('\n\n>>> Stage 2: DISCUSS <<<');
+      const discussAgent = await caller.spawnArchitectDiscuss({ initiativeId });
+      expect(discussAgent.id).toBeTruthy();
+      console.log(`  Spawned discuss agent: ${discussAgent.name} (${discussAgent.id})`);
+
+      const discussResult = await harness.driveToCompletion(
+        discussAgent.id,
+        'Use your best judgment and keep it simple. The focus is implementing complete(id) on TodoStore.',
+        CASSETTE_FLOW_TIMEOUT,
+      );
+      printDiscussResult(discussAgent.id, discussResult);
+
+      // A discuss stage that did not succeed is only logged; the plan stage still runs.
+      if (!discussResult?.success) {
+        console.warn('  [WARN] discuss agent did not succeed; continuing to plan stage');
+      }
+
+      // ── Stage 3: Plan ──────────────────────────────────────────────────────
+      console.log('\n\n>>> Stage 3: PLAN <<<');
+      const planAgent = await caller.spawnArchitectPlan({ initiativeId });
+      expect(planAgent.id).toBeTruthy();
+      console.log(`  Spawned plan agent: ${planAgent.name} (${planAgent.id})`);
+
+      const planResult = await harness.driveToCompletion(
+        planAgent.id,
+        'Keep it simple.',
+        CASSETTE_FLOW_TIMEOUT,
+      );
+      expect(planResult).toBeTruthy();
+
+      const phases: Phase[] = await phaseRepository.findByInitiativeId(initiativeId);
+      expect(phases.length).toBeGreaterThan(0);
+      printPlanResult(phases);
+
+      // ── Stage 4: Detail (per phase) ────────────────────────────────────────
+      console.log('\n\n>>> Stage 4: DETAIL <<<');
+      for (const phase of phases) {
+        const detailAgent = await caller.spawnArchitectDetail({ phaseId: phase.id });
+        expect(detailAgent.id).toBeTruthy();
+        console.log(`  Spawned detail agent for phase "${phase.name}": ${detailAgent.name}`);
+
+        const detailResult = await harness.driveToCompletion(
+          detailAgent.id,
+          'Keep it simple.',
+          CASSETTE_FLOW_TIMEOUT,
+        );
+        expect(detailResult).toBeTruthy();
+
+        // Every phase must end up with at least one automatic execute task.
+        const phaseTasks = await taskRepository.findByPhaseId(phase.id);
+        const executeTasks = phaseTasks.filter((t) => t.category === 'execute' && t.type === 'auto');
+        expect(executeTasks.length).toBeGreaterThan(0);
+        printDetailResult(phase, phaseTasks);
+      }
+
+      // ── Stage 5: Execute ───────────────────────────────────────────────────
+      console.log('\n\n>>> Stage 5: EXECUTE <<<');
+      const allTasks = await gatherAllExecuteTasks(taskRepository, phases);
+      console.log(`  Found ${allTasks.length} execute task(s) across ${phases.length} phase(s)`);
+
+      // Tasks are executed one at a time, each driven to completion before the next is spawned.
+      const executed: ExecutedTask[] = [];
+      for (const task of allTasks) {
+        console.log(`  Spawning execute agent for: "${task.name}"`);
+        const execAgent = await agentManager.spawn({
+          taskId: task.id,
+          prompt: buildExecutePrompt(task.description ?? task.name),
+          mode: 'execute',
+          initiativeId,
+          phaseId: task.phaseId ?? undefined,
+          inputContext: {
+            initiative,
+            task,
+          },
+        });
+        console.log(`    Agent: ${execAgent.name} (${execAgent.id})`);
+
+        const result = await harness.driveToCompletion(
+          execAgent.id,
+          'Use your best judgment and keep it simple.',
+          CASSETTE_FLOW_TIMEOUT,
+        );
+        executed.push({ task, result });
+
+        const icon = result?.success ? '✓' : '✗';
+        console.log(`    ${icon} Completed with success=${result?.success ?? null}`);
+        if (result && !result.success) {
+          console.log(`      Message: ${result.message?.slice(0, 200)}`);
+        }
+      }
+
+      printExecuteResult(executed);
+
+      // ── Assertions ─────────────────────────────────────────────────────────
+      // The only hard assertion here is that at least one execute task ran;
+      // unsuccessful execute tasks produce a warning, not a test failure.
+      expect(executed.length).toBeGreaterThan(0);
+
+      const allSucceeded = executed.every((e) => e.result?.success === true);
+      if (!allSucceeded) {
+        const failed = executed.filter((e) => !e.result?.success);
+        console.warn(`  [WARN] ${failed.length} execute task(s) did not succeed`);
+      }
+
+      // ── Final summary ──────────────────────────────────────────────────────
+      printFinalSummary(
+        initiative.name,
+        phases,
+        allTasks,
+        executed,
+        Date.now() - startedAt,
+      );
+    },
+    CASSETTE_FLOW_TIMEOUT,
+  );
+});
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+/**
+ * Collect every task with category 'execute' and type 'auto' across the given phases.
+ *
+ * Phases are queried one after another, so the result keeps phase order and,
+ * within a phase, the order returned by findByPhaseId.
+ */
+async function gatherAllExecuteTasks(
+  taskRepository: FullFlowHarness['taskRepository'],
+  phases: Phase[],
+): Promise<Task[]> {
+  const collected: Task[] = [];
+  for (const { id: phaseId } of phases) {
+    for (const task of await taskRepository.findByPhaseId(phaseId)) {
+      if (task.category !== 'execute' || task.type !== 'auto') continue;
+      collected.push(task);
+    }
+  }
+  return collected;
+}
--- a/src/test/integration/full-flow/harness.ts
+++ b/src/test/integration/full-flow/harness.ts
@@ -36,6 +36,7 @@ import type { AccountRepository } from '../../../db/repositories/account-reposit
 import type { ChangeSetRepository } from '../../../db/repositories/change-set-repository.js';
 import type { LogChunkRepository } from '../../../db/repositories/log-chunk-repository.js';
 import type { ConversationRepository } from '../../../db/repositories/conversation-repository.js';
+import type { ProcessManager } from '../../../agent/process-manager.js';
 import { createTestDatabase } from '../../../db/repositories/drizzle/test-helpers.js';
 import { createRepositories } from '../../../container.js';
 import { DefaultDispatchManager } from '../../../dispatch/manager.js';
@@ -162,6 +163,11 @@ const POLL_INTERVAL_MS = 1500;
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');

+/** Optional settings accepted by createFullFlowHarness. */
+export interface FullFlowHarnessOptions {
+  /**
+   * Factory for a custom ProcessManager. createFullFlowHarness calls it with the workspace
+   * root and the project repository, then hands the result to the agent manager as its
+   * process-manager override. When omitted, no override is passed.
+   */
+  processManagerFactory?: (workspaceRoot: string, projectRepo: ProjectRepository) => ProcessManager;
+}
+
 /**
 * Create a full-flow test harness.
 *
@@ -177,6 +183,7 @@ const FIXTURES_DIR = join(__dirname, '../../fixtures/todo-api');
 */
 export async function createFullFlowHarness(
  initiativeName = 'Add complete() method to TodoStore',
+  options?: FullFlowHarnessOptions,
 ): Promise<FullFlowHarness> {
  // ── 0. Allow nested claude invocations ────────────────────────────────────
  // Claude Code sets CLAUDECODE in the environment, which prevents nested
@@ -219,6 +226,7 @@ export async function createFullFlowHarness(
  const eventBus = new CapturingEventBus();

  // ── 5. Real agent manager ─────────────────────────────────────────────────
+  const customProcessManager = options?.processManagerFactory?.(workspaceRoot, repos.projectRepository);
  const agentManager = new MultiProviderAgentManager(
    repos.agentRepository,
    workspaceRoot,
@@ -231,6 +239,8 @@ export async function createFullFlowHarness(
    repos.taskRepository,
    repos.pageRepository,
    repos.logChunkRepository,
+    false,                 // debug
+    customProcessManager,  // processManagerOverride
  );

  // ── 6. Dispatch manager (for execute stage) ───────────────────────────────