From e9ec5143fd7dd47d4ed6bf8da4b930b9dcf537f7 Mon Sep 17 00:00:00 2001 From: Lukas May Date: Mon, 2 Mar 2026 12:22:46 +0900 Subject: [PATCH] docs: Document cassette testing system in docs/testing.md and CLAUDE.md --- CLAUDE.md | 5 +- docs/testing.md | 168 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 170 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 11fb2ec..ee6fb33 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -44,11 +44,12 @@ Run after any change to server-side code (`src/**`). ## Testing ```sh -npm test # Unit tests +npm test # Unit + E2E tests (no API cost) +CW_CASSETTE_RECORD=1 npm test -- <test-file> # Record new cassettes locally REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 # Real provider tests (~$0.50) ``` -See [docs/testing.md](docs/testing.md) for details. +See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs. ## Documentation Maintenance diff --git a/docs/testing.md b/docs/testing.md index 9c646dc..c3702be 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -29,6 +29,15 @@ Located alongside source files (`*.test.ts`): | `edge-cases.test.ts` | Boundary conditions | | `extended-scenarios.test.ts` | Advanced multi-phase workflows | +These use `MockAgentManager`, which bypasses the real subprocess pipeline. They test dispatch/coordination logic only. + +### Cassette Tests (Pipeline Integration, Zero API Cost) +`src/test/cassette/` — Tests the full agent execution pipeline using pre-recorded cassettes. + +Unlike E2E tests, cassette tests exercise the real `ProcessManager → FileTailer → OutputHandler → SignalManager` path. Unlike real provider tests, they cost nothing to run in CI. + +See **[Cassette System](#cassette-system)** below for full documentation. 
+ ### Integration Tests (Real Providers) `src/test/integration/real-providers/` — **skipped by default** (cost real money): | File | Provider | Cost | @@ -65,15 +74,172 @@ Pre-built task hierarchies for testing: - Provides `describeRealClaude()` / `describeRealCodex()` that skip when env var not set - `MINIMAL_PROMPTS` — cheap prompts for testing output parsing +## Test Inventory + +See **[test-inventory.md](test-inventory.md)** for a complete catalog of every test, what it verifies, coverage gaps, redundancy map, and fragility assessment. + ## Running Tests ```sh -# Unit tests +# Unit + E2E tests (no API cost) npm test # Specific test file npm test -- src/agent/manager.test.ts +# Cassette tests — replay pre-recorded cassettes (no API cost) +npm test -- src/test/cassette/ + +# Record new cassettes locally (requires real Claude CLI) +CW_CASSETTE_RECORD=1 npm test -- src/test/integration/real-providers/claude-manager.test.ts + # Real provider tests (costs money!) REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000 ``` + +--- + +## Cassette System + +`src/test/cassette/` — VCR-style recording and replay for the agent subprocess pipeline. + +### Why it exists + +The `MockAgentManager` used in E2E tests skips from "spawn called" directly to "agent:stopped emitted". It never exercises `ProcessManager`, `FileTailer`, `OutputHandler`, or `SignalManager`. Bugs in those layers (signal.json race conditions, JSONL parsing failures, crash detection) are invisible to E2E tests. + +Real provider tests do exercise those layers, but they are slow, expensive, and can't run in CI without credentials. + +Cassette tests bridge this gap: they run the **real** `MultiProviderAgentManager` pipeline but replace the live Claude/Codex subprocess with a replay worker that writes pre-recorded output. 
+ +### Coverage the cassette layer adds + +- `FileTailer` — fs.watch + poll cycle, incremental JSONL reading +- `OutputHandler` — stream event parsing, signal detection, result capture +- `SignalManager` — signal.json read/write/timing +- `LifecycleController` — retry logic, missing signal recovery +- `ProcessManager` — subprocess PID tracking, poll-for-completion +- Prompt normalization drift detection — key mismatch = re-record = visible diff + +### Key generation + +Each cassette is identified by a SHA256 hash of four components: + +| Component | What it captures | +|-----------|-----------------| +| `normalizedPrompt` | Prompt with UUIDs, temp paths, timestamps, session numbers replaced with placeholders | +| `providerName` | e.g. `claude`, `codex` | +| `modelArgs` | Provider CLI args with the prompt value stripped (sorted for stability) | +| `worktreeHash` | SHA256 of all non-hidden files in the agent worktree at spawn time | + +The `worktreeHash` is what detects content drift for execute-mode agents: if the worktree changes, the key misses and the cassette must be re-recorded (in the default replay mode the miss fails the test instead of recording silently). + +**Normalization** (`src/test/cassette/normalizer.ts`) strips dynamic content that varies between runs but doesn't affect agent behavior: +- UUIDs → `__UUID__` +- Workspace root path → `__WORKSPACE__` +- ISO 8601 timestamps → `__TIMESTAMP__` +- Unix epoch milliseconds → `__EPOCH__` +- Session numbers → `session__N__` + +If a prompt *template* changes (e.g. someone edits `buildExecutePrompt()`), the normalized hash changes → cassette miss → test fails in CI → developer must re-record → the diff shows the new agent response in the PR. This makes prompt drift auditable. + +### Cassette file format + +Cassettes live in `src/test/cassettes/<32-char-hash>.json` and are committed to git. 
+ +```json +{ + "version": 1, + "key": { + "normalizedPrompt": "You are a Worker agent...", + "providerName": "claude", + "modelArgs": ["--dangerously-skip-permissions", "--verbose", "--output-format", "stream-json"], + "worktreeHash": "empty" + }, + "recording": { + "jsonlLines": [ + "{\"type\":\"system\",\"session_id\":\"abc\"}", + "{\"type\":\"result\",\"subtype\":\"success\",\"result\":\"ok\"}" + ], + "signalJson": { "status": "done", "message": "Task complete" }, + "exitCode": 0, + "recordedAt": "2026-03-02T12:00:00.000Z" + } +} +``` + +### How replay works + +`CassetteProcessManager` (extends `ProcessManager`) overrides two methods: + +1. **`spawnDetached()`** — on a cache hit, spawns `replay-worker.mjs` instead of the real CLI. The worker writes the recorded JSONL lines to stdout (which `spawnDetached` redirects to the output file via fd) and writes `signal.json` relative to its cwd. Everything above — `FileTailer`, `OutputHandler`, poll loop — runs unmodified. + +2. **`pollForCompletion()`** — on a cache miss (record mode), wraps the `onComplete` callback to read the output file and `signal.json` after the process exits, then saves the cassette before handing off to `OutputHandler`. + +`MultiProviderAgentManager` accepts an optional `processManagerOverride` constructor parameter so `CassetteProcessManager` can be injected without changing production callers. + +### Mode control + +| Env var | Mode | Behaviour | +|---------|------|-----------| +| *(none)* | `replay` | Cassette must exist; throws if missing. Safe for CI. | +| `CW_CASSETTE_RECORD=1` | `auto` | Replays if cassette exists, runs real agent and records if missing. | +| `CW_CASSETTE_FORCE_RECORD=1` | `record` | Always runs real agent; overwrites existing cassette. Use when prompt changed intentionally. 
| + +### Writing cassette tests + +```ts +import { createCassetteHarness } from '../cassette/index.js'; +import { MINIMAL_PROMPTS } from '../integration/real-providers/prompts.js'; +import type { RealProviderHarness } from '../integration/real-providers/harness.js'; + +describe('agent pipeline (cassette)', () => { + let harness: RealProviderHarness; + + beforeAll(async () => { + harness = await createCassetteHarness({ provider: 'claude' }); + }); + + afterAll(() => harness.cleanup()); + + it('completes a task and emits agent:stopped', async () => { + const agent = await harness.agentManager.spawn({ + taskId: null, + prompt: MINIMAL_PROMPTS.done, + mode: 'execute', + provider: 'claude', + }); + + const result = await harness.waitForAgentCompletion(agent.id); + expect(result?.success).toBe(true); + + const stopped = harness.getEventsByType('agent:stopped'); + expect(stopped).toHaveLength(1); + }); +}); +``` + +`createCassetteHarness()` returns a `RealProviderHarness`, so tests written for real providers work unchanged. + +### Cassette directory + +``` +src/test/cassettes/ + <32-char-hash>.json ← committed to git; one file per recorded scenario + .gitkeep +``` + +Cassettes are committed so CI can run without any AI API credentials. When a cassette needs updating, re-record locally and commit the updated file: `CW_CASSETTE_RECORD=1` records a cassette whose key no longer matches (e.g. the prompt changed), while `CW_CASSETTE_FORCE_RECORD=1` is required to overwrite a cassette whose key still matches (e.g. the provider output format changed), because auto mode replays any existing cassette. 
+ +### Files + +| File | Purpose | +|------|---------| +| `types.ts` | `CassetteKey`, `CassetteRecording`, `CassetteEntry` interfaces | +| `normalizer.ts` | `normalizePrompt()`, `stripPromptFromArgs()` | +| `key.ts` | `hashWorktreeFiles()`, `buildCassetteKey()` | +| `store.ts` | `CassetteStore` — find/save cassette JSON files | +| `replay-worker.mjs` | Subprocess that replays a cassette (plain JS ESM, no build step) | +| `process-manager.ts` | `CassetteProcessManager` — overrides `spawnDetached` and `pollForCompletion` | +| `harness.ts` | `createCassetteHarness()` — factory returning `RealProviderHarness` | +| `index.ts` | Barrel exports | +| `cassette.test.ts` | Unit tests for normalizer, key generation, and store |