From e9ec5143fd7dd47d4ed6bf8da4b930b9dcf537f7 Mon Sep 17 00:00:00 2001
From: Lukas May <lukas.may@carealytix.com>
Date: Mon, 2 Mar 2026 12:22:46 +0900
Subject: [PATCH] docs: Document cassette testing system in docs/testing.md and
 CLAUDE.md

---
 CLAUDE.md       |   5 +-
 docs/testing.md | 168 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 170 insertions(+), 3 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 11fb2ec..ee6fb33 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -44,11 +44,12 @@ Run after any change to server-side code (`src/**`).
 ## Testing
 
 ```sh
-npm test                    # Unit tests
+npm test                                                                         # Unit + E2E tests (no API cost)
+CW_CASSETTE_RECORD=1 npm test -- <test-file>                                    # Record new cassettes locally
 REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000  # Real provider tests (~$0.50)
 ```
 
-See [docs/testing.md](docs/testing.md) for details.
+See [docs/testing.md](docs/testing.md) for details, including the **cassette system** for pipeline integration tests that run without API costs.
 
 ## Documentation Maintenance
 
diff --git a/docs/testing.md b/docs/testing.md
index 9c646dc..c3702be 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -29,6 +29,15 @@ Located alongside source files (`*.test.ts`):
 | `edge-cases.test.ts` | Boundary conditions |
 | `extended-scenarios.test.ts` | Advanced multi-phase workflows |
 
+These use `MockAgentManager`, which bypasses the real subprocess pipeline. They test dispatch/coordination logic only.
+
+### Cassette Tests (Pipeline Integration, Zero API Cost)
+`src/test/cassette/` — Tests the full agent execution pipeline using pre-recorded cassettes.
+
+Unlike E2E tests, cassette tests exercise the real `ProcessManager → FileTailer → OutputHandler → SignalManager` path. Unlike real provider tests, they cost nothing to run in CI.
+
+See **[Cassette System](#cassette-system)** below for full documentation.
+
 ### Integration Tests (Real Providers)
 `src/test/integration/real-providers/` — **skipped by default** (cost real money):
 | File | Provider | Cost |
@@ -65,15 +74,172 @@ Pre-built task hierarchies for testing:
 - Provides `describeRealClaude()` / `describeRealCodex()` that skip when env var not set
 - `MINIMAL_PROMPTS` — cheap prompts for testing output parsing
 
+## Test Inventory
+
+See **[test-inventory.md](test-inventory.md)** for a complete catalog of every test, what it verifies, coverage gaps, redundancy map, and fragility assessment.
+
 ## Running Tests
 
 ```sh
-# Unit tests
+# Unit + E2E tests (no API cost)
 npm test
 
 # Specific test file
 npm test -- src/agent/manager.test.ts
 
+# Cassette tests — replay pre-recorded cassettes (no API cost)
+npm test -- src/test/cassette/
+
+# Record new cassettes locally (requires real Claude CLI)
+CW_CASSETTE_RECORD=1 npm test -- src/test/integration/real-providers/claude-manager.test.ts
+
 # Real provider tests (costs money!)
 REAL_CLAUDE_TESTS=1 npm test -- src/test/integration/real-providers/ --test-timeout=300000
 ```
+
+---
+
+## Cassette System
+
+`src/test/cassette/` — VCR-style recording and replay for the agent subprocess pipeline.
+
+### Why it exists
+
+The `MockAgentManager` used in E2E tests skips from "spawn called" directly to "agent:stopped emitted". It never exercises `ProcessManager`, `FileTailer`, `OutputHandler`, or `SignalManager`. Bugs in those layers (signal.json race conditions, JSONL parsing failures, crash detection) are invisible to E2E tests.
+
+Real provider tests do exercise those layers, but they are slow, expensive, and can't run in CI without credentials.
+
+Cassette tests bridge this gap: they run the **real** `MultiProviderAgentManager` pipeline but replace the live Claude/Codex subprocess with a replay worker that writes pre-recorded output.
+
+### Coverage the cassette layer adds
+
+- `FileTailer` — fs.watch + poll cycle, incremental JSONL reading
+- `OutputHandler` — stream event parsing, signal detection, result capture
+- `SignalManager` — signal.json read/write/timing
+- `LifecycleController` — retry logic, missing signal recovery
+- `ProcessManager` — subprocess PID tracking, poll-for-completion
+- Prompt normalization drift detection — key mismatch = re-record = visible diff
+
+### Key generation
+
+Each cassette is identified by a SHA256 hash of four components:
+
+| Component | What it captures |
+|-----------|-----------------|
+| `normalizedPrompt` | Prompt with UUIDs, temp paths, timestamps, session numbers replaced with placeholders |
+| `providerName` | e.g. `claude`, `codex` |
+| `modelArgs` | Provider CLI args with the prompt value stripped (sorted for stability) |
+| `worktreeHash` | SHA256 of all non-hidden files in the agent worktree at spawn time |
+
+The `worktreeHash` is what detects content drift for execute-mode agents: if the worktree changes, the key no longer matches any existing cassette, so replay mode fails and the cassette must be re-recorded.
+
+**Normalization** (`src/test/cassette/normalizer.ts`) strips dynamic content that varies between runs but doesn't affect agent behavior:
+- UUIDs → `__UUID__`
+- Workspace root path → `__WORKSPACE__`
+- ISO 8601 timestamps → `__TIMESTAMP__`
+- Unix epoch milliseconds → `__EPOCH__`
+- Session numbers → `session__N__`
+
+If a prompt *template* changes (e.g. someone edits `buildExecutePrompt()`), the normalized hash changes → cassette miss → test fails in CI → developer must re-record → the diff shows the new agent response in the PR. This makes prompt drift auditable.
+
+### Cassette file format
+
+Cassettes live in `src/test/cassettes/<32-char-hash>.json` and are committed to git.
+
+```json
+{
+  "version": 1,
+  "key": {
+    "normalizedPrompt": "You are a Worker agent...",
+    "providerName": "claude",
+    "modelArgs": ["--dangerously-skip-permissions", "--verbose", "--output-format", "stream-json"],
+    "worktreeHash": "empty"
+  },
+  "recording": {
+    "jsonlLines": [
+      "{\"type\":\"system\",\"session_id\":\"abc\"}",
+      "{\"type\":\"result\",\"subtype\":\"success\",\"result\":\"ok\"}"
+    ],
+    "signalJson": { "status": "done", "message": "Task complete" },
+    "exitCode": 0,
+    "recordedAt": "2026-03-02T12:00:00.000Z"
+  }
+}
+```
+
+### How replay works
+
+`CassetteProcessManager` (extends `ProcessManager`) overrides two methods:
+
+1. **`spawnDetached()`** — on a cache hit, spawns `replay-worker.mjs` instead of the real CLI. The worker writes the recorded JSONL lines to stdout (which `spawnDetached` redirects to the output file via fd) and writes `signal.json` relative to its cwd. Everything above — `FileTailer`, `OutputHandler`, poll loop — runs unmodified.
+
+2. **`pollForCompletion()`** — when recording (a cache miss in `auto` mode, or always in `record` mode), wraps the `onComplete` callback to read the output file and `signal.json` after the process exits, then saves the cassette before handing off to `OutputHandler`.
+
+`MultiProviderAgentManager` accepts an optional `processManagerOverride` constructor parameter so `CassetteProcessManager` can be injected without changing production callers.
+
+### Mode control
+
+| Env var | Mode | Behaviour |
+|---------|------|-----------|
+| *(none)* | `replay` | Cassette must exist; throws if missing. Safe for CI. |
+| `CW_CASSETTE_RECORD=1` | `auto` | Replays if the cassette exists; otherwise runs the real agent and records a new cassette. |
+| `CW_CASSETTE_FORCE_RECORD=1` | `record` | Always runs real agent; overwrites existing cassette. Use when the prompt changed intentionally or an existing recording is stale (e.g. the provider output format changed). |
+
+### Writing cassette tests
+
+```ts
+import { createCassetteHarness } from '../cassette/index.js';
+import { MINIMAL_PROMPTS } from '../integration/real-providers/prompts.js';
+import type { RealProviderHarness } from '../integration/real-providers/harness.js';
+
+describe('agent pipeline (cassette)', () => {
+  let harness: RealProviderHarness;
+
+  beforeAll(async () => {
+    harness = await createCassetteHarness({ provider: 'claude' });
+  });
+
+  afterAll(() => harness.cleanup());
+
+  it('completes a task and emits agent:stopped', async () => {
+    const agent = await harness.agentManager.spawn({
+      taskId: null,
+      prompt: MINIMAL_PROMPTS.done,
+      mode: 'execute',
+      provider: 'claude',
+    });
+
+    const result = await harness.waitForAgentCompletion(agent.id);
+    expect(result?.success).toBe(true);
+
+    const stopped = harness.getEventsByType('agent:stopped');
+    expect(stopped).toHaveLength(1);
+  });
+});
+```
+
+`createCassetteHarness()` returns a `RealProviderHarness`, so tests written for real providers work unchanged.
+
+### Cassette directory
+
+```
+src/test/cassettes/
+  <hash>.json     ← committed to git; one file per recorded scenario
+  .gitkeep
+```
+
+Cassettes are committed so CI can run without any AI API credentials. When a cassette needs updating (prompt changed, provider output format changed), re-record locally with `CW_CASSETTE_RECORD=1` — or `CW_CASSETTE_FORCE_RECORD=1` when the key still matches an existing cassette that must be overwritten — and commit the updated file.
+
+### Files
+
+| File | Purpose |
+|------|---------|
+| `types.ts` | `CassetteKey`, `CassetteRecording`, `CassetteEntry` interfaces |
+| `normalizer.ts` | `normalizePrompt()`, `stripPromptFromArgs()` |
+| `key.ts` | `hashWorktreeFiles()`, `buildCassetteKey()` |
+| `store.ts` | `CassetteStore` — find/save cassette JSON files |
+| `replay-worker.mjs` | Subprocess that replays a cassette (plain JS ESM, no build step) |
+| `process-manager.ts` | `CassetteProcessManager` — overrides `spawnDetached` and `pollForCompletion` |
+| `harness.ts` | `createCassetteHarness()` — factory returning `RealProviderHarness` |
+| `index.ts` | Barrel exports |
+| `cassette.test.ts` | Unit tests for normalizer, key generation, and store |