diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 57dc956..da62abb 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
 
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 44aee2c..fec606b 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -17,7 +17,9 @@ env:
 
 jobs:
   release:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
+    outputs:
+      version: ${{ steps.vars.outputs.version }}
     steps:
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
 
@@ -95,3 +97,13 @@ jobs:
             "$PLUGIN_DIR"/bin/codex-hook-darwin-x64 \
             "$PLUGIN_DIR"/bin/codex-hook-linux-x64 \
             "$PLUGIN_DIR"/bin/codex-hook-linux-arm64
+
+  # After a successful publish, run the end-to-end smoke test (macOS + Linux)
+  # against the just-released version.
+  smoke:
+    needs: release
+    uses: ./.github/workflows/smoke.yaml
+    with:
+      version: ${{ needs.release.outputs.version }}
+    secrets:
+      openai_api_key: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
new file mode 100644
index 0000000..ea66015
--- /dev/null
+++ b/.github/workflows/smoke.yaml
@@ -0,0 +1,81 @@
+name: Smoke trace-codex
+
+# End-to-end smoke test for the trace-codex plugin against a published release.
+# Installs Codex + the plugin from the marketplace, runs a real `codex exec`
+# session with tracing pointed at a local mock Braintrust collector, and asserts
+# at least one trace row was reported.
+#
+# Two entry points:
+#   - workflow_call: invoked by release.yaml after a successful release, with the
+#     version it just published.
+#   - workflow_dispatch: run manually from the Actions tab against any published
+#     release version (e.g. to retest an older release).
+
+on:
+  workflow_call:
+    inputs:
+      version:
+        description: "Published release version to test, e.g. 0.0.1 (with or without leading v)."
+        required: true
+        type: string
+    secrets:
+      openai_api_key:
+        description: "OpenAI API key used by codex exec."
+        required: true
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Published release version to test, e.g. 0.0.1 (with or without leading v)."
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+env:
+  PLUGIN_DIR: plugins/trace-codex
+
+jobs:
+  smoke:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - runner: ubuntu-24.04
+            label: linux-x64
+          - runner: macos-15
+            label: darwin-arm64
+          - runner: macos-15-intel
+            label: darwin-x64
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Install Bun
+        uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2
+        with:
+          bun-version: latest
+
+      - name: Install plugin dependencies
+        working-directory: ${{ env.PLUGIN_DIR }}
+        run: bun install --frozen-lockfile
+
+      - name: Install Codex CLI
+        run: npm install -g @openai/codex
+
+      - name: Install trace-codex plugin from the marketplace
+        env:
+          # The release repo is private; the launcher downloads the binary using
+          # this token. `codex plugin` also uses gh credentials for the clone.
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          codex plugin marketplace add braintrustdata/braintrust-codex-plugin
+          codex plugin add trace-codex@braintrust-codex-plugins
+
+      - name: Run smoke test (${{ matrix.label }})
+        working-directory: ${{ env.PLUGIN_DIR }}
+        env:
+          OPENAI_API_KEY: ${{ secrets.openai_api_key || secrets.OPENAI_API_KEY }}
+          GH_TOKEN: ${{ github.token }}
+          SMOKE_RELEASE_VERSION: ${{ inputs.version }}
+        run: sh scripts/smoke-test.sh
diff --git a/AGENTS.md b/AGENTS.md
index d2016b0..2e4bf58 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -17,8 +17,9 @@ Key files for the MCP/skills plugin:
 - `plugins/braintrust-codex-plugin/.mcp.json` — MCP server definition
 - `plugins/braintrust-codex-plugin/skills/braintrust/` — agent skills exposed through the plugin
 
-Key files for the tracing plugin:
+Key files for the tracing plugin (see [`plugins/trace-codex/AGENTS.md`](plugins/trace-codex/AGENTS.md) for its architecture):
 
+- `plugins/trace-codex/AGENTS.md` — architecture and contributor guide for this plugin
 - `plugins/trace-codex/.codex-plugin/plugin.json` — plugin manifest
 - `plugins/trace-codex/hooks/hooks.json` — lifecycle hook config
 - `plugins/trace-codex/src/` — the hook client + event server (compiled to `bin/codex-hook`)
diff --git a/plugins/trace-codex/AGENTS.md b/plugins/trace-codex/AGENTS.md
new file mode 100644
index 0000000..1c0edc1
--- /dev/null
+++ b/plugins/trace-codex/AGENTS.md
@@ -0,0 +1,195 @@
+# AGENTS.md — trace-codex
+
+Guidelines for AI agents working in the `trace-codex` plugin. This is the
+opt-in plugin that traces Codex sessions to Braintrust. It is **independent**
+from the MCP/skills plugin (`braintrust-codex-plugin`) — do not merge behavior
+between them. See the [repo AGENTS.md](../../AGENTS.md) for the monorepo
+overview.
+
+## What this plugin is
+
+A long-lived, local **event server** plus a short-lived **hook client**, both
+compiled into a single `codex-hook` binary. Codex fires lifecycle hooks; each
+hook invocation runs the client, which forwards the event to the server, which
+turns the stream of events into Braintrust spans (session → turn → tool).
+
+## Architecture
+
+Request/data flow:
+
+```
+Codex hook → bin/codex-hook.sh → codex-hook (client mode)
+          → POST /enqueue → in-memory FIFO queue
+          → ProcessorRegistry (one EventProcessor per session)
+          → Braintrust spans
+```
+
+Key pieces:
+
+- `src/index.ts` — entry point. One binary, three modes: `serve` (the
+  background server), `hook` (default; read one event from stdin, enqueue it),
+  and `replay` (re-POST a recorded NDJSON session).
+- `src/client/` — the hook client. Ensures a server is up (booting a detached
+  one if needed), POSTs events, and on a terminal event (`Stop`) calls `/flush`
+  so final spans are delivered before the process tree is torn down.
+- `src/server/` — the HTTP server, the FIFO `EventQueue`, routes
+  (`/enqueue`, `/flush`, `/health`, `/shutdown`), and the idle watchdog.
+- `src/processor/` — the generic `ProcessorRegistry` (LRU map of per-session
+  processors) and the `EventProcessor` interface.
+- `src/agents/codex/` — Codex-specific: translate hook payloads into
+  `EnqueueEvent`s (`event-builder.ts`), build spans (`event-processor.ts`), and
+  read user settings (`settings.ts`).
+- `src/braintrust/` — thin wrapper over the Braintrust SDK (span factory).
+
+The server is **single-version**: a client whose version doesn't match a running
+server bails rather than talking to it. It shuts itself down after an idle
+window (default 5 min) or on `/shutdown`.
+
+The design has three deliberate properties worth understanding before changing
+anything:
+
+### 1. Agent-agnostic "event server"
+
+The core (`src/server/`, `src/processor/`, `src/braintrust/`) knows nothing
+about Codex. It speaks a generic `EnqueueEvent` shape and routes events to a
+per-session processor. All Codex-specific knowledge lives in `src/agents/codex/`
+and is registered into the generic core at startup. Adding another agent (Claude
+Code, opencode, etc.) means adding a sibling `src/agents/<agent>/` module that
+exports the same `Agent` shape — the server, queue, and registry should not need
+to change.
+
+Keep this boundary clean: nothing under `src/server/`, `src/processor/`, or
+`src/braintrust/` should reference Codex by name. The generic event server may
+eventually be extracted into its own package, so treat any Codex-specific leak
+into the generic layers as a bug, not a shortcut.
+
+### 2. Bun-compiled, cross-platform binaries
+
+The hook command in `hooks.json` is a fixed, platform-agnostic string that
+invokes `bin/codex-hook.sh` (or `.cmd` on Windows), **not** the binary directly.
+That launcher resolves and runs the real, platform-specific `codex-hook` binary.
+
+- `scripts/build.ts` compiles `src/index.ts` with `bun build --compile` for each
+  target (`darwin-arm64/x64`, `linux-x64/arm64`; Windows slots in but isn't
+  built yet). `BUILD_HOST_ONLY=1` builds just the host target (used by the
+  repo-root `install.sh` for local dev installs).
+- The compiled binaries are **large and not committed**. The launcher downloads
+  the matching binary from the plugin's GitHub release on first use and caches
+  it at `$PLUGIN_ROOT/bin/codex-hook`. Codex wipes `$PLUGIN_ROOT` on every
+  install/upgrade, so the cache self-invalidates and the next hook re-downloads
+  the right version.
+- Plugin version is the single source of truth in
+  `.codex-plugin/plugin.json`; the launcher reads it to construct the release
+  tag (`trace-codex-v<version>`).
+
+**Hard rule:** the hook must never fail the Codex turn. The launcher and client
+log to stderr and exit 0 on any error; the server swallows errors so a turn is
+never blocked by tracing.
+
+### 3. Hooks are non-blocking by default
+
+Tracing must not degrade the experience of using the coding agent. Every hook
+fires synchronously in Codex's path, so blocking work on a hook directly adds
+latency to the user's session. The design therefore makes the common path
+fire-and-forget: the client POSTs to `/enqueue` and returns immediately, while
+the background server does the slow work (Braintrust SDK calls, flushes) off the
+critical path.
+
+The terminal event is the one place where blocking is even an option, and it's
+**opt-in**:
+
+- On a **terminal event** (`Stop`), the client calls `/flush` **only** when
+  `BRAINTRUST_FLUSH_ON_TURN_END` is set, blocking until the server confirms the
+  queue has drained and buffered spans reached Braintrust. The blocking mode
+  exists for short-lived hosts (e.g. a CI job that ends right after the last turn)
+  that would otherwise tear the process tree down before the final spans are
+  delivered. When enabled, the wait is bounded by a timeout (`/flush` gives up
+  rather than hanging the turn). By default the client does nothing on `Stop`: the
+  long-lived server flushes on its own when the queue goes idle, so in normal
+  interactive use no spans are lost and the turn isn't stalled by a flush request.
+
+When adding behavior, prefer enqueue-and-return. Reach for a blocking
+request/await only when data would otherwise be lost, keep it off the
+per-hook hot path where possible, and always bound it so a slow or stuck backend
+can't stall the agent.
+
+### 4. Sessions can outlive the server (resume)
+
+The server is short-lived (idle shutdown after ~5 min, or the user closes and
+later resumes). A Codex session, though, can span that gap. To avoid dropping
+the tail of a trace — or worse, re-emitting it as duplicate/orphaned spans — the
+Codex processor **persists its resumable state** and rehydrates it when a new
+processor is created for the same session id.
+
+How it works:
+
+- The transcript (rollout JSONL) on disk is the durable source of truth for span
+  *content*. What a restart loses is the processor's in-memory bookkeeping:
+  transcript byte offsets, which spans are still open (and their identities),
+  the reconstructed conversation history, and the subagent/compaction side-maps.
+- On every `flush()` (idle drain, eviction, terminal Stop, shutdown), the
+  processor writes a JSON snapshot of that bookkeeping to
+  `PLUGIN_DATA/state/<sessionId>.json` (`src/agents/codex/snapshot-store.ts`).
+  Span handles are stored as identities (`span_id`/`root_span_id`/`parents` plus
+  name/type), captured from the SDK's synchronous span getters.
+- On its first event, a processor attempts a one-time restore for its session
+  id. If a compatible snapshot exists, it recreates each span handle bound to the
+  original id via `SpanFactory.rehydrateSpan` (Braintrust merges rows by
+  `span_id`, so further `log()`/`end()` calls continue the same trace). Restore
+  runs **before** the tracing-enabled master switch, because the snapshot also
+  carries the session's reporting config (a bare mid-session restart may have no
+  leading config event).
+- The snapshot is **not** deleted when the root span ends. Codex's `Stop` hook is
+  per-**turn**, not per-session (there is no session-end hook), and we end the
+  root on the first `Stop` — but later turns still attach as children of that same
+  root, so a restart *between* turns must be able to resume. The snapshot
+  therefore persists past root-end; stale ones (sessions that never resume) are
+  reclaimed by a startup GC sweep that removes snapshots older than a TTL.
+
+Invariants to preserve when changing this:
+
+- **Persistence is Codex-owned**, not part of the generic event server: *where* a
+  plugin may persist state is the host agent's call (Codex gives us
+  `PLUGIN_DATA`). Keep the store and snapshot shape under `src/agents/codex/`.
+  The only generic addition is `SpanFactory.rehydrateSpan` (pure SDK behavior).
+- **Never persist secrets.** The `apiKey` is stripped from the snapshot's
+  reporting config and re-resolved from env / the config event on resume (mirrors
+  the event recorder's redaction).
+- **Never throw.** The store swallows and logs all I/O errors; a persistence or
+  restore failure must not break a turn — it just falls back to starting fresh.
+- **Version-gate snapshots.** Each carries `pluginVersion` + a schema version;
+  a mismatch discards the snapshot. Bump `SNAPSHOT_SCHEMA_VERSION` in
+  `state-snapshot.ts` whenever the shape changes incompatibly.
+
+## Making changes
+
+- **Generic vs. agent code**: put agent-specific logic in `src/agents/<agent>/`;
+  keep the server/processor/braintrust layers agent-agnostic.
+- **Config**: two layers. The agent-specific layer (`src/agents/codex/settings.ts`)
+  reads the user's `config.json` from `PLUGIN_DATA`, maps its friendly camelCase
+  keys onto `BRAINTRUST_*` / `BRAINTRUST_EVENT_SERVER_*` env vars (env wins over
+  the file), and is run by the hook client before it boots the server — so the
+  spawned server inherits the resolved env. The generic layer (`src/config.ts`)
+  then reads **only** those env vars and never touches `config.json`. Keep it
+  that way: config-file parsing is deliberately agent-specific. User-facing
+  settings are documented in the [README](./README.md); update it (and the
+  `Settings` map in `settings.ts`) when adding one.
+- **Hooks**: `hooks/hooks.json` lists the lifecycle events wired to the client.
+  Most are forwarded but not yet turned into spans (the processor no-ops the
+  ones it doesn't handle).
+- **Build**: edit `scripts/build.ts` for targets/output; the launcher scripts
+  (`bin/codex-hook.sh`, `bin/codex-hook.cmd`) for download/exec behavior.
+
+## Commands
+
+Run from `plugins/trace-codex/`:
+
+- `bun test` — run the test suite (tests live next to sources as `*.test.ts`).
+- `bun run typecheck` — `tsc --noEmit`.
+- `bun run lint` / `bun run check` — Biome lint / lint+format.
+- `bun run build` — compile all target binaries (`BUILD_HOST_ONLY=1` for host
+  only).
+- `bun run dev` — run the server in watch mode.
+
+Add or update tests alongside any behavior change; keep typecheck and lint
+clean.
diff --git a/plugins/trace-codex/Makefile b/plugins/trace-codex/Makefile
index 0f20746..ead81c7 100644
--- a/plugins/trace-codex/Makefile
+++ b/plugins/trace-codex/Makefile
@@ -9,6 +9,14 @@
 #   make build         Compile the host binary (and per-platform binaries).
 #   make integration   Build, boot the server via the hook binary, health-check,
 #                      assert the version, then shut it down.
+#   make smoke         End-to-end: run a real `codex exec` session with the
+#                      installed plugin, traced to a local mock collector, and
+#                      assert >=1 trace row. Requires codex, bun, and
+#                      OPENAI_API_KEY; optionally SMOKE_VERSION to pin a release.
+#   make token-proxy   Run the token-counting proxy (debugging aid) that sits
+#                      between Codex and OpenAI and prints each call's token usage
+#                      for comparison against the trace. Requires OPENAI_API_KEY.
+#                      See scripts/token-proxy.ts for the config.toml snippet.
 #   make clean         Remove build artifacts.
 
 SHELL := /bin/bash
@@ -24,7 +32,7 @@ EXPECTED_VERSION := $(shell sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"
 BIN := bin/codex-hook
 BASE_URL := http://127.0.0.1:$(TEST_PORT)
 
-.PHONY: test lint typecheck unit build integration clean
+.PHONY: test lint typecheck unit build integration smoke token-proxy clean
 
 test: lint typecheck unit integration
 
@@ -81,6 +89,20 @@ integration: build
 	echo "    -> server stopped cleanly"; \
 	echo "PASS: integration test"
 
+# End-to-end smoke test. Assumes the plugin is installed in Codex (run
+# `../../install.sh trace-codex` first for a local dev install). Set
+# SMOKE_VERSION to pin the release whose binary the launcher downloads; leave it
+# unset to use the installed plugin manifest version / local dev binary.
+smoke:
+	@echo "==> Smoke test"
+	SMOKE_RELEASE_VERSION="$(SMOKE_VERSION)" sh scripts/smoke-test.sh
+
+# Token-counting proxy for verifying trace-codex's token accounting. Run it, then
+# point Codex at it via a custom model_provider (see scripts/token-proxy.ts).
+token-proxy:
+	@echo "==> Token proxy (Ctrl-C for the session total)"
+	bun run scripts/token-proxy.ts
+
 clean:
 	@echo "==> Cleaning"
 	rm -rf bin
diff --git a/plugins/trace-codex/README.md b/plugins/trace-codex/README.md
index 6371b6e..b2d059c 100644
--- a/plugins/trace-codex/README.md
+++ b/plugins/trace-codex/README.md
@@ -12,6 +12,50 @@ codex plugin add trace-codex@braintrust-codex-plugins
 TRACE_TO_BRAINTRUST=true BRAINTRUST_PROJECT=my-coding-agent codex
 ```
 
+## Using the plugin in CI
+
+Tracing `codex exec` runs in CI works. A few notes:
+
+- **Block on flush.** Set `BRAINTRUST_FLUSH_ON_TURN_END=true` so the final spans are delivered before the job exits (a short-lived CI job can otherwise tear down before the background server's idle-drain flush fires).
+- **Trust the hooks non-interactively.** Pass `codex exec --dangerously-bypass-hook-trust` to skip the one-time interactive hook-trust prompt. (Only for plugins you trust.)
+
+GitHub Actions example:
+
+```yaml
+name: codex-traced
+on: [workflow_dispatch]
+
+jobs:
+  run:
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Codex CLI
+        run: npm install -g @openai/codex   # or your preferred install method
+
+      - name: Install the trace-codex plugin (pinned to a release tag)
+        # TODO: replace with desired version ---------------------------------vvvvvvvvvvvvvvvvvv
+        run: |
+          codex plugin marketplace add braintrustdata/braintrust-codex-plugin@trace-codex-v0.0.X
+          codex plugin add trace-codex@braintrust-codex-plugins
+
+      - name: Run a traced Codex session
+        env:
+          TRACE_TO_BRAINTRUST: "true"
+          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+          BRAINTRUST_PROJECT: my-coding-agent # <--- TODO: replace with your project name
+          BRAINTRUST_FLUSH_ON_TURN_END: "true"
+        run: |
+          codex exec \
+            --skip-git-repo-check \
+            --dangerously-bypass-hook-trust \
+            --sandbox read-only \
+            "summarize the changes in this repo"
+```
+
+If your plugin release repo is private, also expose a token the launcher can use to download the binary (`GH_TOKEN` or `GITHUB_TOKEN`, or an authenticated `gh`).
+
 ## Configuration
 
 There are two ways to configure the plugin. **Environment variables always win over the config file**
@@ -30,8 +74,22 @@ Every setting can be provided as a `config.json` key or as an environment variab
 | `project`            | `BRAINTRUST_PROJECT`                  | _(unset)_          | Project to log traces into.                                                                                                                  |
 | `apiUrl`             | `BRAINTRUST_API_URL`                  | api.braintrust.dev | Braintrust API URL.                                                                                                                          |
 | `additionalMetadata` | `BRAINTRUST_ADDITIONAL_METADATA`      | _(unset)_          | JSON object of extra metadata merged into the root span. Standard keys (`session_id`, `model`, `project`, etc.) take precedence on conflict. |
+| `flushOnTurnEnd`     | `BRAINTRUST_FLUSH_ON_TURN_END`        | `false`            | When `true`, the hook blocks at each turn's end (the `Stop` event) until the server confirms all spans are flushed. Use in programmatic/CI runs to guarantee traces are delivered before Codex exits. |
 | `recordFile`         | `BRAINTRUST_EVENT_SERVER_RECORD_FILE` | _(unset)_          | If set, record every event to this NDJSON file (for `replay`).                                                                               |
 
+### Resuming sessions
+
+Note that when resuming a session, the original session's options will remain in effect.
+
+For example:
+
+```sh
+TRACE_TO_BRAINTRUST=true codex # first session enables braintrust
+TRACE_TO_BRAINTRUST=false codex resume abcde # the resumed session will still be traced because the original session was
+```
+
+The trace itself also survives the background server stopping. The server shuts down after the idle window (or on an explicit shutdown), but a session can outlive it — you might leave it open past the timeout, send another message after the server stopped, or `codex resume` later. The plugin snapshots each session's in-progress trace state under `$PLUGIN_DATA/state/` and restores it when the session continues, so later turns keep landing in the same trace instead of starting a new one. Stale snapshots age out automatically, and secrets (your API key) are never written to them.
+
 ### Advanced Options
 
 Advanced plugin settings for debugging or developing the plugin
diff --git a/plugins/trace-codex/bin/codex-hook.sh b/plugins/trace-codex/bin/codex-hook.sh
index eaed11e..102c3d9 100755
--- a/plugins/trace-codex/bin/codex-hook.sh
+++ b/plugins/trace-codex/bin/codex-hook.sh
@@ -51,8 +51,17 @@ suffix="$os_name-$arch_name"
 
 # Read the plugin version (single source of truth) from the manifest. Only on
 # this slow path, so no dependency (jq) and no parsing on the hot path.
+#
+# TRACE_CODEX_RELEASE_VERSION overrides the manifest version. This is intended
+# for testing (e.g. the smoke test pointing at an arbitrary published release);
+# normal installs leave it unset and use the manifest.
 manifest="$ROOT/.codex-plugin/plugin.json"
-version=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$manifest" 2>/dev/null | head -1)
+version="${TRACE_CODEX_RELEASE_VERSION:-}"
+if [ -z "$version" ]; then
+  version=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$manifest" 2>/dev/null | head -1)
+fi
+# Allow an optional leading 'v' on the override (v0.0.1 or 0.0.1).
+version="${version#v}"
 if [ -z "$version" ]; then
   log "could not read plugin version from $manifest; tracing disabled this session"
   exit 0
diff --git a/plugins/trace-codex/config.json.example b/plugins/trace-codex/config.json.example
index 8f05530..b9b0d93 100644
--- a/plugins/trace-codex/config.json.example
+++ b/plugins/trace-codex/config.json.example
@@ -6,6 +6,7 @@
   "appUrl": "https://www.braintrust.dev",
   "project": "my-codex-project",
   "additionalMetadata": { "team": "platform", "env": "dev" },
+  "flushOnTurnEnd": false,
   "recordFile": "",
   "port": 52734,
   "idleTimeoutMs": 300000,
diff --git a/plugins/trace-codex/scripts/mock-collector.ts b/plugins/trace-codex/scripts/mock-collector.ts
new file mode 100644
index 0000000..51c29c8
--- /dev/null
+++ b/plugins/trace-codex/scripts/mock-collector.ts
@@ -0,0 +1,116 @@
+// Mock Braintrust collector for the smoke test.
+//
+// The smoke test runs a real Codex session with the trace plugin enabled, but
+// points the Braintrust SDK at this local server instead of the real backend.
+// It implements just enough of the Braintrust HTTP API for a span flush to
+// succeed, and counts the trace rows it receives so the smoke test can assert
+// that tracing actually reported something.
+//
+// The Braintrust SDK (v3.x) makes these calls during initLogger + flush:
+//   POST {BRAINTRUST_APP_URL}/api/apikey/login       -> org_info handshake
+//   POST {BRAINTRUST_APP_URL}/api/project/register   -> resolves the project id
+//   GET  {BRAINTRUST_API_URL}/version                -> payload-limit probe (tolerated)
+//   POST {BRAINTRUST_API_URL}/logs3                  -> the actual span data
+// Pointing both BRAINTRUST_APP_URL and BRAINTRUST_API_URL at this one origin
+// captures the whole flow.
+//
+// On shutdown (SIGTERM/SIGINT) it writes a JSON summary to MOCK_COLLECTOR_OUT:
+//   { "logs3Requests": <n>, "totalRows": <n> }
+// so the smoke test can read the result after the Codex session ends.
+
+import { writeFileSync } from "node:fs";
+
+const PORT = Number(process.env.MOCK_COLLECTOR_PORT || "53999");
+const OUT = process.env.MOCK_COLLECTOR_OUT || "";
+
+let logs3Requests = 0;
+let totalRows = 0;
+
+function log(message: string): void {
+  process.stderr.write(`mock-collector: ${message}\n`);
+}
+
+function json(body: unknown, status = 200): Response {
+  return new Response(JSON.stringify(body), {
+    status,
+    headers: { "content-type": "application/json" },
+  });
+}
+
+/** Count the trace rows in a /logs3 body: { rows: [...], api_version: 2 }. */
+function countRows(body: unknown): number {
+  if (
+    typeof body === "object" &&
+    body !== null &&
+    Array.isArray((body as { rows?: unknown[] }).rows)
+  ) {
+    return (body as { rows: unknown[] }).rows.length;
+  }
+  return 0;
+}
+
+const server = Bun.serve({
+  port: PORT,
+  hostname: "127.0.0.1",
+  async fetch(req) {
+    const { pathname } = new URL(req.url);
+
+    // Login handshake. api_url/proxy_url are echoed back, but the SDK uses
+    // BRAINTRUST_API_URL (set by the smoke test) for the data plane regardless.
+    if (req.method === "POST" && pathname === "/api/apikey/login") {
+      const self = `http://127.0.0.1:${PORT}`;
+      return json({
+        org_info: [{ id: "smoke-org", name: "smoke", api_url: self, proxy_url: self }],
+      });
+    }
+
+    // Project registration -> returns a project id the rows reference.
+    if (req.method === "POST" && pathname === "/api/project/register") {
+      return json({ project: { id: "00000000-0000-0000-0000-000000000000", name: "smoke" } });
+    }
+
+    // Payload-limit probe. Tolerated by the SDK; return no limit.
+    if (req.method === "GET" && pathname === "/version") {
+      return json({ logs3_payload_max_bytes: null });
+    }
+
+    // The actual span data. Count the rows and acknowledge.
+    if (req.method === "POST" && (pathname === "/logs3" || pathname === "/logs3/overflow")) {
+      let rows = 0;
+      try {
+        rows = countRows(await req.json());
+      } catch (err) {
+        log(`failed to parse ${pathname} body: ${String(err)}`);
+      }
+      logs3Requests += 1;
+      totalRows += rows;
+      log(`received ${pathname}: ${rows} row(s) (total ${totalRows})`);
+      return json({});
+    }
+
+    // Anything else: 200 with an empty object so the SDK never hard-fails.
+    log(`unhandled ${req.method} ${pathname}`);
+    return json({});
+  },
+});
+
+log(`listening on http://127.0.0.1:${server.port}`);
+
+function writeSummaryAndExit(): void {
+  const summary = { logs3Requests, totalRows };
+  if (OUT) {
+    try {
+      writeFileSync(OUT, JSON.stringify(summary));
+      log(`wrote summary to ${OUT}: ${JSON.stringify(summary)}`);
+    } catch (err) {
+      log(`failed to write summary to ${OUT}: ${String(err)}`);
+    }
+  } else {
+    log(`summary: ${JSON.stringify(summary)}`);
+  }
+  server.stop(true);
+  process.exit(0);
+}
+
+process.on("SIGTERM", writeSummaryAndExit);
+process.on("SIGINT", writeSummaryAndExit);
diff --git a/plugins/trace-codex/scripts/smoke-test.sh b/plugins/trace-codex/scripts/smoke-test.sh
new file mode 100755
index 0000000..d91d646
--- /dev/null
+++ b/plugins/trace-codex/scripts/smoke-test.sh
@@ -0,0 +1,176 @@
+#!/bin/sh
+# End-to-end smoke test: run a real Codex session with the trace plugin enabled,
+# pointed at a local mock Braintrust collector, and assert at least one trace row
+# was reported.
+#
+# Usage:
+#   scripts/smoke-test.sh [VERSION]
+#
+# VERSION (optional; or the SMOKE_RELEASE_VERSION env var) pins the release whose
+# codex-hook binary the plugin launcher downloads, e.g. "0.0.1" or "v0.0.1". When
+# unset, the launcher uses the installed plugin manifest version (the normal
+# behavior). CI passes the version it just released; locally you can point this
+# at any published release.
+#
+# Prerequisites (the caller is responsible for these):
+#   - `codex` is on PATH and authenticated for `codex exec` (we pass
+#     CODEX_API_KEY from OPENAI_API_KEY below).
+#   - The trace-codex plugin is installed and enabled in Codex (the CI workflow
+#     installs it from the published release; locally use ./install.sh).
+#   - Bun is on PATH (used to run the mock collector).
+#   - OPENAI_API_KEY is set.
+#   - For a private release repo, GH_TOKEN or GITHUB_TOKEN (or an authenticated
+#     `gh`) so the launcher can download the binary.
+#
+# Exit 0 on success (>=1 trace row received), non-zero otherwise.
+
+set -eu
+
+# Resolve the plugin dir from this script's location so it works from anywhere.
+SCRIPT_DIR=$(CDPATH= cd "$(dirname "$0")" && pwd)
+PLUGIN_DIR=$(dirname "$SCRIPT_DIR")
+
+# Target release version: positional arg wins, then env, else empty (= manifest).
+RELEASE_VERSION="${1:-${SMOKE_RELEASE_VERSION:-}}"
+
+PORT="${MOCK_COLLECTOR_PORT:-53999}"
+BASE_URL="http://127.0.0.1:$PORT"
+
+log() { printf 'smoke: %s\n' "$1" >&2; }
+fail() { printf 'smoke: FAIL: %s\n' "$1" >&2; exit 1; }
+
+# Redact the API key from any text we print (defense-in-depth for the public
+# repo: GitHub already masks the secret, but we never want to echo subprocess
+# output that might contain it, transformed or otherwise). Reads stdin, writes
+# scrubbed text to stderr. A no-op if the key is empty.
+scrub() {
+  if [ -n "${OPENAI_API_KEY:-}" ]; then
+    sed "s/${OPENAI_API_KEY}/[REDACTED]/g" 2>/dev/null || cat
+  else
+    cat
+  fi
+}
+
+# --- Preconditions ---------------------------------------------------------
+
+[ -n "${OPENAI_API_KEY:-}" ] || fail "OPENAI_API_KEY is not set"
+command -v codex >/dev/null 2>&1 || fail "codex is not on PATH"
+command -v bun >/dev/null 2>&1 || fail "bun is not on PATH"
+
+# Belt-and-suspenders: explicitly mask the key in GitHub Actions logs. This
+# covers any reformatted appearance beyond GitHub's automatic secret masking.
+# Harmless (and silent) outside Actions.
+if [ -n "${GITHUB_ACTIONS:-}" ]; then
+  printf '::add-mask::%s\n' "$OPENAI_API_KEY"
+fi
+
+WORK=$(mktemp -d)
+SUMMARY="$WORK/summary.json"
+COLLECTOR_LOG="$WORK/collector.log"
+CODEX_LOG="$WORK/codex.log"
+
+COLLECTOR_PID=""
+cleanup() {
+  # Ask the collector to flush its summary and stop; fall back to KILL.
+  if [ -n "$COLLECTOR_PID" ] && kill -0 "$COLLECTOR_PID" 2>/dev/null; then
+    kill -TERM "$COLLECTOR_PID" 2>/dev/null || true
+    # Give it a moment to write the summary.
+    for _ in 1 2 3 4 5 6 7 8 9 10; do
+      kill -0 "$COLLECTOR_PID" 2>/dev/null || break
+      sleep 0.2
+    done
+    kill -KILL "$COLLECTOR_PID" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT INT TERM
+
+# --- Start the mock collector ----------------------------------------------
+
+log "starting mock collector on $BASE_URL"
+MOCK_COLLECTOR_PORT="$PORT" MOCK_COLLECTOR_OUT="$SUMMARY" \
+  bun run "$PLUGIN_DIR/scripts/mock-collector.ts" >"$COLLECTOR_LOG" 2>&1 &
+COLLECTOR_PID=$!
+
+# Wait for it to accept connections.
+ready=0
+for _ in $(seq 1 50); do
+  if curl -fsS -o /dev/null --max-time 1 "$BASE_URL/version" 2>/dev/null; then
+    ready=1
+    break
+  fi
+  kill -0 "$COLLECTOR_PID" 2>/dev/null || break
+  sleep 0.2
+done
+if [ "$ready" != "1" ]; then
+  scrub <"$COLLECTOR_LOG" >&2 || true
+  fail "mock collector never became ready on $BASE_URL"
+fi
+log "collector ready"
+
+# --- Run a real Codex session with tracing pointed at the collector --------
+
+# These BRAINTRUST_* vars are read by the hook client (inherited from this
+# process via Codex) and bundled into the session's reporting config. Pointing
+# both URLs at the collector captures the login handshake and the data POST.
+export TRACE_TO_BRAINTRUST=true
+export BRAINTRUST_API_KEY=smoke-key
+export BRAINTRUST_API_URL="$BASE_URL"
+export BRAINTRUST_APP_URL="$BASE_URL"
+export BRAINTRUST_PROJECT=trace-codex-smoke
+# Keep the background event server short-lived so it doesn't linger after CI.
+export BRAINTRUST_EVENT_SERVER_IDLE_TIMEOUT_MS=15000
+export BRAINTRUST_EVENT_SERVER_IDLE_CHECK_INTERVAL_MS=1000
+# Block on the terminal-event flush so the final spans are delivered before
+# `codex exec` returns. Without this the hook just enqueues and relies on the
+# server's idle-drain flush, which may not fire before this short-lived run tears
+# the process tree down — making the assertion below flaky.
+export BRAINTRUST_FLUSH_ON_TURN_END=true
+
+# Pin the release the launcher downloads, when requested. The launcher strips an
+# optional leading 'v', so either form is fine here.
+if [ -n "$RELEASE_VERSION" ]; then
+  export TRACE_CODEX_RELEASE_VERSION="$RELEASE_VERSION"
+  log "pinning plugin binary to release $RELEASE_VERSION"
+else
+  log "using installed plugin manifest version (no pin)"
+fi
+
+log "running codex exec (say hi)"
+# CODEX_API_KEY is supported only by `codex exec` and is set inline for just
+# this invocation. --dangerously-bypass-hook-trust runs the plugin's hooks
+# without the interactive /hooks trust prompt (one-off automation escape hatch).
+set +e
+CODEX_API_KEY="$OPENAI_API_KEY" codex exec \
+  --skip-git-repo-check \
+  --dangerously-bypass-hook-trust \
+  --sandbox read-only \
+  "say hi" >"$CODEX_LOG" 2>&1
+codex_status=$?
+set -e
+log "codex exec exited with status $codex_status"
+if [ "$codex_status" -ne 0 ]; then
+  log "--- codex output (key redacted) ---"
+  scrub <"$CODEX_LOG" >&2 || true
+fi
+
+# With BRAINTRUST_FLUSH_ON_TURN_END=true (set above), the hook client blocks
+# on a synchronous flush at session end, so by the time codex exec returns the
+# trace should already be delivered. Stop the collector (via cleanup) to make it
+# write its summary, then assert.
+cleanup
+COLLECTOR_PID=""  # already stopped; avoid double-kill in the EXIT trap
+
+log "--- collector log (key redacted) ---"
+scrub <"$COLLECTOR_LOG" >&2 || true
+
+[ -f "$SUMMARY" ] || fail "collector wrote no summary file"
+total_rows=$(sed -n 's/.*"totalRows"[[:space:]]*:[[:space:]]*\([0-9]*\).*/\1/p' "$SUMMARY")
+[ -n "$total_rows" ] || fail "could not parse totalRows from summary: $(cat "$SUMMARY")"
+log "summary: $(cat "$SUMMARY")"
+
+if [ "$total_rows" -ge 1 ]; then
+  log "PASS: received $total_rows trace row(s)"
+  exit 0
+fi
+
+fail "expected >=1 trace row, got $total_rows"
diff --git a/plugins/trace-codex/scripts/token-proxy.ts b/plugins/trace-codex/scripts/token-proxy.ts
new file mode 100644
index 0000000..3344402
--- /dev/null
+++ b/plugins/trace-codex/scripts/token-proxy.ts
@@ -0,0 +1,372 @@
+// Token-counting proxy for verifying trace-codex's token accounting.
+//
+// Codex has no per-session token readout we can compare against, so this is a
+// standalone debugging tool: a local HTTP proxy that sits between Codex and the
+// real OpenAI API. It forwards every request through unchanged and tees the
+// streaming response, parsing each model call's `usage` straight off the wire.
+// You then compare these ground-truth numbers against what trace-codex records
+// in Braintrust.
+//
+// It is NOT part of either plugin's runtime — purely a manual verification aid.
+//
+// ── Usage ──────────────────────────────────────────────────────────────────
+//   1. Run the proxy:
+//        bun run scripts/token-proxy.ts
+//      It listens on http://127.0.0.1:53800 and, by default, forwards to the
+//      ChatGPT backend (https://chatgpt.com) — for ChatGPT-login Codex, the
+//      common case. It logs every request it receives.
+//
+//   2. Point Codex at it via a custom provider in ~/.codex/config.toml. These
+//      keys are USER-LEVEL only and MUST go ABOVE any [table] headers (in TOML,
+//      keys after a `[table]` header belong to that table), e.g. at the very top
+//      of the file:
+//
+//      For ChatGPT login (auth_mode=chatgpt — check ~/.codex/auth.json):
+//        model_provider = "token-proxy"
+//        [model_providers.token-proxy]
+//        name = "token-proxy"
+//        # base_url must contain "/backend-api" so Codex uses ChatGPT path style;
+//        # the proxy forwards this exact path to https://chatgpt.com.
+//        base_url = "http://127.0.0.1:53800/backend-api/codex"
+//        requires_openai_auth = true   # relay the ChatGPT token + account headers
+//        wire_api = "responses"
+//
+//      For API-key login instead: run the proxy with
+//        TOKEN_PROXY_UPSTREAM=https://api.openai.com bun run scripts/token-proxy.ts
+//      and use:
+//        model_provider = "token-proxy"
+//        [model_providers.token-proxy]
+//        name = "token-proxy"
+//        base_url = "http://127.0.0.1:53800/v1"
+//        env_key = "OPENAI_API_KEY"
+//        wire_api = "responses"
+//
+//   3. Run a Codex session as usual. Each model call prints a usage line, and a
+//      running session total is kept. A machine-readable NDJSON log is written to
+//      TOKEN_PROXY_OUT (default: token-proxy.ndjson) — one line per call plus a
+//      final totals line — so you can diff against the trace.
+//
+// ── How usage is read ────────────────────────────────────────────────────────
+// Codex uses the Responses API with SSE streaming. The terminal
+// `response.completed` event carries `response.usage`:
+//   { input_tokens, output_tokens, total_tokens,
+//     input_tokens_details: { cached_tokens },
+//     output_tokens_details: { reasoning_tokens } }
+// We pass the stream through byte-for-byte to Codex while parsing a copy, so the
+// proxy is transparent. Non-streaming JSON responses are handled too (we read
+// `usage` from the parsed body) for completeness.
+
+import { appendFileSync, writeFileSync } from "node:fs";
+
+const PORT = Number(process.env.TOKEN_PROXY_PORT || "53800");
+// Upstream ORIGIN the proxy forwards to (scheme://host, no path). The proxy is a
+// transparent tunnel: it forwards the exact request path Codex builds onto this
+// origin. Default is the ChatGPT host Codex uses when logged in via ChatGPT
+// (auth_mode=chatgpt) — the common case. For an API-key login, set
+// TOKEN_PROXY_UPSTREAM=https://api.openai.com (and use the api-key provider).
+const UPSTREAM = (
+  process.env.TOKEN_PROXY_UPSTREAM ||
+  process.env.OPENAI_BASE_URL ||
+  "https://chatgpt.com"
+).replace(/\/+$/, "");
+const OUT = process.env.TOKEN_PROXY_OUT || "/tmp/token-proxy.ndjson";
+// When set, dump the raw upstream SSE event blocks to TOKEN_PROXY_DEBUG_OUT so we
+// can inspect the exact event shape (the ChatGPT backend may differ from the
+// public Responses API).
+const DEBUG = process.env.TOKEN_PROXY_DEBUG === "1";
+const DEBUG_OUT = process.env.TOKEN_PROXY_DEBUG_OUT || "token-proxy-raw.log";
+
+/** Per-call usage, normalized to the keys we care about. */
+interface Usage {
+  input_tokens: number;
+  cached_input_tokens: number;
+  output_tokens: number;
+  reasoning_output_tokens: number;
+  total_tokens: number;
+}
+
+const ZERO_USAGE: Usage = {
+  input_tokens: 0,
+  cached_input_tokens: 0,
+  output_tokens: 0,
+  reasoning_output_tokens: 0,
+  total_tokens: 0,
+};
+
+// Running totals across the proxy's lifetime (one session, typically).
+let callCount = 0;
+const sessionTotal: Usage = { ...ZERO_USAGE };
+
+function log(message: string): void {
+  process.stderr.write(`token-proxy: ${message}\n`);
+}
+
+// Truncate the NDJSON log on startup so each run is a fresh capture.
+try {
+  writeFileSync(OUT, "");
+} catch (err) {
+  log(`could not open ${OUT}: ${String(err)} (file logging disabled)`);
+}
+
+if (DEBUG) {
+  try {
+    writeFileSync(DEBUG_OUT, "");
+    log(`DEBUG on: dumping raw upstream SSE blocks to ${DEBUG_OUT}`);
+  } catch {
+    // ignore
+  }
+}
+
+function record(line: unknown): void {
+  try {
+    appendFileSync(OUT, `${JSON.stringify(line)}\n`);
+  } catch {
+    // best-effort; the stderr line is the primary output
+  }
+}
+
+function num(v: unknown): number {
+  return typeof v === "number" && Number.isFinite(v) ? v : 0;
+}
+
+/**
+ * Pull usage out of a Responses API `response` object (the shape under both the
+ * `response.completed` SSE event and a non-streaming JSON body). Returns null if
+ * there is no usage block.
+ */
+function usageFromResponse(response: unknown): Usage | null {
+  if (response === null || typeof response !== "object") return null;
+  const usage = (response as { usage?: unknown }).usage;
+  if (usage === null || typeof usage !== "object") return null;
+  const u = usage as Record<string, unknown>;
+  const inputDetails = (u.input_tokens_details ?? {}) as Record<string, unknown>;
+  const outputDetails = (u.output_tokens_details ?? {}) as Record<string, unknown>;
+  return {
+    input_tokens: num(u.input_tokens),
+    cached_input_tokens: num(inputDetails.cached_tokens),
+    output_tokens: num(u.output_tokens),
+    reasoning_output_tokens: num(outputDetails.reasoning_tokens),
+    total_tokens: num(u.total_tokens),
+  };
+}
+
+/** Record a call's usage: bump totals, print a line, append to the NDJSON log. */
+function reportUsage(usage: Usage, model: string | undefined): void {
+  callCount += 1;
+  sessionTotal.input_tokens += usage.input_tokens;
+  sessionTotal.cached_input_tokens += usage.cached_input_tokens;
+  sessionTotal.output_tokens += usage.output_tokens;
+  sessionTotal.reasoning_output_tokens += usage.reasoning_output_tokens;
+  sessionTotal.total_tokens += usage.total_tokens;
+
+  log(
+    `call #${callCount}${model ? ` (${model})` : ""}: ` +
+      `input=${usage.input_tokens} (cached=${usage.cached_input_tokens}) ` +
+      `output=${usage.output_tokens} (reasoning=${usage.reasoning_output_tokens}) ` +
+      `total=${usage.total_tokens}  ||  session total=${sessionTotal.total_tokens}`,
+  );
+  record({ kind: "call", call: callCount, model, usage, sessionTotal: { ...sessionTotal } });
+}
+
+/**
+ * A TransformStream that passes bytes through untouched while scanning the SSE
+ * text for the terminal usage. Codex receives the exact upstream bytes; we just
+ * watch a copy. We look for `response.completed` (and tolerate `response.usage`
+ * appearing on other terminal events) and parse the embedded JSON object's
+ * usage. The model id is pulled from the same event when present.
+ */
+function makeUsageSniffer(): TransformStream<Uint8Array, Uint8Array> {
+  const decoder = new TextDecoder();
+  let buffer = "";
+  let reported = false;
+
+  const tryParseEvent = (block: string): void => {
+    if (block.trim().length === 0) return;
+    if (DEBUG) {
+      try {
+        appendFileSync(DEBUG_OUT, `${block}\n---\n`);
+      } catch {
+        // ignore
+      }
+    }
+    // An SSE event block is lines like "event: ...\ndata: {json}". Concatenate
+    // all data: lines (the Responses API sends single-line JSON, but be safe).
+    const dataLines: string[] = [];
+    for (const line of block.split("\n")) {
+      const trimmed = line.replace(/\r$/, "");
+      if (trimmed.startsWith("data:")) dataLines.push(trimmed.slice(5).trimStart());
+    }
+    if (dataLines.length === 0) return;
+    const data = dataLines.join("");
+    if (data === "[DONE]" || data.length === 0) return;
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(data);
+    } catch {
+      return; // not JSON we can read; ignore
+    }
+    if (parsed === null || typeof parsed !== "object") return;
+    const obj = parsed as Record<string, unknown>;
+    // Find a usage block wherever it lives: nested under `response` (the
+    // completed/incomplete events) or at the top level. Report each one we see —
+    // a single model call's stream carries usage once, so the last one wins if
+    // there are several. (Don't short-circuit on the first: some backends emit a
+    // partial usage earlier in the stream.)
+    const response = (obj.response ?? obj) as Record<string, unknown>;
+    const usage = usageFromResponse(response) ?? usageFromResponse(obj);
+    if (usage === null) return;
+    const model =
+      typeof response.model === "string"
+        ? response.model
+        : typeof obj.model === "string"
+          ? obj.model
+          : undefined;
+    reportUsage(usage, model);
+    reported = true;
+  };
+
+  // Whole-body accumulator for the non-SSE JSON fallback, capped so a huge
+  // response can't balloon memory (usage lives in the small completed event /
+  // top-level object, so a few MB is plenty).
+  let fullBody = "";
+  const FULL_BODY_CAP = 4_000_000;
+
+  // Last-resort: try to parse the entire body as one JSON object and pull usage
+  // from it (covers non-streaming JSON responses).
+  const tryParseWholeBody = (): void => {
+    if (reported || fullBody.length === 0) return;
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(fullBody);
+    } catch {
+      return;
+    }
+    if (parsed === null || typeof parsed !== "object") return;
+    const obj = parsed as Record<string, unknown>;
+    const response = (obj.response ?? obj) as Record<string, unknown>;
+    const usage = usageFromResponse(response) ?? usageFromResponse(obj);
+    if (usage === null) return;
+    const model =
+      typeof response.model === "string"
+        ? response.model
+        : typeof obj.model === "string"
+          ? obj.model
+          : undefined;
+    reportUsage(usage, model);
+    reported = true;
+  };
+
+  return new TransformStream<Uint8Array, Uint8Array>({
+    transform(chunk, controller) {
+      controller.enqueue(chunk); // pass through immediately, unmodified
+      const text = decoder.decode(chunk, { stream: true });
+      buffer += text;
+      if (fullBody.length < FULL_BODY_CAP) fullBody += text;
+      // SSE events are separated by a blank line. Process complete blocks.
+      let sep = buffer.indexOf("\n\n");
+      while (sep !== -1) {
+        const block = buffer.slice(0, sep);
+        buffer = buffer.slice(sep + 2);
+        tryParseEvent(block);
+        sep = buffer.indexOf("\n\n");
+      }
+    },
+    flush() {
+      if (buffer.length > 0) tryParseEvent(buffer);
+      // SSE parsing found nothing — try the whole body as a single JSON object.
+      tryParseWholeBody();
+      if (!reported) {
+        log(`call completed with no usage found in response (path may be non-model)`);
+      }
+    },
+  });
+}
+
+/** Build the upstream URL: UPSTREAM + the part of the path after our `/v1`. */
+function upstreamUrl(reqUrl: string): string {
+  // Transparent tunnel: forward the exact path + query Codex built onto the
+  // upstream origin. (The provider's base_url is configured so Codex builds the
+  // correct downstream path; the proxy never rewrites it.)
+  const { pathname, search } = new URL(reqUrl);
+  return `${UPSTREAM}${pathname}${search}`;
+}
+
+const server = Bun.serve({
+  port: PORT,
+  hostname: "127.0.0.1",
+  // Generous: model calls can stream for minutes.
+  idleTimeout: 0,
+  async fetch(req) {
+    const target = upstreamUrl(req.url);
+    const path = new URL(req.url).pathname;
+    // Log every request so it's obvious when Codex is (or isn't) hitting us.
+    log(`${req.method} ${path} -> ${target}`);
+
+    // Forward the request verbatim (method, headers, body). Codex already set
+    // the Authorization header (ChatGPT token or API key) and the account/session
+    // headers, so we just relay them. Drop `host` so fetch derives the correct
+    // upstream host (otherwise it carries our 127.0.0.1:PORT and breaks routing).
+    const headers = new Headers(req.headers);
+    headers.delete("host");
+    let upstream: Response;
+    try {
+      upstream = await fetch(target, {
+        method: req.method,
+        headers,
+        body: req.body,
+        // `duplex: "half"` is required to stream a request body; not in the DOM
+        // RequestInit type, so widen via a cast.
+        ...({ duplex: "half" } as Record<string, unknown>),
+        redirect: "manual",
+      });
+    } catch (err) {
+      log(`upstream fetch failed for ${req.method} ${path}: ${String(err)}`);
+      return new Response(JSON.stringify({ error: "proxy_upstream_failed" }), {
+        status: 502,
+        headers: { "content-type": "application/json" },
+      });
+    }
+
+    const contentType = upstream.headers.get("content-type") ?? "";
+    log(`  <- ${upstream.status} ${contentType || "(no content-type)"}`);
+
+    // Always tee the body through the usage sniffer. We do NOT gate on
+    // content-type: the ChatGPT backend streams SSE with no (or an unexpected)
+    // content-type header, so matching "text/event-stream" misses it. The sniffer
+    // passes bytes through unchanged and looks for usage in either SSE `data:`
+    // events or a whole-body JSON object, so it's safe for any response.
+    if (upstream.body) {
+      const teed = upstream.body.pipeThrough(makeUsageSniffer());
+      return new Response(teed, {
+        status: upstream.status,
+        statusText: upstream.statusText,
+        headers: upstream.headers,
+      });
+    }
+
+    // No body (e.g. a HEAD or 204): pass straight through.
+    return new Response(upstream.body, {
+      status: upstream.status,
+      statusText: upstream.statusText,
+      headers: upstream.headers,
+    });
+  },
+});
+
+log(`listening on http://127.0.0.1:${server.port} -> ${UPSTREAM}`);
+log(`writing per-call usage to ${OUT}`);
+
+function summarize(): void {
+  log(
+    `SESSION TOTAL over ${callCount} call(s): ` +
+      `input=${sessionTotal.input_tokens} (cached=${sessionTotal.cached_input_tokens}) ` +
+      `output=${sessionTotal.output_tokens} (reasoning=${sessionTotal.reasoning_output_tokens}) ` +
+      `total=${sessionTotal.total_tokens}`,
+  );
+  record({ kind: "session_total", calls: callCount, sessionTotal: { ...sessionTotal } });
+  server.stop(true);
+  process.exit(0);
+}
+
+process.on("SIGTERM", summarize);
+process.on("SIGINT", summarize);
diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts
index ab63e57..7f5da2f 100644
--- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts
+++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts
@@ -1,175 +1,1392 @@
 import { describe, expect, test } from "bun:test";
 import type { ReportingConfig, SpanFactory } from "../../braintrust/logger.ts";
 import type { EnqueueEvent } from "../../server/routes.ts";
-import { createTestLogger, withCapturedTrace } from "../../test-helpers.ts";
+import { createTestLogger, spansToTree, withCapturedTrace } from "../../test-helpers.ts";
 import { CODEX_CONFIG_EVENT } from "./event-builder.ts";
 import { CodexEventProcessor } from "./event-processor.ts";
 import {
   assertProducesTrace,
+  assistantMessage,
+  compacted,
   configEvent,
+  customToolCall,
+  customToolCallOutput,
+  FakeTranscriptReader,
+  functionCall,
+  functionCallOutput,
+  MultiFileTranscriptReader,
+  postCompact,
+  postToolUse,
+  preCompact,
+  preToolUse,
+  reasoning,
+  sessionMeta,
   sessionStart,
   stop,
-  userPromptSubmit,
+  subagentStart,
+  subagentStop,
+  TEST_TRANSCRIPT_PATH,
+  taskComplete,
+  taskStarted,
+  tokenCount,
+  transcript,
+  turnContext,
+  userMessage,
+  userMessageItem,
 } from "./test-helpers.ts";
+import type {
+  TranscriptReader,
+  TranscriptReadResult,
+  TranscriptWaitResult,
+  WaitForOptions,
+} from "./transcript-reader.ts";
 
-describe("CodexEventProcessor", () => {
-  test("SessionStart produces a root span", async () => {
+// Helpers below build a session as an ordered list of transcript writes and
+// hooks. The final `stop()` hook triggers the catch-up that turns the buffered
+// transcript records into spans (waiting for the task_complete sentinel).
+
+describe("CodexEventProcessor: root span", () => {
+  test("session_meta opens a root span named after the cwd basename", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart({ source: "startup" }),
+        sessionMeta({ cwd: "/whatever/myapp", id: "session-1" }),
+        turnContext({ model: "gpt-5.5" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: myapp", type: "task" },
+        metadata: {
+          session_id: "session-1",
+          model: "gpt-5.5",
+          source: "startup",
+          transcript_path: TEST_TRANSCRIPT_PATH,
+        },
+        children: [],
+      },
+    );
+  });
+
+  test("root span name handles a trailing slash in cwd", async () => {
+    await assertProducesTrace(
+      [sessionStart(), sessionMeta({ cwd: "/whatever/myapp/" }), stop({ turn_id: "t1" })],
+      { span_attributes: { name: "codex: myapp", type: "task" }, children: [] },
+    );
+  });
+
+  test("root span falls back to 'codex session' when cwd is missing", async () => {
+    await assertProducesTrace([sessionStart(), sessionMeta({}), stop({ turn_id: "t1" })], {
+      span_attributes: { name: "codex session", type: "task" },
+      children: [],
+    });
+  });
+
+  test("model is backfilled from turn_context", async () => {
+    await assertProducesTrace(
+      [sessionStart(), sessionMeta({ cwd: "/work" }), turnContext({ model: "gpt-5.5" }), stop({})],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        metadata: { model: "gpt-5.5" },
+        children: [],
+      },
+    );
+  });
+
+  test("root carries source and permission_mode from the SessionStart hook", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart({ source: "resume", permission_mode: "acceptEdits" }),
+        sessionMeta({ cwd: "/work" }),
+        stop({}),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        metadata: { source: "resume", permission_mode: "acceptEdits" },
+        children: [],
+      },
+    );
+  });
+
+  test("duplicate session_meta still yields a single root span", async () => {
+    await assertProducesTrace(
+      [sessionStart(), sessionMeta({ cwd: "/work" }), sessionMeta({ cwd: "/work" }), stop({})],
+      { span_attributes: { name: "codex: work", type: "task" }, children: [] },
+    );
+  });
+
+  test("Stop ends the root span", async () => {
+    await assertProducesTrace([sessionStart(), sessionMeta({ cwd: "/work" }), stop({})], {
+      span_attributes: { name: "codex: work", type: "task" },
+      ended: true,
+      children: [],
+    });
+  });
+});
+
+describe("CodexEventProcessor: turn spans", () => {
+  test("task_started/task_complete become a turn span with prompt input and assistant output", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "what's your name?" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "I'm Codex." }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            input: "what's your name?",
+            output: "I'm Codex.",
+            metadata: { turn_id: "t1" },
+            ended: true,
+            children: [],
+          },
+        ],
+      },
+    );
+  });
+
+  test("two turns become two ordered child spans", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "one" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "first" }),
+        taskStarted({ turn_id: "t2" }),
+        userMessage({ message: "two" }),
+        taskComplete({ turn_id: "t2", last_agent_message: "second" }),
+        stop({ turn_id: "t2" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true },
+          { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true },
+        ],
+      },
+    );
+  });
+
+  test("a turn whose task_complete lands after its Stop is closed by the final flush catch-up", async () => {
+    // Regression: the last turn's task_complete can be written just after its
+    // Stop hook fires, so the Stop's bounded wait misses it. The final flush
+    // read must still pick it up and close the turn. Here task_complete is
+    // ordered AFTER stop() in the list (so it is only present at flush time).
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "one" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "first" }),
+        stop({ turn_id: "t1" }),
+        taskStarted({ turn_id: "t2" }),
+        userMessage({ message: "two" }),
+        // Tool call/output present at Stop time, but task_complete is not yet.
+        functionCall({ turn_id: "t2", name: "exec_command", call_id: "c2" }),
+        functionCallOutput({ call_id: "c2", output: "ok" }),
+        stop({ turn_id: "t2" }),
+        // task_complete written after the Stop — only seen by the flush catch-up.
+        taskComplete({ turn_id: "t2", last_agent_message: "second" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true },
+          {
+            span_attributes: { name: "turn: t2", type: "task" },
+            output: "second",
+            ended: true,
+            children: [
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              { span_attributes: { name: "exec_command", type: "tool" }, ended: true },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  test("each turn closes when its own Stop arrives (per-turn Stops)", async () => {
+    // Mirrors real Codex: Stop fires per turn. Turn 1 is processed/closed on its
+    // own Stop; turn 2 opens after and closes on the second Stop. Regression for
+    // the bug where the second turn never closed.
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "one" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "first" }),
+        stop({ turn_id: "t1" }),
+        taskStarted({ turn_id: "t2" }),
+        userMessage({ message: "two" }),
+        taskComplete({ turn_id: "t2", last_agent_message: "second" }),
+        stop({ turn_id: "t2" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true },
+          { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true },
+        ],
+      },
+    );
+  });
+
+  test("a turn still open when the root ends is closed by the root-end backstop", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "hi" }),
+        // No task_complete; the Stop hook ends the root and closes the turn.
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          { span_attributes: { name: "turn: t1", type: "task" }, input: "hi", ended: true },
+        ],
+      },
+    );
+  });
+});
+
+describe("CodexEventProcessor: span timing (from transcript timestamps)", () => {
+  // :10Z=1767225610, :11=...611, :12=...612, :13=...613, :14=...614, :15=...615.
+  test("an llm span starts at its turn's start (first child), not its output record", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        transcript({
+          type: "session_meta",
+          timestamp: "2026-01-01T00:00:10Z",
+          payload: { cwd: "/work", id: "session-1" },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:11Z",
+          payload: { type: "task_started", turn_id: "t1" },
+        }),
+        // LLM call: the assistant message lands at (:12) — but the model's work
+        // for this call began at the turn's start (:11), since this is the first
+        // child. It requests a tool (:13) — that's its last output, so the llm
+        // span ends there. The tool result arrives (:14) and the token_count
+        // (:15) supplies metrics only (it does NOT define the llm span's end).
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:12Z",
+          payload: {
+            type: "message",
+            role: "assistant",
+            content: [{ type: "output_text", text: "go" }],
+          },
+        }),
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:13Z",
+          payload: {
+            type: "function_call",
+            name: "exec_command",
+            call_id: "c1",
+            metadata: { turn_id: "t1" },
+          },
+        }),
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:14Z",
+          payload: { type: "function_call_output", call_id: "c1", output: "ok" },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:15Z",
+          payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:16Z",
+          payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" },
+        }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        metrics: { start: 1767225610, end: 1767225616 },
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            metrics: { start: 1767225611, end: 1767225616 },
+            ended: true,
+            children: [
+              {
+                // LLM call: starts at the turn's start (:11, first child), not
+                // the message record (:12); ENDS when it emits the tool call (:13),
+                // its last output — NOT at token_count (:15). This is the fix for
+                // the llm span swallowing the tool's execution.
+                span_attributes: { name: "llm", type: "llm" },
+                metrics: { start: 1767225611, end: 1767225613 },
+                ended: true,
+              },
+              {
+                // Tool span uses its own record timestamps: function_call (:13)
+                // to function_call_output (:14). Contiguous with the llm (no
+                // overlap): llm ends at :13, tool starts at :13.
+                span_attributes: { name: "exec_command", type: "tool" },
+                metrics: { start: 1767225613, end: 1767225614 },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  // When an llm call is the only child of its turn, its span should span almost
+  // the whole turn (start at turn start, end at token_count just before
+  // task_complete) — not appear near-instant.
+  test("a lone llm span fills its turn", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        transcript({
+          type: "session_meta",
+          timestamp: "2026-01-01T00:00:10Z",
+          payload: { cwd: "/work", id: "session-1" },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:11Z",
+          payload: { type: "task_started", turn_id: "t1" },
+        }),
+        // The model "thinks" for several seconds; its output only lands at :15.
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:15Z",
+          payload: {
+            type: "message",
+            role: "assistant",
+            content: [{ type: "output_text", text: "answer" }],
+          },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:15Z",
+          payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:16Z",
+          payload: { type: "task_complete", turn_id: "t1", last_agent_message: "answer" },
+        }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            metrics: { start: 1767225611, end: 1767225616 },
+            children: [
+              {
+                // Starts at turn start (:11), not the output record (:15).
+                span_attributes: { name: "llm", type: "llm" },
+                metrics: { start: 1767225611, end: 1767225615 },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  // Regression for the "front-of-turn gap" bug: a model call whose ONLY output is
+  // a tool call (no assistant message or readable reasoning before it) must still
+  // open an llm span, so the model's work (often many seconds of thinking before
+  // it emits the tool call) is covered rather than leaving an unaccounted gap
+  // between the turn start and the first child.
+  test("a tool-only model call still opens an llm span filling the front of the turn", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        transcript({
+          type: "session_meta",
+          timestamp: "2026-01-01T00:00:10Z",
+          payload: { cwd: "/work", id: "session-1" },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:11Z",
+          payload: { type: "task_started", turn_id: "t1" },
+        }),
+        // The model "thinks" for ~20s, then its first (and only) output for this
+        // call is a tool call at :30 — no preceding message or reasoning.
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:30Z",
+          payload: {
+            type: "function_call",
+            name: "exec_command",
+            call_id: "c1",
+            metadata: { turn_id: "t1" },
+          },
+        }),
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:32Z",
+          payload: { type: "function_call_output", call_id: "c1", output: "ok" },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:32Z",
+          payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:33Z",
+          payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" },
+        }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            metrics: { start: 1767225611, end: 1767225633 },
+            children: [
+              {
+                // The llm span covers the model's work from the turn start (:11)
+                // until it emits the tool call (:30, its only output) — filling
+                // the ~20s "thinking" gap. It does NOT extend to the token_count
+                // (:32); that would overlap the tool, which is impossible.
+                span_attributes: { name: "llm", type: "llm" },
+                metrics: { start: 1767225611, end: 1767225630 },
+                ended: true,
+              },
+              {
+                // The tool runs :30 -> :32, contiguous with (not inside) the llm.
+                span_attributes: { name: "exec_command", type: "tool" },
+                metrics: { start: 1767225630, end: 1767225632 },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  // Two model calls separated by a tool: the second call must start where the
+  // tool ended (the model's most recent sibling), so the spans tile the turn
+  // contiguously instead of each looking instant.
+  test("a later llm span starts where the prior sibling (a tool) ended", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        transcript({
+          type: "session_meta",
+          timestamp: "2026-01-01T00:00:10Z",
+          payload: { cwd: "/work", id: "session-1" },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:11Z",
+          payload: { type: "task_started", turn_id: "t1" },
+        }),
+        // Call 1: opens at turn start (:11) on its first output item (an
+        // assistant message at :11), then emits a tool call (:12, its last
+        // output → llm-1 ends there). The token_count (:13) is metrics-only.
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:11Z",
+          payload: {
+            type: "message",
+            role: "assistant",
+            content: [{ type: "output_text", text: "let me check" }],
+          },
+        }),
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:12Z",
+          payload: {
+            type: "function_call",
+            name: "exec_command",
+            call_id: "c1",
+            metadata: { turn_id: "t1" },
+          },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:13Z",
+          payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } },
+        }),
+        // Tool runs :12 -> :14 (output lands after call 1's token_count).
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:14Z",
+          payload: { type: "function_call_output", call_id: "c1", output: "ok" },
+        }),
+        // Call 2: the model resumes after the tool result. Its output lands at
+        // :18, but it began working when the tool returned (:14).
+        transcript({
+          type: "response_item",
+          timestamp: "2026-01-01T00:00:18Z",
+          payload: {
+            type: "message",
+            role: "assistant",
+            content: [{ type: "output_text", text: "done" }],
+          },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:18Z",
+          payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } },
+        }),
+        transcript({
+          type: "event_msg",
+          timestamp: "2026-01-01T00:00:19Z",
+          payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" },
+        }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            children: [
+              {
+                // Call 1: turn start (:11) -> its last output, the tool call
+                // (:12). Contiguous with the tool; does NOT extend to :13/:14.
+                span_attributes: { name: "llm", type: "llm" },
+                metrics: { start: 1767225611, end: 1767225612 },
+                ended: true,
+              },
+              {
+                // Tool: :12 -> :14.
+                span_attributes: { name: "exec_command", type: "tool" },
+                metrics: { start: 1767225612, end: 1767225614 },
+                ended: true,
+              },
+              {
+                // Call 2: starts at the tool's end (:14), the most recent sibling
+                // end — NOT its own output record (:18). Ends at its last output,
+                // the message (:18). The token_count (:18) is metrics-only.
+                span_attributes: { name: "llm", type: "llm" },
+                metrics: { start: 1767225614, end: 1767225618 },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+});
+
+describe("CodexEventProcessor: tool spans", () => {
+  test("a function_call/output pair becomes a tool span under its turn", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "run it" }),
+        functionCall({
+          turn_id: "t1",
+          name: "exec_command",
+          call_id: "call_1",
+          arguments: '{"cmd":"ls"}',
+        }),
+        functionCallOutput({ call_id: "call_1", output: "file.txt" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            output: "done",
+            ended: true,
+            children: [
+              // The model call that emitted the tool call (its only output) opens
+              // an llm span, so the model's work isn't an unaccounted gap.
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              {
+                span_attributes: { name: "exec_command", type: "tool" },
+                input: '{"cmd":"ls"}',
+                output: "file.txt",
+                metadata: { tool_name: "exec_command", call_id: "call_1", turn_id: "t1" },
+                ended: true,
+                children: [],
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  test("custom_tool_call (apply_patch) becomes a tool span using the raw name", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        customToolCall({
+          turn_id: "t1",
+          name: "apply_patch",
+          call_id: "call_2",
+          input: "*** Begin Patch",
+        }),
+        customToolCallOutput({ call_id: "call_2", output: "Success" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "ok" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              {
+                span_attributes: { name: "apply_patch", type: "tool" },
+                input: "*** Begin Patch",
+                output: "Success",
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  test("multiple tool calls in one turn become ordered sibling tool spans", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1" }),
+        functionCallOutput({ call_id: "c1", output: "a" }),
+        customToolCall({ turn_id: "t1", name: "apply_patch", call_id: "c2" }),
+        customToolCallOutput({ call_id: "c2", output: "b" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              // Both tool calls come from one model call, so a single llm span
+              // opens (on the first tool call) and closes at turn end.
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              { span_attributes: { name: "exec_command", type: "tool" }, output: "a", ended: true },
+              { span_attributes: { name: "apply_patch", type: "tool" }, output: "b", ended: true },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  test("an unpaired tool call (no output) is closed when its turn ends", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1", arguments: "{}" }),
+        // No output for c1; the turn completes anyway.
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              {
+                span_attributes: { name: "exec_command", type: "tool" },
+                input: "{}",
+                ended: true,
+                children: [],
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  test("a tool call with no matching open turn span is skipped", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        // turn_id "tX" was never opened; the tool span must be skipped.
+        functionCall({ turn_id: "tX", name: "exec_command", call_id: "c1" }),
+        functionCallOutput({ call_id: "c1", output: "x" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [] },
+        ],
+      },
+    );
+  });
+});
+
+describe("CodexEventProcessor: permissions", () => {
+  test("an escalated tool call is annotated with permission metadata and a tag", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        functionCall({
+          turn_id: "t1",
+          name: "exec_command",
+          call_id: "c1",
+          arguments: JSON.stringify({
+            cmd: "curl example.com",
+            sandbox_permissions: "require_escalated",
+            justification: "Need network access",
+            prefix_rule: ["curl"],
+          }),
+        }),
+        functionCallOutput({ call_id: "c1", output: "ok" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              {
+                span_attributes: { name: "exec_command", type: "tool" },
+                metadata: {
+                  permission: {
+                    sandbox_permissions: "require_escalated",
+                    justification: "Need network access",
+                    prefix_rule: ["curl"],
+                  },
+                },
+                tags: ["permission-request"],
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  // An escalation is a failed sandboxed attempt followed by an escalated retry.
+  // Both are separate tool spans (distinct call_ids); only the retry carries the
+  // permission annotation.
+  test("a sandboxed attempt then escalated retry: only the retry is annotated", async () => {
     await assertProducesTrace(
-      [sessionStart({ model: "gpt-5.5", cwd: "/work", source: "startup" })],
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        functionCall({
+          turn_id: "t1",
+          name: "exec_command",
+          call_id: "attempt",
+          arguments: JSON.stringify({ cmd: "curl example.com" }),
+        }),
+        functionCallOutput({ call_id: "attempt", output: "Error: network blocked" }),
+        functionCall({
+          turn_id: "t1",
+          name: "exec_command",
+          call_id: "retry",
+          arguments: JSON.stringify({
+            cmd: "curl example.com",
+            sandbox_permissions: "require_escalated",
+            justification: "Need network access",
+          }),
+        }),
+        functionCallOutput({ call_id: "retry", output: "ok" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
       {
         span_attributes: { name: "codex: work", type: "task" },
-        input: { model: "gpt-5.5", cwd: "/work", source: "startup" },
-        metadata: { session_id: "session-1", model: "gpt-5.5" },
-        children: [],
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              // Both attempts come from one model call, so a single llm span opens.
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              {
+                // The failed sandboxed attempt: no permission annotation.
+                span_attributes: { name: "exec_command", type: "tool" },
+                metadata: { call_id: "attempt", permission: undefined },
+                output: "Error: network blocked",
+                ended: true,
+              },
+              {
+                // The escalated retry: annotated.
+                span_attributes: { name: "exec_command", type: "tool" },
+                metadata: {
+                  call_id: "retry",
+                  permission: {
+                    sandbox_permissions: "require_escalated",
+                    justification: "Need network access",
+                  },
+                },
+                tags: ["permission-request"],
+                output: "ok",
+                ended: true,
+              },
+            ],
+          },
+        ],
       },
     );
   });
 
-  test("root span is named after the project directory (basename of cwd)", async () => {
-    await assertProducesTrace([sessionStart({ cwd: "/whatever/myapp" })], {
-      span_attributes: { name: "codex: myapp", type: "task" },
-      children: [],
-    });
-  });
-
-  test("root span name handles a trailing slash in cwd", async () => {
-    await assertProducesTrace([sessionStart({ cwd: "/whatever/myapp/" })], {
-      span_attributes: { name: "codex: myapp", type: "task" },
-      children: [],
-    });
+  test("a non-escalated tool call has no permission metadata or tag", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        functionCall({
+          turn_id: "t1",
+          name: "exec_command",
+          call_id: "c1",
+          arguments: JSON.stringify({ cmd: "ls" }),
+        }),
+        functionCallOutput({ call_id: "c1", output: "file.txt" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              { span_attributes: { name: "llm", type: "llm" }, ended: true },
+              {
+                span_attributes: { name: "exec_command", type: "tool" },
+                metadata: { permission: undefined },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
   });
+});
 
-  test("root span falls back to 'codex session' when cwd is missing", async () => {
-    await assertProducesTrace([sessionStart({})], {
-      span_attributes: { name: "codex session", type: "task" },
-      children: [],
-    });
+describe("CodexEventProcessor: compaction", () => {
+  // A compaction is its own turn containing a `compacted` record. That turn span
+  // is relabeled "compaction", tagged, annotated, and given an llm child span
+  // showing what the compaction did (input = prior history; output = the
+  // replacement history). The trigger comes from the compact hooks, and the turn
+  // is terminated by PostCompact (a compaction turn gets no Stop of its own).
+  test("a compaction turn becomes a compaction span with an llm child", async () => {
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "tc" }),
+        preCompact({ turn_id: "tc", trigger: "manual" }),
+        compacted({
+          message: "",
+          replacement_history: [
+            { type: "message", role: "user", content: [{ type: "input_text", text: "hi" }] },
+            { type: "compaction", encrypted_content: "opaque" },
+          ],
+          window_id: 1,
+        }),
+        tokenCount({ total_tokens: 6451 }),
+        taskComplete({ turn_id: "tc", last_agent_message: null }),
+        postCompact({ turn_id: "tc", trigger: "manual" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        children: [
+          {
+            span_attributes: { name: "compaction", type: "task" },
+            metadata: {
+              compaction: { trigger: "manual", replaced_message_count: 2, window_id: 1 },
+            },
+            tags: ["compaction"],
+            ended: true,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                metadata: { compaction: true },
+                // Output is shaped as the kept context + a clear summary marker
+                // (the real summary is encrypted), not the raw replacement list.
+                output: {
+                  summary: "[summary unavailable — encrypted by Codex]",
+                  kept_messages: [
+                    {
+                      type: "message",
+                      role: "user",
+                      content: [{ type: "input_text", text: "hi" }],
+                    },
+                  ],
+                },
+                metrics: { tokens: 6451 },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
   });
 
-  test("a session with no Stop stays active (no end time)", async () => {
+  // The trigger must be captured whether the hook arrives before or after the
+  // transcript's compacted record. Here the hook comes AFTER (e.g. a resumed
+  // session whose compacted record is read during SessionStart's catch-up); the
+  // trigger is back-filled onto the already-built span.
+  test("trigger is back-filled when the compact hook arrives after the record", async () => {
     await assertProducesTrace(
-      [sessionStart({ model: "gpt-5.5", source: "startup" }), userPromptSubmit({ prompt: "hi" })],
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "tc" }),
+        compacted({ replacement_history: [{ type: "compaction" }], window_id: 2 }),
+        tokenCount({ total_tokens: 10 }),
+        taskComplete({ turn_id: "tc", last_agent_message: null }),
+        // Stop processes the compacted record (trigger not yet known)...
+        stop({ turn_id: "tc" }),
+        // ...then the PostCompact hook reveals the trigger, back-filled onto the span.
+        postCompact({ turn_id: "tc", trigger: "auto" }),
+      ],
       {
-        span_attributes: { name: "codex session", type: "task" },
-        ended: false,
-        children: [],
+        span_attributes: { name: "codex: work", type: "task" },
+        children: [
+          {
+            span_attributes: { name: "compaction", type: "task" },
+            metadata: {
+              compaction: { trigger: "auto", replaced_message_count: 1, window_id: 2 },
+            },
+            tags: ["compaction"],
+          },
+        ],
       },
     );
   });
 
-  test("Stop ends the root span", async () => {
+  // Regression for the live bug: a compaction turn gets no Stop, and PostCompact
+  // fires BEFORE Codex writes the turn's task_complete. PostCompact must do the
+  // same bounded wait Stop does, so the compaction span closes within the hook
+  // instead of lingering open until idle eviction (where it may never flush).
+  test("PostCompact waits for a task_complete that lands after it, closing the span", async () => {
+    class DelayedRevealReader implements TranscriptReader {
+      private lines: string[] = [];
+      private withheld: string | null = null;
+      private readsUntilReveal = 0;
+
+      append(line: string): void {
+        this.lines.push(line);
+      }
+
+      withhold(line: string, afterReads: number): void {
+        this.withheld = line;
+        this.readsUntilReveal = afterReads;
+      }
+
+      readFrom(_path: string, offset: number): TranscriptReadResult {
+        if (this.withheld !== null) {
+          if (this.readsUntilReveal <= 0) {
+            this.lines.push(this.withheld);
+            this.withheld = null;
+          } else {
+            this.readsUntilReveal -= 1;
+          }
+        }
+        const buf = this.lines.length > 0 ? `${this.lines.join("\n")}\n` : "";
+        const from = offset > buf.length ? 0 : offset;
+        const slice = buf.slice(from);
+        const out: string[] = [];
+        let consumed = 0;
+        let lineStart = 0;
+        for (let i = 0; i < slice.length; i++) {
+          if (slice[i] === "\n") {
+            out.push(slice.slice(lineStart, i));
+            lineStart = i + 1;
+            consumed = lineStart;
+          }
+        }
+        return { lines: out, offset: from + consumed };
+      }
+
+      async waitFor(
+        path: string,
+        offset: number,
+        predicate: (line: string) => boolean,
+        _opts: WaitForOptions,
+      ): Promise<TranscriptWaitResult> {
+        const { lines, offset: next } = this.readFrom(path, offset);
+        return { lines, offset: next, sentinelFound: lines.some(predicate) };
+      }
+    }
+
+    const reader = new DelayedRevealReader();
+    const write = (record: Record<string, unknown>) =>
+      reader.append(JSON.stringify({ timestamp: "2026-01-01T00:00:10Z", ...record }));
+    write({ type: "session_meta", payload: { id: "s", cwd: "/work" } });
+    write({ type: "event_msg", payload: { type: "task_started", turn_id: "tc" } });
+    write({
+      timestamp: "2026-01-01T00:00:11Z",
+      type: "compacted",
+      payload: { replacement_history: [{ type: "compaction" }], window_id: 1 },
+    });
+    write({
+      timestamp: "2026-01-01T00:00:11Z",
+      type: "event_msg",
+      payload: { type: "token_count", info: { last_token_usage: {} } },
+    });
+    // task_complete is withheld until after PostCompact's initial read, so only
+    // its bounded waitFor reveals it.
+    reader.withhold(
+      JSON.stringify({
+        timestamp: "2026-01-01T00:00:15Z",
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "tc", last_agent_message: null },
+      }),
+      1,
+    );
+
+    const trace = withCapturedTrace();
+    try {
+      const processor = new CodexEventProcessor(
+        "s",
+        createTestLogger(),
+        () => trace.spanFactory,
+        reader,
+      );
+      await processor.process(configEvent({ session_id: "s" }));
+      await processor.process(sessionStart({ session_id: "s" }));
+      await processor.process(preCompact({ session_id: "s", turn_id: "tc", trigger: "manual" }));
+      // PostCompact's initial read won't see task_complete; its bounded wait must.
+      await processor.process(postCompact({ session_id: "s", turn_id: "tc", trigger: "manual" }));
+
+      const tree = spansToTree(await trace.drain());
+      const compaction = tree?.children.find((c) => c.name === "compaction");
+      expect(compaction).toBeDefined();
+      // The span closed within the hook (end time from the late task_complete).
+      expect(compaction?.metrics?.end).toBe(1767225615); // 2026-01-01T00:00:15Z
+    } finally {
+      trace.cleanup();
+    }
+  });
+});
+
+describe("CodexEventProcessor: llm spans", () => {
+  test("a model call becomes an llm span (output + token metrics) under its turn", async () => {
     await assertProducesTrace(
       [
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
-        userPromptSubmit({ prompt: "hi" }),
-        stop({}),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "yo!" }),
+        reasoning(),
+        assistantMessage("Hey. What are we working on?"),
+        tokenCount({ input_tokens: 100, output_tokens: 14, total_tokens: 114 }),
+        taskComplete({ turn_id: "t1", last_agent_message: "Hey. What are we working on?" }),
+        stop({ turn_id: "t1" }),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
+        span_attributes: { name: "codex: work", type: "task" },
         ended: true,
-        children: [],
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                output: { role: "assistant", content: "Hey. What are we working on?" },
+                // Codex token keys mapped to Braintrust standard metric names.
+                metrics: { prompt_tokens: 100, completion_tokens: 14, tokens: 114 },
+                ended: true,
+              },
+            ],
+          },
+        ],
       },
     );
   });
 
-  test("ends on the first Stop; later events do not reopen it", async () => {
+  // A reasoning item with a readable summary is surfaced as a `reasoning` entry
+  // in the llm output (Braintrust's OpenAI Responses shape), interleaved before
+  // the assistant message.
+  test("a reasoning summary is captured as a reasoning item in the llm output", async () => {
     await assertProducesTrace(
       [
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
-        userPromptSubmit({ prompt: "hi", turn_id: "t1" }),
-        stop({ turn_id: "t1", last_assistant_message: "hello" }),
-        userPromptSubmit({ prompt: "another thing", turn_id: "t2" }),
-        stop({ turn_id: "t2", last_assistant_message: "ok" }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessageItem("solve it"),
+        reasoning(["**Inspecting the problem**", "**Deriving the formula**"]),
+        assistantMessage("The answer is 42."),
+        tokenCount({ input_tokens: 10, output_tokens: 5, total_tokens: 15 }),
+        taskComplete({ turn_id: "t1", last_agent_message: "The answer is 42." }),
+        stop({ turn_id: "t1" }),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
+        span_attributes: { name: "codex: work", type: "task" },
         ended: true,
         children: [
-          { span_attributes: { name: "turn: t1", type: "task" } },
-          { span_attributes: { name: "turn: t2", type: "task" } },
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                output: [
+                  {
+                    type: "reasoning",
+                    summary: [
+                      { type: "summary_text", text: "**Inspecting the problem**" },
+                      { type: "summary_text", text: "**Deriving the formula**" },
+                    ],
+                  },
+                  { role: "assistant", content: "The answer is 42." },
+                ],
+                ended: true,
+              },
+            ],
+          },
         ],
       },
     );
   });
 
-  test("a turn becomes a child span with prompt input and assistant output", async () => {
+  // A reasoning item with only encrypted content (no readable summary) opens the
+  // llm span but adds nothing to the output.
+  test("an encrypted-only reasoning item adds no reasoning to the output", async () => {
     await assertProducesTrace(
       [
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
-        userPromptSubmit({ prompt: "what's your name?", turn_id: "t1" }),
-        stop({ turn_id: "t1", last_assistant_message: "I'm Codex." }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessageItem("hi"),
+        reasoning(), // no summary
+        assistantMessage("Hello."),
+        tokenCount({ total_tokens: 5 }),
+        taskComplete({ turn_id: "t1", last_agent_message: "Hello." }),
+        stop({ turn_id: "t1" }),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
-        ended: true,
+        span_attributes: { name: "codex: work", type: "task" },
         children: [
           {
             span_attributes: { name: "turn: t1", type: "task" },
-            input: "what's your name?",
-            output: "I'm Codex.",
-            metadata: { turn_id: "t1" },
-            ended: true,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                // Single assistant message only — no reasoning entry.
+                output: { role: "assistant", content: "Hello." },
+              },
+            ],
           },
         ],
       },
     );
   });
 
-  test("an open turn with no matching Stop stays active", async () => {
+  test("llm input is reconstructed from prior conversation messages", async () => {
     await assertProducesTrace(
       [
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
-        userPromptSubmit({ prompt: "hi", turn_id: "t1" }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        // The prompt arrives as a response_item user message (history source).
+        userMessageItem("yo!"),
+        assistantMessage("Hey."),
+        tokenCount({ input_tokens: 10, output_tokens: 2, total_tokens: 12 }),
+        taskComplete({ turn_id: "t1", last_agent_message: "Hey." }),
+        stop({ turn_id: "t1" }),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
-        ended: false,
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
         children: [
           {
             span_attributes: { name: "turn: t1", type: "task" },
-            input: "hi",
-            ended: false,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                // Input = the user prompt seen before the call opened (chat msgs).
+                input: [{ role: "user", content: "yo!" }],
+                output: { role: "assistant", content: "Hey." },
+              },
+            ],
           },
         ],
       },
     );
   });
 
-  test("a Stop with no matching turn does not create a turn span", async () => {
+  test("multiple model calls interleave with tool spans in execution order", async () => {
     await assertProducesTrace(
       [
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
-        stop({ turn_id: "nope", last_assistant_message: "orphan" }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "do it" }),
+        // First model call: asks for a tool.
+        assistantMessage("calling tool"),
+        functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1" }),
+        tokenCount({ input_tokens: 10, output_tokens: 5, total_tokens: 15 }),
+        functionCallOutput({ call_id: "c1", output: "result" }),
+        // Second model call: final answer.
+        assistantMessage("done"),
+        tokenCount({ input_tokens: 20, output_tokens: 3, total_tokens: 23 }),
+        taskComplete({ turn_id: "t1", last_agent_message: "done" }),
+        stop({ turn_id: "t1" }),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
+        span_attributes: { name: "codex: work", type: "task" },
         ended: true,
-        children: [],
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            // Ordered siblings: llm(call 1), tool, llm(call 2).
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                // The call produced an assistant message plus a tool call.
+                output: [
+                  { role: "assistant", content: "calling tool" },
+                  {
+                    role: "assistant",
+                    content: null,
+                    tool_calls: [
+                      {
+                        id: "c1",
+                        type: "function",
+                        function: { name: "exec_command", arguments: "{}" },
+                      },
+                    ],
+                  },
+                ],
+                ended: true,
+              },
+              { span_attributes: { name: "exec_command", type: "tool" }, ended: true },
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                output: { role: "assistant", content: "done" },
+                ended: true,
+              },
+            ],
+          },
+        ],
       },
     );
   });
 
-  test("duplicate SessionStart still yields a single root span", async () => {
+  test("a model call with no closing token_count is closed when the turn ends", async () => {
     await assertProducesTrace(
       [
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
-        sessionStart({ model: "gpt-5.5", source: "startup" }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "hi" }),
+        assistantMessage("partial"),
+        // No token_count; turn completes anyway.
+        taskComplete({ turn_id: "t1", last_agent_message: "partial" }),
+        stop({ turn_id: "t1" }),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
-        children: [],
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                output: { role: "assistant", content: "partial" },
+                ended: true,
+              },
+            ],
+          },
+        ],
       },
     );
   });
+});
 
-  test("a config event builds the per-session factory with its config", () => {
-    // Spy provider records the config it was asked to build a factory for.
+describe("CodexEventProcessor: config / master switch", () => {
+  test("a config event builds the per-session factory with its config", async () => {
     let seen: ReportingConfig | undefined;
     const fake: SpanFactory = {
       startSpan: () =>
         ({ id: "s", end: () => 0, log: () => {} }) as unknown as ReturnType<
           SpanFactory["startSpan"]
         >,
+      rehydrateSpan: () =>
+        ({ id: "s", end: () => 0, log: () => {} }) as unknown as ReturnType<
+          SpanFactory["startSpan"]
+        >,
       flush: async () => {},
     };
     const provider = (config?: ReportingConfig) => {
@@ -177,7 +1394,8 @@ describe("CodexEventProcessor", () => {
       return fake;
     };
 
-    const processor = new CodexEventProcessor("sess-1", createTestLogger(), provider);
+    const reader = new FakeTranscriptReader();
+    const processor = new CodexEventProcessor("sess-1", createTestLogger(), provider, reader);
     const cfg: EnqueueEvent = {
       queueId: "sess-1",
       eventSource: "codex-hook",
@@ -190,9 +1408,16 @@ describe("CodexEventProcessor", () => {
         traceToBraintrust: true,
       },
     };
-    processor.process(cfg);
-    // Factory is built lazily on first span (the SessionStart).
-    processor.process(sessionStart({ session_id: "sess-1", model: "gpt-5.5" }));
+    await processor.process(cfg);
+    // Factory is built lazily on first span (session_meta).
+    reader.append(
+      JSON.stringify({
+        timestamp: "2026-01-01T00:00:00Z",
+        type: "session_meta",
+        payload: { id: "sess-1", cwd: "/work" },
+      }),
+    );
+    await processor.process(sessionStart({ session_id: "sess-1" }));
 
     expect(seen).toEqual({
       project: "team-project",
@@ -206,9 +1431,14 @@ describe("CodexEventProcessor", () => {
 
   test("root span metadata carries the configured project", async () => {
     await assertProducesTrace(
-      [configEvent({ project: "team-project" }), sessionStart({ model: "gpt-5.5" })],
+      [
+        configEvent({ project: "team-project" }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        stop({}),
+      ],
       {
-        span_attributes: { name: "codex session", type: "task" },
+        span_attributes: { name: "codex: work", type: "task" },
         metadata: { project: "team-project" },
         children: [],
       },
@@ -216,20 +1446,32 @@ describe("CodexEventProcessor", () => {
   });
 
   test("when tracing is disabled, no spans are produced", async () => {
+    const reader = new FakeTranscriptReader();
     const trace = withCapturedTrace();
     try {
-      const processor = new CodexEventProcessor("s", createTestLogger(), () => trace.spanFactory);
+      const processor = new CodexEventProcessor(
+        "s",
+        createTestLogger(),
+        () => trace.spanFactory,
+        reader,
+      );
       // Config event WITHOUT traceToBraintrust (defaults off).
-      processor.process({
+      await processor.process({
         queueId: "s",
         eventSource: "codex-hook",
         eventSourceVersion: null,
         eventName: CODEX_CONFIG_EVENT,
         eventData: { project: "p" },
       });
-      processor.process(sessionStart({ session_id: "s" }));
-      processor.process(userPromptSubmit({ session_id: "s", turn_id: "t1", prompt: "hi" }));
-      processor.process(stop({ session_id: "s", turn_id: "t1", last_assistant_message: "yo" }));
+      reader.append(
+        JSON.stringify({
+          timestamp: "2026-01-01T00:00:00Z",
+          type: "session_meta",
+          payload: { id: "s", cwd: "/work" },
+        }),
+      );
+      await processor.process(sessionStart({ session_id: "s" }));
+      await processor.process(stop({ session_id: "s", turn_id: "t1" }));
       await processor.flush();
 
       const spans = await trace.drain();
@@ -246,14 +1488,586 @@ describe("CodexEventProcessor", () => {
           project: "team-project",
           additionalMetadata: { team: "platform", model: "SHOULD_BE_OVERRIDDEN" },
         }),
-        sessionStart({ model: "gpt-5.5" }),
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        stop({}),
       ],
       {
-        span_attributes: { name: "codex session", type: "task" },
-        // team comes from additionalMetadata; model is the standard key (wins).
+        span_attributes: { name: "codex: work", type: "task" },
         metadata: { team: "platform", model: "gpt-5.5", project: "team-project" },
         children: [],
       },
     );
   });
 });
+
+describe("CodexEventProcessor: transcript catch-up plumbing", () => {
+  test("each hook reads from the advancing offset (each line read once)", async () => {
+    const reader = new FakeTranscriptReader();
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "hi" }),
+        taskComplete({ turn_id: "t1", last_agent_message: "yo" }),
+        stop({ turn_id: "t1" }),
+      ],
+      {
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [{ span_attributes: { name: "turn: t1", type: "task" }, ended: true }],
+      },
+      { reader },
+    );
+
+    // Reads: SessionStart (readFrom), Stop (waitFor). The injected config event
+    // has no transcript_path, so it does not read.
+    expect(reader.readOffsets.length).toBeGreaterThanOrEqual(2);
+    for (let i = 1; i < reader.readOffsets.length; i++) {
+      expect(reader.readOffsets[i]).toBeGreaterThanOrEqual(reader.readOffsets[i - 1] as number);
+    }
+    expect(reader.readOffsets[0]).toBe(0);
+  });
+
+  test("a malformed transcript line is skipped without breaking handling", async () => {
+    const reader = new FakeTranscriptReader();
+    reader.append("this is not valid json");
+    await assertProducesTrace(
+      [sessionStart(), sessionMeta({ cwd: "/work" }), stop({ turn_id: "t1" })],
+      { span_attributes: { name: "codex: work", type: "task" }, ended: true, children: [] },
+      { reader },
+    );
+  });
+
+  test("a turn whose task_complete lands after its Stop is closed by the bounded wait", async () => {
+    // Regression for the live bug: Codex writes task_complete seconds after the
+    // Stop hook fires. The Stop's initial read doesn't see it (turn stays open),
+    // so the bounded waitFor must keep reading until it appears, then close the
+    // turn — with the transcript-derived end time, not a wall-clock value.
+    class DelayedRevealReader implements TranscriptReader {
+      private lines: string[] = [];
+      private withheld: string | null = null;
+      private readsUntilReveal = 0;
+
+      append(line: string): void {
+        this.lines.push(line);
+      }
+
+      /** Append a line that only becomes visible after `afterReads` more reads. */
+      withhold(line: string, afterReads: number): void {
+        this.withheld = line;
+        this.readsUntilReveal = afterReads;
+      }
+
+      private visible(): string[] {
+        return this.lines;
+      }
+
+      readFrom(_path: string, offset: number): TranscriptReadResult {
+        // Reveal the withheld line once enough reads have happened.
+        if (this.withheld !== null) {
+          if (this.readsUntilReveal <= 0) {
+            this.lines.push(this.withheld);
+            this.withheld = null;
+          } else {
+            this.readsUntilReveal -= 1;
+          }
+        }
+        const visible = this.visible();
+        const buf = visible.length > 0 ? `${visible.join("\n")}\n` : "";
+        const from = offset > buf.length ? 0 : offset;
+        const slice = buf.slice(from);
+        const out: string[] = [];
+        let consumed = 0;
+        let lineStart = 0;
+        for (let i = 0; i < slice.length; i++) {
+          if (slice[i] === "\n") {
+            out.push(slice.slice(lineStart, i));
+            lineStart = i + 1;
+            consumed = lineStart;
+          }
+        }
+        return { lines: out, offset: from + consumed };
+      }
+
+      async waitFor(
+        path: string,
+        offset: number,
+        predicate: (line: string) => boolean,
+        _opts: WaitForOptions,
+      ): Promise<TranscriptWaitResult> {
+        const { lines, offset: next } = this.readFrom(path, offset);
+        return { lines, offset: next, sentinelFound: lines.some(predicate) };
+      }
+    }
+
+    const reader = new DelayedRevealReader();
+    reader.append(
+      JSON.stringify({
+        timestamp: "2026-01-01T00:00:10Z",
+        type: "session_meta",
+        payload: { id: "s", cwd: "/work" },
+      }),
+    );
+    reader.append(
+      JSON.stringify({
+        timestamp: "2026-01-01T00:00:11Z",
+        type: "event_msg",
+        payload: { type: "task_started", turn_id: "t1" },
+      }),
+    );
+    // task_complete is withheld: not visible at the Stop's waitFor, appears later.
+    reader.withhold(
+      JSON.stringify({
+        timestamp: "2026-01-01T00:00:15Z",
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" },
+      }),
+      1,
+    );
+
+    const trace = withCapturedTrace();
+    try {
+      const processor = new CodexEventProcessor(
+        "s",
+        createTestLogger(),
+        () => trace.spanFactory,
+        reader,
+      );
+      await processor.process(configEvent({ session_id: "s" }));
+      await processor.process(sessionStart({ session_id: "s" }));
+      // Stop's bounded wait won't see task_complete yet; turn stays open.
+      await processor.process(stop({ session_id: "s", turn_id: "t1" }));
+      // flush polls until the withheld task_complete is revealed and closes t1.
+      await processor.flush();
+
+      const tree = spansToTree(await trace.drain());
+      const turn = tree?.children[0];
+      expect(turn?.name).toBe("turn: t1");
+      expect(turn?.metrics?.end).toBe(1767225615); // 2026-01-01T00:00:15Z
+    } finally {
+      trace.cleanup();
+    }
+  });
+});
+
+describe("CodexEventProcessor: subagents", () => {
+  const MAIN = "/test/main.jsonl";
+  const SUB = "/test/sub.jsonl";
+
+  // A main session that spawns one subagent. The subagent writes to its own
+  // transcript file; the processor must nest its full hierarchy (subagent root
+  // -> turn -> llm/tool) under the spawn_agent tool span that launched it, and
+  // never let the two files' offsets cross-contaminate.
+  test("a subagent renders as a nested trace under its spawn_agent tool", async () => {
+    const reader = new MultiFileTranscriptReader();
+    const w = (path: string, record: Record<string, unknown>) =>
+      reader.append(path, JSON.stringify(transcript(record).record));
+    const trace = withCapturedTrace();
+    try {
+      const processor = new CodexEventProcessor(
+        "s",
+        createTestLogger(),
+        () => trace.spanFactory,
+        reader,
+      );
+      await processor.process(configEvent({ session_id: "s" }));
+
+      // Main session: root + a turn that calls spawn_agent.
+      w(MAIN, { type: "session_meta", payload: { id: "s", cwd: "/work/app" } });
+      w(MAIN, { type: "event_msg", payload: { type: "task_started", turn_id: "main-1" } });
+      w(MAIN, { type: "turn_context", payload: { model: "gpt-5.5" } });
+      w(MAIN, { type: "response_item", payload: { type: "reasoning", summary: [] } });
+      w(MAIN, {
+        type: "response_item",
+        payload: {
+          type: "function_call",
+          name: "spawn_agent",
+          call_id: "call_spawn",
+          arguments: "{}",
+          metadata: { turn_id: "main-1" },
+        },
+      });
+      w(MAIN, {
+        type: "response_item",
+        payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" },
+      });
+      w(MAIN, {
+        type: "event_msg",
+        payload: { type: "token_count", info: { last_token_usage: {} } },
+      });
+      await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN }));
+      await processor.process(
+        preToolUse({
+          session_id: "s",
+          transcript_path: MAIN,
+          tool_name: "spawn_agent",
+          tool_use_id: "call_spawn",
+        }),
+      );
+      // PostToolUse reveals the agent_id; maps it to the spawn tool span.
+      await processor.process(
+        postToolUse({
+          session_id: "s",
+          transcript_path: MAIN,
+          tool_name: "spawn_agent",
+          tool_use_id: "call_spawn",
+          tool_response: JSON.stringify({ agent_id: "agent-1" }),
+        }),
+      );
+
+      // Subagent: its own transcript + lifecycle hooks (carry agent_id).
+      w(SUB, { type: "session_meta", payload: { id: "agent-1" } });
+      w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-1" } });
+      w(SUB, { type: "turn_context", payload: { model: "gpt-5.5" } });
+      w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } });
+      w(SUB, {
+        type: "response_item",
+        payload: {
+          type: "function_call",
+          name: "exec_command",
+          call_id: "call_exec",
+          arguments: "{}",
+          metadata: { turn_id: "sub-1" },
+        },
+      });
+      w(SUB, {
+        type: "response_item",
+        payload: { type: "function_call_output", call_id: "call_exec", output: "ok" },
+      });
+      w(SUB, {
+        type: "event_msg",
+        payload: { type: "token_count", info: { last_token_usage: {} } },
+      });
+      w(SUB, {
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" },
+      });
+      await processor.process(
+        subagentStart({
+          session_id: "s",
+          transcript_path: SUB,
+          agent_id: "agent-1",
+          agent_type: "default",
+        }),
+      );
+      await processor.process(
+        subagentStop({
+          session_id: "s",
+          transcript_path: MAIN,
+          agent_transcript_path: SUB,
+          agent_id: "agent-1",
+        }),
+      );
+
+      // Main session completes.
+      w(MAIN, {
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "all done" },
+      });
+      await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" }));
+      await processor.flush();
+
+      const tree = spansToTree(await trace.drain());
+      expect(tree?.name).toBe("codex: app");
+      // The root carries the main session's transcript path.
+      expect(tree?.metadata?.transcript_path).toBe(MAIN);
+      const mainTurn = tree?.children.find((c) => c.name === "turn: main-1");
+      expect(mainTurn).toBeDefined();
+
+      // The spawn_agent tool span is under the main turn, and the subagent root
+      // is a SIBLING of it (both under the main turn) — not nested in the tool.
+      const spawn = mainTurn?.children.find((c) => c.name === "spawn_agent");
+      expect(spawn?.type).toBe("tool");
+      expect(spawn?.children.some((c) => c.name === "subagent: agent-1")).toBe(false);
+      const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1");
+      expect(subRoot).toBeDefined();
+      expect(subRoot?.type).toBe("task");
+      // The subagent root carries its own transcript path.
+      expect(subRoot?.metadata?.transcript_path).toBe(SUB);
+
+      const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1");
+      expect(subTurn).toBeDefined();
+      const exec = subTurn?.children.find((c) => c.name === "exec_command");
+      expect(exec?.type).toBe("tool");
+
+      // No subagent turn leaked to the top level (the original bug).
+      expect(tree?.children.some((c) => c.name === "turn: sub-1")).toBe(false);
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // Regression for the live bug: after SubagentStop finalizes a subagent, a later
+  // flush (which drains every scope) must NOT re-read the subagent transcript and
+  // re-open its already-closed spans — doing so left the subagent root/turn/llm
+  // hanging (no end) because the re-opened, un-ended spans overwrote the closed
+  // ones. A finalized subagent scope must be inert.
+  test("a finalized subagent's spans stay closed across later flushes", async () => {
+    const reader = new MultiFileTranscriptReader();
+    const w = (path: string, record: Record<string, unknown>) =>
+      reader.append(path, JSON.stringify(transcript(record).record));
+    const trace = withCapturedTrace();
+    try {
+      const processor = new CodexEventProcessor(
+        "s",
+        createTestLogger(),
+        () => trace.spanFactory,
+        reader,
+      );
+      await processor.process(configEvent({ session_id: "s" }));
+
+      // Main session spawns a subagent and stays open (no main Stop yet).
+      w(MAIN, { type: "session_meta", payload: { id: "s", cwd: "/work/app" } });
+      w(MAIN, { type: "event_msg", payload: { type: "task_started", turn_id: "main-1" } });
+      w(MAIN, {
+        type: "response_item",
+        payload: {
+          type: "function_call",
+          name: "spawn_agent",
+          call_id: "call_spawn",
+          arguments: "{}",
+          metadata: { turn_id: "main-1" },
+        },
+      });
+      w(MAIN, {
+        type: "response_item",
+        payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" },
+      });
+      await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN }));
+      await processor.process(
+        postToolUse({
+          session_id: "s",
+          transcript_path: MAIN,
+          tool_name: "spawn_agent",
+          tool_use_id: "call_spawn",
+          tool_response: JSON.stringify({ agent_id: "agent-1" }),
+        }),
+      );
+
+      // Subagent transcript: a turn with a model call, then task_complete, then a
+      // TRAILING record after task_complete. SubagentStop's bounded wait stops at
+      // the task_complete sentinel, leaving the trailing record unconsumed; a
+      // later flush would otherwise re-read it and re-open spans on the finalized
+      // scope (the bug). With the fix, the finalized scope is inert.
+      w(SUB, { type: "session_meta", payload: { id: "agent-1" } });
+      w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-1" } });
+      w(SUB, { type: "turn_context", payload: { model: "gpt-5.5" } });
+      w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } });
+      w(SUB, {
+        type: "event_msg",
+        payload: { type: "token_count", info: { last_token_usage: {} } },
+      });
+      w(SUB, {
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" },
+      });
+      await processor.process(
+        subagentStart({
+          session_id: "s",
+          transcript_path: SUB,
+          agent_id: "agent-1",
+          agent_type: "default",
+        }),
+      );
+      await processor.process(
+        subagentStop({
+          session_id: "s",
+          transcript_path: MAIN,
+          agent_transcript_path: SUB,
+          agent_id: "agent-1",
+        }),
+      );
+
+      // After finalize, MORE records land in the subagent transcript (Codex can
+      // write trailing records, and the rollout file persists). A later flush
+      // drains every scope and re-reads this file. If the finalized scope weren't
+      // inert, these would re-open a turn + llm on it, leaving spans hanging.
+      w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-2" } });
+      w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } });
+      w(SUB, {
+        type: "event_msg",
+        payload: { type: "token_count", info: { last_token_usage: {} } },
+      });
+      await processor.flush();
+
+      // Main session finishes.
+      w(MAIN, {
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "done" },
+      });
+      await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" }));
+      await processor.flush();
+
+      const tree = spansToTree(await trace.drain());
+      // The subagent root is a sibling of the spawn_agent tool (under main-1).
+      const mainTurn = tree?.children.find((c) => c.name === "turn: main-1");
+      const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1");
+      const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1");
+
+      // The subagent root and turn are closed (not hanging), and the post-finalize
+      // records did NOT re-open anything: no second turn, no re-opened llm.
+      expect(subRoot?.metrics?.end).toBeDefined();
+      expect(subTurn?.metrics?.end).toBeDefined();
+      expect(subRoot?.children.some((c) => c.name === "turn: sub-2")).toBe(false);
+      expect(subRoot?.children.filter((c) => c.name?.startsWith("turn:")).length).toBe(1);
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // LLM-span start timing is derived per-turn, per-scope, so a subagent's calls
+  // must never influence the main session's call timing (or vice versa) even
+  // though both run concurrently. Each scope tracks its own turn start /
+  // last-child-end, keyed by its own transcript file.
+  test("llm span start timing is isolated per scope across concurrent subagents", async () => {
+    const reader = new MultiFileTranscriptReader();
+    const w = (path: string, record: Record<string, unknown>) =>
+      reader.append(path, JSON.stringify(record));
+    const trace = withCapturedTrace();
+    try {
+      const processor = new CodexEventProcessor(
+        "s",
+        createTestLogger(),
+        () => trace.spanFactory,
+        reader,
+      );
+      await processor.process(configEvent({ session_id: "s" }));
+
+      // Main turn starts at :10 and spawns a subagent; its own model call's output
+      // lands much later (:30), but the call began at the turn start (:10).
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:09Z",
+        type: "session_meta",
+        payload: { id: "s", cwd: "/work/app" },
+      });
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:10Z",
+        type: "event_msg",
+        payload: { type: "task_started", turn_id: "main-1" },
+      });
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:10Z",
+        type: "response_item",
+        payload: {
+          type: "function_call",
+          name: "spawn_agent",
+          call_id: "call_spawn",
+          arguments: "{}",
+          metadata: { turn_id: "main-1" },
+        },
+      });
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:11Z",
+        type: "response_item",
+        payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" },
+      });
+      await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN }));
+      await processor.process(
+        postToolUse({
+          session_id: "s",
+          transcript_path: MAIN,
+          tool_name: "spawn_agent",
+          tool_use_id: "call_spawn",
+          tool_response: JSON.stringify({ agent_id: "agent-1" }),
+        }),
+      );
+
+      // Subagent turn starts at :20 (its own clock); its model output lands at
+      // :25 and closes at :25. Its llm span must start at the SUBAGENT turn start
+      // (:20), not be affected by the main scope's :10/:11 boundaries.
+      w(SUB, {
+        timestamp: "2026-01-01T00:00:20Z",
+        type: "session_meta",
+        payload: { id: "agent-1" },
+      });
+      w(SUB, {
+        timestamp: "2026-01-01T00:00:20Z",
+        type: "event_msg",
+        payload: { type: "task_started", turn_id: "sub-1" },
+      });
+      w(SUB, {
+        timestamp: "2026-01-01T00:00:25Z",
+        type: "response_item",
+        payload: {
+          type: "message",
+          role: "assistant",
+          content: [{ type: "output_text", text: "sub" }],
+        },
+      });
+      w(SUB, {
+        timestamp: "2026-01-01T00:00:25Z",
+        type: "event_msg",
+        payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } },
+      });
+      w(SUB, {
+        timestamp: "2026-01-01T00:00:26Z",
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" },
+      });
+      await processor.process(
+        subagentStart({
+          session_id: "s",
+          transcript_path: SUB,
+          agent_id: "agent-1",
+          agent_type: "default",
+        }),
+      );
+      await processor.process(
+        subagentStop({
+          session_id: "s",
+          transcript_path: MAIN,
+          agent_transcript_path: SUB,
+          agent_id: "agent-1",
+        }),
+      );
+
+      // Main session's own model call: output lands at :30, closes at :30. Its
+      // llm span must start at the MAIN turn start (:10) — the most recent child
+      // of main-1 is the spawn_agent tool (ended :11), so it starts there.
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:30Z",
+        type: "response_item",
+        payload: {
+          type: "message",
+          role: "assistant",
+          content: [{ type: "output_text", text: "main" }],
+        },
+      });
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:30Z",
+        type: "event_msg",
+        payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } },
+      });
+      w(MAIN, {
+        timestamp: "2026-01-01T00:00:31Z",
+        type: "event_msg",
+        payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "all done" },
+      });
+      await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" }));
+      await processor.flush();
+
+      const tree = spansToTree(await trace.drain());
+      const mainTurn = tree?.children.find((c) => c.name === "turn: main-1");
+      // Main llm: opens on the spawn_agent tool call (the model's first output,
+      // before any message), so it starts at the main turn's start (:10 =
+      // 1767225610) and runs until its token_count (:30). The single llm call
+      // spans the spawn tool and the later message.
+      const mainLlm = mainTurn?.children.find((c) => c.name === "llm");
+      expect(mainLlm?.metrics?.start).toBe(1767225610);
+      expect(mainLlm?.metrics?.end).toBe(1767225630);
+
+      // Subagent llm: starts at the SUBAGENT turn start (:20 = 1767225620), wholly
+      // independent of the main scope's boundaries; ends at its token_count (:25).
+      const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1");
+      const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1");
+      const subLlm = subTurn?.children.find((c) => c.name === "llm");
+      expect(subLlm?.metrics?.start).toBe(1767225620);
+      expect(subLlm?.metrics?.end).toBe(1767225625);
+    } finally {
+      trace.cleanup();
+    }
+  });
+});
diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts
index dd95c50..55fbfe2 100644
--- a/plugins/trace-codex/src/agents/codex/event-processor.ts
+++ b/plugins/trace-codex/src/agents/codex/event-processor.ts
@@ -1,10 +1,28 @@
-// Processor for Codex events. A leading config event (same queueId) configures
-// how this session reports to Braintrust (project/credentials), so the processor
-// builds its own per-session SpanFactory. On SessionStart it opens a Braintrust
-// root span. Each turn is a child span: UserPromptSubmit opens it (input = the
-// user prompt), and the matching Stop closes it (output = the final assistant
-// message). The first Stop also ends the root span. Buffered spans are delivered
-// via flush().
+// Processor for a single Codex session. Spans are built from the session
+// transcript file (the "rollout"), which is the source of truth for everything
+// inside a session: the root, each turn, and each tool call, in true execution
+// order. The Codex lifecycle hooks drive when we read the transcript and supply
+// a few fields the transcript lacks.
+//
+// Two ordered inputs:
+//   - Hooks: `process(event)` is called once per hook. A leading config event
+//     (same queueId) configures reporting. Each hook triggers a transcript
+//     "catch-up". A SessionStart hook contributes `source`/`permission_mode` to
+//     the root span (the transcript does not record these). A Stop hook is the
+//     turn-terminal signal: before catching up we wait (bounded) for the turn's
+//     `task_complete` to be written, so the final spans aren't truncated.
+//   - Transcript: each new record is turned into spans by `processTranscriptEvent`:
+//       session_meta   -> open the root span (start time = its timestamp)
+//       task_started   -> open a turn span (child of root)
+//       user_message   -> set the open turn's input (the prompt)
+//       function_call / custom_tool_call / tool_search_call
+//                      -> open a tool span (child of its turn), keyed by call_id
+//       *_output       -> close the matching tool span (output)
+//       task_complete  -> set the turn's output and close it
+//
+// Span start times come from transcript timestamps, so sibling spans (LLM and
+// tool calls, later) render in execution order. Buffered spans are delivered via
+// flush(). Everything is wrapped so tracing can never break a Codex turn.
 
 import {
   defaultSpanFactoryProvider,
@@ -12,15 +30,98 @@ import {
   type Span,
   type SpanFactory,
   type SpanFactoryProvider,
+  type SpanRef,
+  spanRef,
 } from "../../braintrust/logger.ts";
 import type { Logger } from "../../log.ts";
 import type { EventProcessor } from "../../processor/event-processor.ts";
 import type { EnqueueEvent } from "../../server/routes.ts";
+import { PLUGIN_VERSION } from "../../version.ts";
 import { CODEX_CONFIG_EVENT } from "./event-builder.ts";
+import type { SnapshotStore } from "./snapshot-store.ts";
+import {
+  type CodexSnapshot,
+  type ConversationItemSnapshot,
+  isCompatibleSnapshot,
+  redactReportingConfig,
+  type ScopeSnapshot,
+  SNAPSHOT_SCHEMA_VERSION,
+} from "./state-snapshot.ts";
+import {
+  EVT_TASK_COMPLETE,
+  EVT_TASK_STARTED,
+  EVT_TOKEN_COUNT,
+  EVT_USER_MESSAGE,
+  ITEM_CUSTOM_TOOL_CALL,
+  ITEM_CUSTOM_TOOL_CALL_OUTPUT,
+  ITEM_FUNCTION_CALL,
+  ITEM_FUNCTION_CALL_OUTPUT,
+  ITEM_MESSAGE,
+  ITEM_REASONING,
+  ITEM_TOOL_SEARCH_CALL,
+  ITEM_TOOL_SEARCH_OUTPUT,
+  parseTranscriptLine,
+  RECORD_COMPACTED,
+  RECORD_EVENT_MSG,
+  RECORD_RESPONSE_ITEM,
+  RECORD_SESSION_META,
+  RECORD_TURN_CONTEXT,
+  type TranscriptRecord,
+} from "./transcript.ts";
+import { defaultTranscriptReader, type TranscriptReader } from "./transcript-reader.ts";
 
 const SESSION_START = "SessionStart";
-const USER_PROMPT_SUBMIT = "UserPromptSubmit";
 const STOP = "Stop";
+const SUBAGENT_START = "SubagentStart";
+const SUBAGENT_STOP = "SubagentStop";
+const POST_TOOL_USE = "PostToolUse";
+const PRE_COMPACT = "PreCompact";
+const POST_COMPACT = "PostCompact";
+const SPAWN_AGENT_TOOL = "spawn_agent";
+// Tag applied to a tool span whose call requested escalated permissions, so
+// permission-gated actions are easy to find/filter in Braintrust.
+const PERMISSION_TAG = "permission-request";
+// Tag applied to a turn span that performed a context compaction, so compactions
+// are easy to find/filter in Braintrust.
+const COMPACTION_TAG = "compaction";
+
+// Bound on how long a Stop hook waits for the turn's task_complete to appear in
+// the transcript before giving up and emitting whatever exists. Codex can write
+// task_complete several seconds after firing the Stop hook, so this is generous;
+// the wait returns as soon as the sentinel is seen, and only the last turn of a
+// session ever relies on it (earlier turns are closed by the next hook's
+// catch-up). Still bounded so a stuck writer can never stall the Codex turn.
+const SENTINEL_TIMEOUT_MS = 10_000;
+const SENTINEL_INTERVAL_MS = 25;
+
+// Bound on how long flush() polls the transcript for still-open turns to close
+// (their task_complete can land seconds after the terminal Stop). flush() is off
+// the Codex hot path, so this is generous; it returns immediately once no turns
+// remain open, and is still bounded so a stuck writer can't hang shutdown.
+const FLUSH_TIMEOUT_MS = 15_000;
+const FLUSH_INTERVAL_MS = 50;
+
+const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
+
+/**
+ * Whether a hook signals the end of a turn (so we should wait, bounded, for the
+ * turn's task_complete to be written). Stop ends a normal turn; SubagentStop a
+ * subagent turn; PostCompact a compaction turn — which, unlike the others, never
+ * gets a Stop, so this is what lets its span close promptly instead of lingering
+ * open until idle eviction.
+ */
+function isTurnTerminal(eventName: string): boolean {
+  return eventName === STOP || eventName === SUBAGENT_STOP || eventName === POST_COMPACT;
+}
+
+/** Tool-call response_item subtypes that OPEN a tool span. */
+const TOOL_CALL_TYPES = new Set([ITEM_FUNCTION_CALL, ITEM_CUSTOM_TOOL_CALL, ITEM_TOOL_SEARCH_CALL]);
+/** Tool-call response_item subtypes that CLOSE a tool span (by call_id). */
+const TOOL_OUTPUT_TYPES = new Set([
+  ITEM_FUNCTION_CALL_OUTPUT,
+  ITEM_CUSTOM_TOOL_CALL_OUTPUT,
+  ITEM_TOOL_SEARCH_OUTPUT,
+]);
 
 /**
  * The final path segment of `cwd`, used to name the root span (e.g.
@@ -34,6 +135,285 @@ function projectDirName(cwd: string | undefined): string | undefined {
   return segments.length > 0 ? segments[segments.length - 1] : undefined;
 }
 
+/** Parse an ISO timestamp to Unix seconds (Braintrust's start time unit). */
+function isoToUnixSeconds(ts: unknown): number | undefined {
+  if (typeof ts !== "string") return undefined;
+  const ms = Date.parse(ts);
+  return Number.isNaN(ms) ? undefined : ms / 1000;
+}
+
+/** Extra root-span fields the transcript doesn't record; supplied by the hook. */
+interface RootEnrichment {
+  source?: string;
+  permissionMode?: string;
+}
+
+/**
+ * An OpenAI-style chat message. Used to reconstruct each LLM call's input (the
+ * transcript does not log the literal request) and to shape its output, so
+ * Braintrust renders these spans as proper LLM calls. Tool calls become an
+ * assistant message with `tool_calls`; tool results become a `tool` message.
+ */
+interface ChatMessage {
+  role: "developer" | "user" | "assistant" | "tool";
+  content: string | null;
+  tool_calls?: Array<{
+    id: string;
+    type: "function";
+    function: { name: string; arguments: string };
+  }>;
+  tool_call_id?: string;
+}
+
+/**
+ * A reasoning ("thinking") item, in the OpenAI Responses shape Braintrust
+ * renders. `summary` is the readable reasoning text the model exposes (Codex
+ * records it on `reasoning` transcript items as `summary_text` entries; the full
+ * chain-of-thought itself is encrypted and not available). Interleaved with chat
+ * messages in an LLM call's output and in the conversation history.
+ */
+interface ReasoningItem {
+  type: "reasoning";
+  summary: Array<{ type: "summary_text"; text: string }>;
+}
+
+/** An item in an LLM call's output or the conversation history. */
+type ConversationItem = ChatMessage | ReasoningItem;
+
+/** State for the LLM span currently in progress (one model call). */
+interface OpenLlm {
+  span: Span;
+  turnId: string | undefined;
+  /** Items produced by this call: reasoning, assistant text, and/or tool calls. */
+  output: ConversationItem[];
+  /**
+   * Timestamp (Unix seconds) of this call's last model-output item (message /
+   * reasoning / tool call). This is when the model finished generating and is
+   * used as the span's end time — NOT the closing token_count's timestamp. Codex
+   * writes the token_count after the tool result (at the same instant as the
+   * tool output), so ending the span there would wrongly stretch it to swallow
+   * the tool's execution. The token_count only supplies token metrics.
+   */
+  lastOutputTime: number | undefined;
+  /**
+   * When true, the span's output was already set at creation (e.g. a compaction
+   * call, whose output is the replacement history), so closing it must not
+   * overwrite it with the accumulated `output` messages.
+   */
+  outputPreset?: boolean;
+}
+
+/**
+ * An open turn span plus the timing we need to place its child LLM spans. The
+ * transcript records a model call's output only when it lands (just before the
+ * call's token_count), with no record at request-send time — so an LLM span
+ * stamped from its first output item would look near-instant. Instead we start
+ * each LLM span at the end of the most recently-ended child of this turn
+ * (`lastChildEndTime`), or at the turn's own start (`startTime`) when it's the
+ * first child. That makes a call's span cover request→response, and consecutive
+ * children tile the turn. All per-turn (hence per-scope), so concurrent
+ * subagents never affect each other's timing.
+ */
+interface OpenTurn {
+  span: Span;
+  /** Turn start (Unix seconds), used as the first child's start time. */
+  startTime: number | undefined;
+  /** Max end time (Unix seconds) among this turn's already-ended children. */
+  lastChildEndTime: number | undefined;
+}
+
+/**
+ * All parsing state tied to a single transcript file. The main session has one
+ * scope (its parent is the root span); each subagent gets its own scope (its
+ * parent is the subagent root span, a child of the spawn_agent tool that created
+ * it). Keeping offset + open spans per file is essential: a session and its
+ * subagents are interleaved on one hook stream but write to separate transcript
+ * files, so a single shared offset would read the wrong file's bytes.
+ */
+interface TranscriptScope {
+  /** Absolute transcript path this scope reads from. */
+  readonly path: string;
+  /** "main" session vs. a spawned subagent. */
+  readonly kind: "main" | "subagent";
+  /**
+   * Parent span for this scope's turn spans. For the main scope this is the
+   * root span; for a subagent it is the subagent root span. Resolved lazily so a
+   * subagent scope can be registered before its parent span is built.
+   */
+  readonly parentSpan: () => Span | null;
+  /** Byte offset already consumed from `path`. */
+  offset: number;
+  /** Turn spans currently open in this scope, keyed by turn_id (with the timing
+   * used to place their child LLM spans). */
+  readonly openTurns: Map<string, OpenTurn>;
+  /** turn_ids whose Stop has fired but whose task_complete hasn't been seen. */
+  readonly turnsAwaitingCompletion: Set<string>;
+  /** OpenAI-style chat history (messages + reasoning), used to reconstruct each
+   * LLM call's input. */
+  readonly conversationHistory: ConversationItem[];
+  /** The currently-open LLM span in this scope, if a model call is in progress. */
+  openLlm: OpenLlm | null;
+  /** Tool spans currently open in this scope, keyed by call_id. */
+  readonly openTools: Map<string, { span: Span; turnId: string }>;
+  /** Model slug, learned from this scope's first turn_context. */
+  model: string | undefined;
+  /** End time (Unix seconds) of the most recently completed turn in this scope. */
+  lastTurnEndTime: number | undefined;
+  /**
+   * For a subagent scope: its root span (the span the subagent's turns hang
+   * under) and whether it has been ended. Null/false for the main scope, which
+   * uses the processor's root span instead. The root span is created lazily from
+   * the subagent's own session_meta record (so it gets a real start time);
+   * `pendingSubagent` holds what's needed to create it until then.
+   */
+  subagentRootSpan: Span | null;
+  subagentEnded: boolean;
+  pendingSubagent?: { agentId: string; agentType: string | undefined; parent: Span };
+}
+
+/** Map Codex token usage keys onto Braintrust's standard metric names. */
+function tokenMetrics(usage: Record<string, unknown>): Record<string, number> {
+  const metrics: Record<string, number> = {};
+  const num = (v: unknown): number | undefined =>
+    typeof v === "number" && Number.isFinite(v) ? v : undefined;
+  const map: Array<[string, string]> = [
+    ["input_tokens", "prompt_tokens"],
+    ["output_tokens", "completion_tokens"],
+    ["total_tokens", "tokens"],
+    ["cached_input_tokens", "prompt_cached_tokens"],
+    ["reasoning_output_tokens", "completion_reasoning_tokens"],
+  ];
+  for (const [from, to] of map) {
+    const v = num(usage[from]);
+    if (v !== undefined) metrics[to] = v;
+  }
+  return metrics;
+}
+
+/**
+ * Shape an LLM call's output for Braintrust. Braintrust renders a single
+ * assistant message or an array of messages as a chat completion; a single
+ * message is returned directly, multiple are returned as an array.
+ */
+function llmOutput(messages: ConversationItem[]): unknown {
+  if (messages.length === 1) return messages[0];
+  return messages;
+}
+
+/**
+ * Extract the readable reasoning summary from a `reasoning` item's `summary`
+ * field: a list of `{type: "summary_text", text}` entries. Returns the
+ * non-empty entries (matching Braintrust's reasoning shape), or undefined when
+ * there's nothing readable (the full reasoning is encrypted and not exposed).
+ */
+function reasoningSummary(
+  summary: unknown,
+): Array<{ type: "summary_text"; text: string }> | undefined {
+  if (!Array.isArray(summary)) return undefined;
+  const parts: Array<{ type: "summary_text"; text: string }> = [];
+  for (const entry of summary) {
+    if (entry && typeof entry === "object") {
+      const text = (entry as { text?: unknown }).text;
+      if (typeof text === "string" && text.length > 0) {
+        parts.push({ type: "summary_text", text });
+      }
+    }
+  }
+  return parts.length > 0 ? parts : undefined;
+}
+
+/** Extract the concatenated text from a response_item message's content array. */
+function messageText(content: unknown): string | undefined {
+  if (!Array.isArray(content)) return undefined;
+  const parts: string[] = [];
+  for (const part of content) {
+    if (part && typeof part === "object" && typeof (part as { text?: unknown }).text === "string") {
+      parts.push((part as { text: string }).text);
+    }
+  }
+  return parts.length > 0 ? parts.join("") : undefined;
+}
+
+/**
+ * Shape a compaction's `replacement_history` into a readable "after compaction"
+ * output. The replacement history is the new context Codex keeps going forward:
+ * a few recent messages kept verbatim, plus a `compaction` entry holding the
+ * summary as `encrypted_content` (opaque to us). We surface the readable kept
+ * messages and replace the encrypted summary with a clear marker, so the span
+ * doesn't look like the model merely echoed the last user message.
+ */
+function compactionOutput(replacement: unknown[] | undefined): Record<string, unknown> {
+  if (replacement === undefined) {
+    return { summary: "[unavailable]", kept_messages: [] };
+  }
+  const kept: unknown[] = [];
+  let summaryEncrypted = false;
+  for (const entry of replacement) {
+    if (entry !== null && typeof entry === "object") {
+      const e = entry as Record<string, unknown>;
+      if (e.type === "compaction") {
+        if (typeof e.encrypted_content === "string") summaryEncrypted = true;
+        continue;
+      }
+    }
+    kept.push(entry);
+  }
+  return {
+    summary: summaryEncrypted ? "[summary unavailable — encrypted by Codex]" : "[no summary]",
+    kept_messages: kept,
+  };
+}
+
+/** A tool call's permission/escalation request, parsed from its arguments. */
+interface PermissionInfo {
+  /** The requested sandbox permission, e.g. "require_escalated". */
+  sandbox_permissions: string;
+  /** The model's justification shown to the user (may be absent). */
+  justification?: string;
+  /** The command prefix the escalation applies to (may be absent). */
+  prefix_rule?: unknown;
+}
+
+/**
+ * Parse a tool call's permission/escalation request from its arguments. Codex
+ * records an escalated retry's request inline in the function_call arguments
+ * (`sandbox_permissions`, `justification`, `prefix_rule`); when present, we
+ * surface it on the tool span. `args` is the raw arguments (a JSON string for
+ * function_call, or an object for custom_tool_call). Returns undefined when
+ * there's no escalation request.
+ */
+function permissionInfo(args: unknown): PermissionInfo | undefined {
+  let obj: Record<string, unknown> | undefined;
+  if (typeof args === "string") {
+    try {
+      const parsed = JSON.parse(args) as unknown;
+      if (parsed !== null && typeof parsed === "object") obj = parsed as Record<string, unknown>;
+    } catch {
+      return undefined;
+    }
+  } else if (args !== null && typeof args === "object") {
+    obj = args as Record<string, unknown>;
+  }
+  if (obj === undefined) return undefined;
+  const sandbox = obj.sandbox_permissions;
+  if (typeof sandbox !== "string" || sandbox.length === 0) return undefined;
+  const info: PermissionInfo = { sandbox_permissions: sandbox };
+  if (typeof obj.justification === "string") info.justification = obj.justification;
+  if (obj.prefix_rule !== undefined) info.prefix_rule = obj.prefix_rule;
+  return info;
+}
+
+// ConversationItems (chat messages / reasoning) are already plain JSON, so they
+// round-trip through a snapshot unchanged. These two helpers just bridge the
+// nominal type boundary between the live union and the snapshot's record array
+// (via unknown) without copying.
+function toItemSnapshots(items: ConversationItem[]): ConversationItemSnapshot[] {
+  return items as unknown as ConversationItemSnapshot[];
+}
+function fromItemSnapshots(items: ConversationItemSnapshot[]): ConversationItem[] {
+  return items as unknown as ConversationItem[];
+}
+
 export class CodexEventProcessor implements EventProcessor {
   private readonly logger: Logger;
   private readonly queueId: string | null;
@@ -43,18 +423,75 @@ export class CodexEventProcessor implements EventProcessor {
   private reportingConfig: ReportingConfig | undefined;
   private rootSpan: Span | null = null;
   private rootEnded = false;
-  // Turn spans currently open, keyed by turn_id. A turn is opened by
-  // UserPromptSubmit and closed (ended) by the matching Stop.
-  private readonly openTurns = new Map<string, Span>();
+  // The time (Unix seconds) the root span was ended, if it has been. Persisted so
+  // that on resume — where rehydrating the root necessarily re-emits its row — we
+  // can re-assert the end time and keep the root closed, rather than leaving it
+  // looking open (a rehydrated handle is created with no end).
+  private rootEndTime: number | undefined;
+  // Root enrichment from the SessionStart hook (source/permission_mode), which
+  // the transcript lacks. Applied when the root span is created, or patched onto
+  // the root if it already exists.
+  private rootEnrichment: RootEnrichment = {};
+  // Reads new transcript content on each hook (the "catch-up").
+  private readonly transcriptReader: TranscriptReader;
+  // One parsing scope per transcript file (the main session plus one per
+  // subagent), keyed by transcript path. Each owns its own offset and open
+  // spans, so interleaved hooks for different files never cross-contaminate.
+  private readonly scopes = new Map<string, TranscriptScope>();
+  // The main session's scope (its turns hang under the root span). Created on
+  // the first hook carrying the main transcript path.
+  private mainScope: TranscriptScope | null = null;
+  // call_id -> the turn span that contains a spawn_agent tool call, recorded when
+  // that tool span opens in the main transcript. The spawn_agent PostToolUse hook
+  // carries the resulting agent_id and the call_id (tool_use_id), letting us map
+  // agent_id to that turn span (below). We keep the TURN (not the tool span) so a
+  // subagent's root can be parented as a SIBLING of the spawn_agent tool — both
+  // under the same turn — rather than nested inside the tool.
+  private readonly spawnTurnSpansByCallId = new Map<string, Span>();
+  // agent_id -> the turn span under which that subagent should be parented (the
+  // turn that ran the spawn_agent tool). Resolved from the spawn_agent PostToolUse
+  // (agent_id + call_id) via spawnTurnSpansByCallId.
+  private readonly spawnTurnSpansByAgentId = new Map<string, Span>();
+  // turn_id -> compaction trigger ("manual"/"auto"), recorded from the
+  // PreCompact/PostCompact hooks (the transcript's compacted record lacks the
+  // trigger). A side map so the hook and the transcript's compacted record can
+  // arrive in any order: whichever comes first stores its half here / below, and
+  // the second one reconciles. (On a resumed session the transcript's compacted
+  // record is read during SessionStart's catch-up, before the hook arrives.)
+  private readonly compactionTriggerByTurn = new Map<string, string>();
+  // turn_id -> the compaction span built from the transcript, kept so a later
+  // PreCompact/PostCompact hook can back-fill the trigger onto it.
+  private readonly compactionSpansByTurn = new Map<string, Span>();
+  // span_id -> the name/type the span was created (or renamed) with. The SDK
+  // doesn't expose these as getters, but a rehydrated span must re-assert them
+  // (else the SDK infers a wrong name from the call stack on resume), so we
+  // remember them here and feed them into each SpanRef at serialize time.
+  private readonly spanAttrs = new Map<
+    string,
+    { name: string; type: SpanRef["type"]; startTime: number | undefined }
+  >();
+  // Optional per-session state store. When present, the processor persists a
+  // snapshot of its resumable state on flush and, on its first event, rehydrates
+  // any snapshot left by a previous server run for this session (so a session
+  // that outlived the server — idle shutdown or explicit close+resume — keeps
+  // building the same trace instead of starting over). Null disables resume.
+  private readonly snapshotStore: SnapshotStore | null;
+  // Whether we've already attempted a one-time restore from the store. Restore
+  // runs lazily on the first non-config event, once the span factory exists.
+  private restoreAttempted = false;
 
   constructor(
     queueId: string | null,
     logger: Logger,
     spanFactoryProvider: SpanFactoryProvider = defaultSpanFactoryProvider,
+    transcriptReader: TranscriptReader = defaultTranscriptReader,
+    snapshotStore: SnapshotStore | null = null,
   ) {
     this.queueId = queueId;
     this.logger = logger;
     this.spanFactoryProvider = spanFactoryProvider;
+    this.transcriptReader = transcriptReader;
+    this.snapshotStore = snapshotStore;
   }
 
   /** The session's SpanFactory, built from the config event on first use. */
@@ -65,11 +502,32 @@ export class CodexEventProcessor implements EventProcessor {
     return this.spanFactoryInstance;
   }
 
-  process(event: EnqueueEvent): void {
+  /** Whether this session reports to Braintrust at all (master switch). */
+  private get tracingEnabled(): boolean {
+    return this.reportingConfig?.traceToBraintrust === true;
+  }
+
+  // Dispatcher. The config event short-circuits (it configures reporting and
+  // must run before any spans). Every other hook (a) records any hook-only data
+  // it carries (root enrichment, subagent lifecycle), (b) catches up the
+  // relevant transcript into spans, and on a main-session Stop (c) ends the root
+  // span. Wrapped so transcript work can never break the turn.
+  async process(event: EnqueueEvent): Promise<void> {
     if (event.eventName === CODEX_CONFIG_EVENT) {
       this.configure(event);
       return;
     }
+
+    // First real event for this session: if a previous server run left a
+    // snapshot (idle shutdown mid-session, or an explicit close+resume), rebuild
+    // our state from it before processing. Runs once, regardless of which hook
+    // arrives first — the very next hook after a restart may not be SessionStart,
+    // and may not be preceded by a config event. Restore brings back the
+    // session's reporting config, so it MUST run before the master-switch check
+    // below (otherwise a resumed session with no leading config event would look
+    // untraced and we'd drop its events).
+    this.ensureRestored();
+
     // Master switch: when tracing is disabled, drop everything (no SDK calls).
     if (!this.tracingEnabled) {
       this.logger.debug("codex processor: tracing disabled; dropping event", {
@@ -78,30 +536,434 @@ export class CodexEventProcessor implements EventProcessor {
       });
       return;
     }
+
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+    const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined;
+
     if (event.eventName === SESSION_START) {
-      this.startRootSpan(event);
+      this.recordRootEnrichment(event);
+    }
+
+    // Compaction trigger is hook-only (not in the transcript). Record it keyed by
+    // turn_id, and if the compaction span was already built from the transcript
+    // (e.g. on a resumed session its compacted record is read during
+    // SessionStart's catch-up, before this hook arrives), back-fill the trigger
+    // onto it now. Order-independent: handleCompacted reads the map; this patches
+    // the span — whichever happens second reconciles.
+    if (event.eventName === PRE_COMPACT || event.eventName === POST_COMPACT) {
+      const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined;
+      const trigger = typeof data.trigger === "string" ? data.trigger : undefined;
+      if (turnId !== undefined && trigger !== undefined) {
+        this.compactionTriggerByTurn.set(turnId, trigger);
+        this.backfillCompactionTrigger(turnId, trigger);
+      }
+    }
+
+    // A subagent is starting: open its root span (under the spawn_agent tool that
+    // created it) and register a scope for its transcript so its hooks build a
+    // full turn/llm/tool hierarchy inside that span.
+    if (event.eventName === SUBAGENT_START) {
+      this.handleSubagentStart(event);
+    }
+
+    // Resolve the scope this hook belongs to (main vs. a specific subagent) by
+    // its transcript path, so each file keeps its own offset and open spans.
+    const scope = this.scopeForEvent(event);
+
+    // A turn-terminal signal: the turn is finishing, so mark it (flush, and the
+    // catch-up below, wait for this turn's task_complete, which Codex may write
+    // after the hook). Stop ends a normal turn; SubagentStop ends a subagent
+    // turn; PostCompact ends a compaction turn (which gets no Stop of its own).
+    if (isTurnTerminal(event.eventName) && scope !== null) {
+      const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined;
+      if (turnId !== undefined) scope.turnsAwaitingCompletion.add(turnId);
+    }
+
+    if (scope !== null) {
+      try {
+        await this.catchUpScope(scope, event);
+      } catch (err) {
+        this.logger.error("codex processor: transcript catch-up failed", {
+          queueId: this.queueId,
+          eventName: event.eventName,
+          error: String(err),
+        });
+      }
+    }
+
+    // After catching up the main transcript (which creates the spawn_agent tool
+    // span), map the resulting subagent's agent_id to that span so a following
+    // SubagentStart can nest the subagent under it. The PostToolUse's tool_use_id
+    // is the spawn_agent function_call's call_id; its tool_response carries the
+    // agent_id.
+    if (event.eventName === POST_TOOL_USE && agentId === undefined) {
+      this.recordSpawnedAgent(event);
+    }
+
+    // A subagent has finished: close its turns and root span.
+    if (event.eventName === SUBAGENT_STOP && agentId !== undefined) {
+      this.handleSubagentStop(event, agentId);
+    }
+
+    // End the (main) root span on the first main-session Stop. A SubagentStop
+    // must NOT end the root — the parent session is still running.
+    if (event.eventName === STOP && agentId === undefined) {
+      this.endRootSpan(this.mainScope?.lastTurnEndTime);
+    }
+  }
+
+  // Read transcript lines appended to this scope's file since its last read and
+  // turn each into spans. Never throws.
+  //
+  // On a turn-terminal hook (Stop/SubagentStop/PostCompact), first consume
+  // whatever's already on disk. If that closes the turn, we're done. Otherwise
+  // Codex hasn't written this turn's task_complete yet (it can lag the hook by
+  // seconds), so wait (bounded) for it — but only then, so a hook whose turn is
+  // already complete returns immediately. PostCompact matters specifically
+  // because a compaction turn gets no Stop, so without this its span would stay
+  // open until idle eviction (and may never be flushed).
+  private async catchUpScope(scope: TranscriptScope, event: EnqueueEvent): Promise<void> {
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+
+    // Initial read of everything available now.
+    const initial = this.transcriptReader.readFrom(scope.path, scope.offset);
+    scope.offset = initial.offset;
+    this.consumeLines(scope, initial.lines);
+
+    if (!isTurnTerminal(event.eventName)) return;
+
+    const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined;
+    // If the turn already closed during the initial read, no wait is needed.
+    if (turnId === undefined || !scope.openTurns.has(turnId)) return;
+
+    // Wait (bounded) for this turn's task_complete to be written.
+    const result = await this.transcriptReader.waitFor(
+      scope.path,
+      scope.offset,
+      (line) => this.isTaskCompleteFor(line, turnId),
+      { timeoutMs: SENTINEL_TIMEOUT_MS, intervalMs: SENTINEL_INTERVAL_MS },
+    );
+    scope.offset = result.offset;
+    this.consumeLines(scope, result.lines);
+    if (!result.sentinelFound) {
+      this.logger.warn("codex processor: task_complete sentinel not found; partial turn", {
+        queueId: this.queueId,
+        eventName: event.eventName,
+        turnId,
+      });
+    }
+  }
+
+  // ==========================================================================
+  // Transcript scopes
+  // ==========================================================================
+
+  // Resolve (creating if needed) the scope a hook belongs to, by its transcript
+  // path. A subagent hook (carrying agent_id) uses the subagent's transcript;
+  // its scope is created by SubagentStart and looked up here. Everything else is
+  // the main session. Returns null if the hook carries no usable transcript path
+  // or a subagent scope hasn't been registered yet.
+  private scopeForEvent(event: EnqueueEvent): TranscriptScope | null {
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+    const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined;
+    // SubagentStop reports the subagent file under agent_transcript_path (its
+    // transcript_path points at the PARENT session), so prefer that.
+    const path =
+      agentId !== undefined && typeof data.agent_transcript_path === "string"
+        ? data.agent_transcript_path
+        : typeof data.transcript_path === "string"
+          ? data.transcript_path
+          : undefined;
+    if (path === undefined) return null;
+
+    if (agentId === undefined) return this.ensureMainScope(path);
+
+    // Subagent hook: its scope must already exist (registered by SubagentStart).
+    const existing = this.scopes.get(path);
+    if (existing !== undefined) return existing;
+    this.logger.debug("codex processor: subagent hook before its scope exists; skipping", {
+      queueId: this.queueId,
+      agentId,
+    });
+    return null;
+  }
+
+  // The main session's scope, created on first use. Its turns hang under the
+  // processor's root span.
+  private ensureMainScope(path: string): TranscriptScope {
+    if (this.mainScope !== null) return this.mainScope;
+    const scope = this.newScope(path, "main", () => this.rootSpan);
+    this.mainScope = scope;
+    this.scopes.set(path, scope);
+    return scope;
+  }
+
+  private newScope(
+    path: string,
+    kind: "main" | "subagent",
+    parentSpan: () => Span | null,
+  ): TranscriptScope {
+    return {
+      path,
+      kind,
+      parentSpan,
+      offset: 0,
+      openTurns: new Map(),
+      turnsAwaitingCompletion: new Set(),
+      conversationHistory: [],
+      openLlm: null,
+      openTools: new Map(),
+      model: undefined,
+      lastTurnEndTime: undefined,
+      subagentRootSpan: null,
+      subagentEnded: false,
+    };
+  }
+
+  // ==========================================================================
+  // Subagent lifecycle
+  // ==========================================================================
+
+  // Record the agent_id produced by a spawn_agent tool, mapping it to that
+  // tool's span (located by call_id), so a later SubagentStart can parent the
+  // subagent's root span under the spawning tool. No-op for non-spawn tools or
+  // when the tool span isn't known.
+  private recordSpawnedAgent(event: EnqueueEvent): void {
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+    const toolName = typeof data.tool_name === "string" ? data.tool_name : undefined;
+    if (toolName !== SPAWN_AGENT_TOOL) return;
+    const callId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined;
+    if (callId === undefined) return;
+    const response = data.tool_response;
+    let agentId: string | undefined;
+    if (typeof response === "string") {
+      try {
+        const parsed = JSON.parse(response) as Record<string, unknown>;
+        if (typeof parsed.agent_id === "string") agentId = parsed.agent_id;
+      } catch {
+        // tool_response wasn't JSON; nothing to map.
+      }
+    } else if (response !== null && typeof response === "object") {
+      const ar = (response as Record<string, unknown>).agent_id;
+      if (typeof ar === "string") agentId = ar;
+    }
+    if (agentId === undefined) return;
+    const turnSpan = this.spawnTurnSpansByCallId.get(callId);
+    if (turnSpan === undefined) {
+      this.logger.debug("codex processor: spawn_agent PostToolUse with no known turn span", {
+        queueId: this.queueId,
+        callId,
+        agentId,
+      });
       return;
     }
-    if (event.eventName === USER_PROMPT_SUBMIT) {
-      this.startTurnSpan(event);
+    this.spawnTurnSpansByAgentId.set(agentId, turnSpan);
+  }
+
+  // A subagent is starting. Register a scope reading the subagent's own
+  // transcript so its turns/llm/tools build a full hierarchy. The subagent root
+  // span is created lazily from the subagent's session_meta (so it gets a real
+  // start time); we stash the parent here — the turn span that ran the
+  // spawn_agent tool (matched by agent_id from the spawn_agent PostToolUse), so
+  // the subagent root is a SIBLING of that spawn_agent tool span. Falls back to
+  // the main root span if that mapping isn't known.
+  private handleSubagentStart(event: EnqueueEvent): void {
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+    const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined;
+    const path = typeof data.transcript_path === "string" ? data.transcript_path : undefined;
+    if (agentId === undefined || path === undefined) {
+      this.logger.warn("codex processor: SubagentStart missing agent_id/transcript_path", {
+        queueId: this.queueId,
+      });
       return;
     }
-    if (event.eventName === STOP) {
-      this.endTurnSpan(event);
-      this.endRootSpan();
+    if (this.scopes.has(path)) return; // already registered
+
+    const parent = this.spawnTurnSpansByAgentId.get(agentId) ?? this.rootSpan;
+    if (parent === null) {
+      this.logger.warn("codex processor: SubagentStart with no parent span; skipping", {
+        queueId: this.queueId,
+        agentId,
+      });
       return;
     }
-    // Other events will attach child spans to the root span in later phases.
-    this.logger.debug("codex processor: no-op", {
+    const agentType = typeof data.agent_type === "string" ? data.agent_type : undefined;
+    const scope = this.newScope(path, "subagent", () => scope.subagentRootSpan);
+    scope.pendingSubagent = { agentId, agentType, parent };
+    this.scopes.set(path, scope);
+    this.logger.info("codex processor: registered subagent scope", {
       queueId: this.queueId,
-      eventName: event.eventName,
+      agentId,
     });
   }
 
+  // Create a subagent's root span from its session_meta record (sibling of the
+  // spawn_agent tool span — both under the spawning turn — with the record's
+  // timestamp as start time). Called once per subagent scope, when its
+  // session_meta is first consumed.
+  private startSubagentRootSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    const pending = scope.pendingSubagent;
+    if (pending === undefined || scope.subagentRootSpan !== null) return;
+    const startTime = isoToUnixSeconds(record.timestamp);
+    try {
+      const subagentName = `subagent: ${pending.agentId}`;
+      scope.subagentRootSpan = this.trackSpan(
+        pending.parent.startSpan({
+          name: subagentName,
+          type: "task",
+          ...(startTime !== undefined ? { startTime } : {}),
+          event: {
+            metadata: {
+              agent_id: pending.agentId,
+              agent_type: pending.agentType,
+              // The subagent's own rollout transcript on disk.
+              transcript_path: scope.path,
+            },
+          },
+        }),
+        subagentName,
+        "task",
+        startTime,
+      );
+      this.logger.info("codex processor: opened subagent root span", {
+        queueId: this.queueId,
+        agentId: pending.agentId,
+        spanId: scope.subagentRootSpan.id,
+      });
+    } catch (err) {
+      this.logger.error("codex processor: failed to open subagent root span", {
+        queueId: this.queueId,
+        agentId: pending.agentId,
+        error: String(err),
+      });
+    }
+  }
+
+  // A subagent has finished (its task_complete has been consumed by the catch-up
+  // in process()). Close any spans still open in its scope, then end its root
+  // span at the subagent's last turn end time.
+  private handleSubagentStop(event: EnqueueEvent, agentId: string): void {
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+    const path =
+      typeof data.agent_transcript_path === "string" ? data.agent_transcript_path : undefined;
+    const scope = path !== undefined ? this.scopes.get(path) : undefined;
+    if (scope === undefined || scope.subagentRootSpan === null) {
+      this.logger.debug("codex processor: SubagentStop with no scope/root span", {
+        queueId: this.queueId,
+        agentId,
+      });
+      return;
+    }
+    if (scope.subagentEnded) return;
+    scope.subagentEnded = true;
+    const endTime = scope.lastTurnEndTime;
+    const endArgs = endTime !== undefined ? { endTime } : undefined;
+    this.endOpenLlmSpan(scope, endTime);
+    this.closeAllOpenTools(scope, endArgs);
+    for (const [, turn] of scope.openTurns) {
+      try {
+        turn.span.end(endArgs);
+      } catch (err) {
+        this.logger.error("codex processor: failed to end dangling subagent turn span", {
+          queueId: this.queueId,
+          agentId,
+          error: String(err),
+        });
+      }
+    }
+    scope.openTurns.clear();
+    try {
+      scope.subagentRootSpan.end(endArgs);
+      this.logger.info("codex processor: ended subagent root span", {
+        queueId: this.queueId,
+        agentId,
+      });
+    } catch (err) {
+      this.logger.error("codex processor: failed to end subagent root span", {
+        queueId: this.queueId,
+        agentId,
+        error: String(err),
+      });
+    }
+  }
+
+  // Parse and process a batch of transcript lines into spans for the given
+  // scope, advancing nothing (the caller owns the offset). Used by catch-up and
+  // by the final flush read.
+  private consumeLines(scope: TranscriptScope, lines: string[]): void {
+    // A finalized subagent scope is inert: never re-open/mutate its spans (a
+    // re-read after SubagentStop would otherwise re-open already-closed spans).
+    if (scope.subagentEnded) return;
+    for (const line of lines) {
+      const record = parseTranscriptLine(line);
+      if (record === null) continue;
+      this.processTranscriptEvent(scope, record);
+    }
+  }
+
+  // Cheap check used as the Stop sentinel predicate: is this line a
+  // task_complete for `turnId` (or any task_complete when turnId is unknown)?
+  private isTaskCompleteFor(line: string, turnId: string | undefined): boolean {
+    const record = parseTranscriptLine(line);
+    if (record === null) return false;
+    if (record.type !== RECORD_EVENT_MSG || record.payload?.type !== EVT_TASK_COMPLETE) {
+      return false;
+    }
+    if (turnId === undefined) return true;
+    return record.payload?.turn_id === turnId;
+  }
+
+  // Turn one parsed transcript record into span operations for its scope.
+  // Unknown record types are ignored. Never throws; per-record failures logged.
+  private processTranscriptEvent(scope: TranscriptScope, record: TranscriptRecord): void {
+    try {
+      if (record.type === RECORD_SESSION_META) {
+        // The main session's session_meta opens the processor root span; a
+        // subagent's opens its (lazily created) subagent root span.
+        if (scope.kind === "main") this.startRootSpan(record);
+        else this.startSubagentRootSpan(scope, record);
+        return;
+      }
+      if (record.type === RECORD_TURN_CONTEXT) {
+        this.noteModel(scope, record);
+        return;
+      }
+      if (record.type === RECORD_COMPACTED) {
+        this.handleCompacted(scope, record);
+        return;
+      }
+      if (record.type === RECORD_EVENT_MSG) {
+        const ptype = record.payload?.type;
+        if (ptype === EVT_TASK_STARTED) this.startTurnSpan(scope, record);
+        else if (ptype === EVT_USER_MESSAGE) this.setTurnInput(scope, record);
+        else if (ptype === EVT_TASK_COMPLETE) this.endTurnSpan(scope, record);
+        else if (ptype === EVT_TOKEN_COUNT) this.closeLlmSpan(scope, record);
+        return;
+      }
+      if (record.type === RECORD_RESPONSE_ITEM) {
+        const ptype = record.payload?.type ?? "";
+        if (ptype === ITEM_MESSAGE) this.handleMessageItem(scope, record);
+        else if (ptype === ITEM_REASONING) this.handleReasoningItem(scope, record);
+        else if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(scope, record);
+        else if (TOOL_OUTPUT_TYPES.has(ptype)) this.endToolSpan(scope, record);
+        return;
+      }
+    } catch (err) {
+      this.logger.error("codex processor: failed to process transcript record", {
+        queueId: this.queueId,
+        type: record.type,
+        payloadType: record.payload?.type,
+        error: String(err),
+      });
+    }
+  }
+
+  // ==========================================================================
+  // Hook-driven configuration / enrichment
+  // ==========================================================================
+
   // Record the session's reporting config from the config event. The
-  // SpanFactory is built lazily (on first span) from this config. Arriving
-  // before the root span is created, this lets the session report to its own
-  // project/account.
+  // SpanFactory is built lazily (on first span) from this config.
   private configure(event: EnqueueEvent): void {
     if (this.spanFactoryInstance !== null) {
       this.logger.warn("codex processor: config event after factory built; ignoring", {
@@ -132,71 +994,263 @@ export class CodexEventProcessor implements EventProcessor {
     });
   }
 
-  /** Whether this session reports to Braintrust at all (master switch). */
-  private get tracingEnabled(): boolean {
-    return this.reportingConfig?.traceToBraintrust === true;
+  // Capture source/permission_mode from a SessionStart hook (the transcript
+  // lacks these). If the root span already exists, patch them onto it now.
+  private recordRootEnrichment(event: EnqueueEvent): void {
+    const data = (event.eventData ?? {}) as Record<string, unknown>;
+    const source = typeof data.source === "string" ? data.source : undefined;
+    const permissionMode =
+      typeof data.permission_mode === "string" ? data.permission_mode : undefined;
+    if (source !== undefined) this.rootEnrichment.source = source;
+    if (permissionMode !== undefined) this.rootEnrichment.permissionMode = permissionMode;
+
+    if (this.rootSpan !== null) {
+      try {
+        this.rootSpan.log({
+          metadata: {
+            source: this.rootEnrichment.source,
+            permission_mode: this.rootEnrichment.permissionMode,
+          },
+        });
+      } catch (err) {
+        this.logger.error("codex processor: failed to enrich root span", {
+          queueId: this.queueId,
+          error: String(err),
+        });
+      }
+    }
   }
 
-  async flush(): Promise<void> {
-    if (this.rootSpan === null) return;
+  // ==========================================================================
+  // Transcript-driven span building
+  // ==========================================================================
+
+  private startRootSpan(record: TranscriptRecord): void {
+    if (this.rootSpan !== null) {
+      this.logger.warn("codex processor: duplicate session_meta; keeping existing root span", {
+        queueId: this.queueId,
+      });
+      return;
+    }
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const cwd = typeof payload.cwd === "string" ? payload.cwd : undefined;
+    const sessionId = typeof payload.id === "string" ? payload.id : this.queueId;
+    const startTime = isoToUnixSeconds(record.timestamp);
+
+    const projectDir = projectDirName(cwd);
+    const spanName = projectDir ? `codex: ${projectDir}` : "codex session";
+
     try {
-      await this.rootSpan.flush();
-      this.logger.debug("codex processor: flush ok", { queueId: this.queueId });
+      this.rootSpan = this.trackSpan(
+        this.spanFactory.startSpan({
+          name: spanName,
+          type: "task",
+          ...(startTime !== undefined ? { startTime } : {}),
+          event: {
+            input: { model: this.mainScope?.model, cwd, source: this.rootEnrichment.source },
+            metadata: {
+              // User-provided extras first, so the standard keys below win.
+              ...this.reportingConfig?.additionalMetadata,
+              session_id: sessionId,
+              model: this.mainScope?.model,
+              cwd,
+              source: this.rootEnrichment.source,
+              permission_mode: this.rootEnrichment.permissionMode,
+              cli_version:
+                typeof payload.cli_version === "string" ? payload.cli_version : undefined,
+              project: this.reportingConfig?.project,
+              // The session's rollout transcript on disk (the source for this trace).
+              transcript_path: this.mainScope?.path,
+            },
+          },
+        }),
+        spanName,
+        "task",
+        startTime,
+      );
+      this.logger.info("codex processor: opened root span", {
+        queueId: this.queueId,
+        spanId: this.rootSpan.id,
+      });
     } catch (err) {
-      this.logger.error("codex processor: flush failed", {
+      this.logger.error("codex processor: failed to open root span", {
         queueId: this.queueId,
         error: String(err),
       });
     }
   }
 
-  // Open a child span for a turn on UserPromptSubmit. Keyed by turn_id so the
-  // matching Stop can close it. The user prompt is the span's input.
-  private startTurnSpan(event: EnqueueEvent): void {
-    if (this.rootSpan === null) {
-      this.logger.warn("codex processor: turn without a root span; ignoring", {
+  // Learn the scope's model from turn_context (session_meta has none). Backfill
+  // it onto the scope's parent span input/metadata the first time we see it.
+  private noteModel(scope: TranscriptScope, record: TranscriptRecord): void {
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const model = typeof payload.model === "string" ? payload.model : undefined;
+    if (model === undefined || scope.model !== undefined) return;
+    scope.model = model;
+    const parent = scope.parentSpan();
+    if (parent !== null) {
+      try {
+        parent.log({ input: { model }, metadata: { model } });
+      } catch (err) {
+        this.logger.error("codex processor: failed to backfill model", {
+          queueId: this.queueId,
+          error: String(err),
+        });
+      }
+    }
+  }
+
+  // Open a turn span on task_started, child of the scope's parent, by turn_id.
+  private startTurnSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    const parent = scope.parentSpan();
+    if (parent === null) {
+      this.logger.warn("codex processor: task_started without a parent span; ignoring", {
         queueId: this.queueId,
       });
       return;
     }
-
-    const data = (event.eventData ?? {}) as Record<string, unknown>;
-    const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined;
-    const prompt = typeof data.prompt === "string" ? data.prompt : undefined;
-    const model = typeof data.model === "string" ? data.model : undefined;
-
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined;
     if (turnId === undefined) {
-      this.logger.warn("codex processor: UserPromptSubmit without turn_id; skipping turn span", {
+      this.logger.warn("codex processor: task_started without turn_id; skipping turn span", {
         queueId: this.queueId,
       });
       return;
     }
-
-    if (this.openTurns.has(turnId)) {
+    if (scope.openTurns.has(turnId)) {
       this.logger.warn("codex processor: duplicate turn_id; keeping existing turn span", {
         queueId: this.queueId,
         turnId,
       });
       return;
     }
+    const startTime = isoToUnixSeconds(record.timestamp);
+    const turnName = `turn: ${turnId}`;
+    try {
+      const turnSpan = this.trackSpan(
+        parent.startSpan({
+          name: turnName,
+          type: "task",
+          ...(startTime !== undefined ? { startTime } : {}),
+          event: { metadata: { turn_id: turnId, model: scope.model } },
+        }),
+        turnName,
+        "task",
+        startTime,
+      );
+      scope.openTurns.set(turnId, { span: turnSpan, startTime, lastChildEndTime: undefined });
+      this.logger.info("codex processor: opened turn span", {
+        queueId: this.queueId,
+        turnId,
+        spanId: turnSpan.id,
+      });
+    } catch (err) {
+      this.logger.error("codex processor: failed to open turn span", {
+        queueId: this.queueId,
+        turnId,
+        error: String(err),
+      });
+    }
+  }
+
+  // Handle a `compacted` record: the current turn is a context compaction (Codex
+  // runs it as its own turn). Relabel that turn span as a "compaction" span with
+  // metadata (trigger from the PreCompact/PostCompact hook, the count of messages
+  // the compaction collapsed to, and the window_id) and a tag, then open an llm
+  // span representing the compaction model call. The following token_count closes
+  // that llm span with its token usage, so the trace shows what the compaction
+  // did: the conversation before it (input) and the kept context after (output).
+  private handleCompacted(scope: TranscriptScope, record: TranscriptRecord): void {
+    const turnId = this.latestOpenTurnId(scope);
+    const turn = this.latestOpenTurn(scope);
+    if (turnId === undefined || turn === undefined) {
+      this.logger.debug("codex processor: compacted record with no open turn span", {
+        queueId: this.queueId,
+      });
+      return;
+    }
+    const turnSpan = turn.span;
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const replacement = Array.isArray(payload.replacement_history)
+      ? (payload.replacement_history as unknown[])
+      : undefined;
+    const trigger = this.compactionTriggerByTurn.get(turnId);
 
+    // Relabel the turn span as a compaction span and annotate it. Keep the span
+    // so a PreCompact/PostCompact hook arriving later can back-fill the trigger.
+    this.compactionSpansByTurn.set(turnId, turnSpan);
     try {
-      const turnSpan = this.rootSpan.startSpan({
-        name: `turn: ${turnId}`,
+      turnSpan.setAttributes({ name: "compaction", type: "task" });
+      // Remember the rename so a resumed snapshot re-asserts "compaction", not
+      // the original "turn: ..." name. Preserve the span's original start time.
+      this.spanAttrs.set(turnSpan.spanId, {
+        name: "compaction",
         type: "task",
-        event: {
-          input: prompt,
-          metadata: { turn_id: turnId, model },
+        startTime: this.spanAttrs.get(turnSpan.spanId)?.startTime,
+      });
+      turnSpan.log({
+        metadata: {
+          compaction: {
+            trigger,
+            replaced_message_count: replacement?.length,
+            window_id: payload.window_id,
+          },
         },
+        tags: [COMPACTION_TAG],
       });
-      this.openTurns.set(turnId, turnSpan);
-      this.logger.info("codex processor: opened turn span", {
+    } catch (err) {
+      this.logger.error("codex processor: failed to annotate compaction span", {
         queueId: this.queueId,
         turnId,
-        spanId: turnSpan.id,
+        error: String(err),
       });
+    }
+
+    // Open an llm span for the compaction model call. To make it self-evident
+    // (and avoid the "model just echoed the user" look), shape input/output as
+    // before/after of the context window:
+    //   input  = the conversation history before compaction (+ its count)
+    //   output = the kept context after compaction (readable messages) plus a
+    //            clear marker for the summary, which Codex stores encrypted and
+    //            does not expose to us.
+    // Closed by the turn's token_count (metrics) like any other model call.
+    // Start the span at the turn's start (compaction is the sole child), not the
+    // record timestamp (which is when the compaction result lands), so its
+    // duration reflects the compaction call rather than looking instant.
+    const startTime = this.llmStartTimeFor(turn) ?? isoToUnixSeconds(record.timestamp);
+    const before = scope.conversationHistory.map((item) => ({ ...item }));
+    const output = compactionOutput(replacement);
+    const compactionLlmName = scope.model ?? "compaction";
+    try {
+      const span = this.trackSpan(
+        turnSpan.startSpan({
+          name: compactionLlmName,
+          type: "llm",
+          ...(startTime !== undefined ? { startTime } : {}),
+          event: {
+            input: { messages_before_compaction: before.length, history: before },
+            output,
+            metadata: { model: scope.model, turn_id: turnId, compaction: true },
+          },
+        }),
+        compactionLlmName,
+        "llm",
+        startTime,
+      );
+      // Reuse the open-llm slot so the turn's token_count closes it with usage.
+      // outputPreset: the replacement history is already logged as the output, so
+      // the closing token_count must not overwrite it. lastOutputTime is the
+      // compacted record's time (when the compaction output landed); the closing
+      // token_count provides metrics only.
+      scope.openLlm = {
+        span,
+        turnId,
+        output: [],
+        outputPreset: true,
+        lastOutputTime: isoToUnixSeconds(record.timestamp),
+      };
     } catch (err) {
-      this.logger.error("codex processor: failed to open turn span", {
+      this.logger.error("codex processor: failed to open compaction llm span", {
         queueId: this.queueId,
         turnId,
         error: String(err),
@@ -204,33 +1258,83 @@ export class CodexEventProcessor implements EventProcessor {
     }
   }
 
-  // Close the turn span matching the Stop's turn_id, recording the final
-  // assistant message as the span's output.
-  private endTurnSpan(event: EnqueueEvent): void {
-    const data = (event.eventData ?? {}) as Record<string, unknown>;
-    const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined;
-    const output =
-      typeof data.last_assistant_message === "string" ? data.last_assistant_message : undefined;
+  // Patch the trigger onto a compaction span when its PreCompact/PostCompact hook
+  // arrives after the span was already built from the transcript. No-op if the
+  // span isn't known yet (handleCompacted will read the trigger from the map).
+  private backfillCompactionTrigger(turnId: string, trigger: string): void {
+    const span = this.compactionSpansByTurn.get(turnId);
+    if (span === undefined) return;
+    try {
+      span.log({ metadata: { compaction: { trigger } } });
+    } catch (err) {
+      this.logger.error("codex processor: failed to back-fill compaction trigger", {
+        queueId: this.queueId,
+        turnId,
+        error: String(err),
+      });
+    }
+  }
 
-    if (turnId === undefined) {
-      this.logger.debug("codex processor: Stop without turn_id; no turn span to close", {
+  // Set the open turn's input from a user_message event (the prompt). The
+  // user_message carries no turn_id, so it applies to the most recently opened
+  // still-open turn in this scope.
+  private setTurnInput(scope: TranscriptScope, record: TranscriptRecord): void {
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const prompt = typeof payload.message === "string" ? payload.message : undefined;
+    if (prompt === undefined) return;
+    // Note: we do NOT add the prompt to conversationHistory here — the same
+    // prompt also arrives as a response_item user message (handleMessageItem),
+    // which is the canonical source for history. This event only sets the turn's
+    // input for display.
+    const turn = this.latestOpenTurn(scope);
+    if (turn === undefined) {
+      this.logger.debug("codex processor: user_message with no open turn span", {
         queueId: this.queueId,
       });
       return;
     }
+    try {
+      turn.span.log({ input: prompt });
+    } catch (err) {
+      this.logger.error("codex processor: failed to set turn input", {
+        queueId: this.queueId,
+        error: String(err),
+      });
+    }
+  }
 
-    const turnSpan = this.openTurns.get(turnId);
-    if (turnSpan === undefined) {
-      this.logger.debug("codex processor: Stop with no open turn span", {
+  // Close the turn span on task_complete, recording the final assistant message
+  // as output. Also closes any tool spans still open under that turn.
+  private endTurnSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined;
+    const output =
+      typeof payload.last_agent_message === "string" ? payload.last_agent_message : undefined;
+    if (turnId === undefined) {
+      this.logger.debug("codex processor: task_complete without turn_id", {
+        queueId: this.queueId,
+      });
+      return;
+    }
+    const turn = scope.openTurns.get(turnId);
+    if (turn === undefined) {
+      this.logger.debug("codex processor: task_complete with no open turn span", {
         queueId: this.queueId,
         turnId,
       });
       return;
     }
-
+    const turnSpan = turn.span;
+    const endTime = isoToUnixSeconds(record.timestamp);
+    // Backstop: close any spans still open without their own closing record
+    // (a trailing model call with no token_count, or a tool with no output) at
+    // the turn's end time.
+    this.endOpenLlmSpan(scope, endTime);
+    this.endOpenToolSpansForTurn(scope, turnId, endTime);
     try {
       turnSpan.log({ output });
-      turnSpan.end();
+      turnSpan.end(endTime !== undefined ? { endTime } : undefined);
+      if (endTime !== undefined) scope.lastTurnEndTime = endTime;
       this.logger.info("codex processor: ended turn span", { queueId: this.queueId, turnId });
     } catch (err) {
       this.logger.error("codex processor: failed to end turn span", {
@@ -239,17 +1343,475 @@ export class CodexEventProcessor implements EventProcessor {
         error: String(err),
       });
     } finally {
-      this.openTurns.delete(turnId);
+      scope.openTurns.delete(turnId);
+      scope.turnsAwaitingCompletion.delete(turnId);
+    }
+  }
+
+  // ==========================================================================
+  // LLM spans (transcript-driven)
+  //
+  // One LLM call = a run of reasoning/message (+ tool) items terminated by a
+  // token_count. The span opens lazily on the first model-output item (assistant
+  // message or reasoning) and closes at the next token_count, which carries the
+  // call's token usage. Tool calls the model emits are sibling tool spans under
+  // the turn (handled elsewhere); the assistant text/reasoning is the LLM span's
+  // output, and its input is the conversation reconstructed up to the open.
+  // ==========================================================================
+
+  // Ensure an LLM span is open for the scope's current model call, opening one
+  // (child of the active turn) on first use with input = the conversation so far.
+  private ensureLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    if (scope.openLlm !== null) return;
+    const turn = this.latestOpenTurn(scope);
+    if (turn === undefined) {
+      this.logger.debug("codex processor: model output with no open turn; skipping llm span", {
+        queueId: this.queueId,
+      });
+      return;
+    }
+    const turnId = this.latestOpenTurnId(scope);
+    // Start the span where the model's work for this call actually began — the
+    // end of the turn's most recently-ended child, or the turn's start when this
+    // is the first child — NOT the record timestamp (which is when the model's
+    // output landed, just before its token_count, yielding a near-instant span).
+    const startTime = this.llmStartTimeFor(turn) ?? isoToUnixSeconds(record.timestamp);
+    // Snapshot the conversation as this call's input (the transcript doesn't log
+    // the real request, so this is a best-effort reconstruction).
+    const input = scope.conversationHistory.map((item) => ({ ...item }));
+    const llmName = scope.model ?? "llm";
+    try {
+      const span = this.trackSpan(
+        turn.span.startSpan({
+          name: llmName,
+          type: "llm",
+          ...(startTime !== undefined ? { startTime } : {}),
+          event: { input, metadata: { model: scope.model, turn_id: turnId } },
+        }),
+        llmName,
+        "llm",
+        startTime,
+      );
+      scope.openLlm = { span, turnId, output: [], lastOutputTime: startTime };
+      this.logger.info("codex processor: opened llm span", {
+        queueId: this.queueId,
+        turnId,
+        spanId: span.id,
+      });
+    } catch (err) {
+      this.logger.error("codex processor: failed to open llm span", {
+        queueId: this.queueId,
+        error: String(err),
+      });
+    }
+  }
+
+  // Record that the open LLM call produced an output item at `record`'s time,
+  // advancing its lastOutputTime. The LLM span ends at this time (when the model
+  // last generated), not at the closing token_count (which Codex writes after
+  // the tool result). No-op if no LLM span is open.
+  private noteLlmOutputTime(scope: TranscriptScope, record: TranscriptRecord): void {
+    if (scope.openLlm === null) return;
+    const t = isoToUnixSeconds(record.timestamp);
+    if (t === undefined) return;
+    if (scope.openLlm.lastOutputTime === undefined || t > scope.openLlm.lastOutputTime) {
+      scope.openLlm.lastOutputTime = t;
+    }
+  }
+
+  // Handle a response_item message: an assistant message is this call's output
+  // (opens/feeds the LLM span); user/developer messages are input context. All
+  // are appended to conversation history as chat messages for reconstructing
+  // later calls' inputs.
+  private handleMessageItem(scope: TranscriptScope, record: TranscriptRecord): void {
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const rawRole = typeof payload.role === "string" ? payload.role : "user";
+    const role: ChatMessage["role"] =
+      rawRole === "assistant" || rawRole === "developer" || rawRole === "tool" ? rawRole : "user";
+    const text = messageText(payload.content);
+    if (text === undefined) return;
+
+    const msg: ChatMessage = { role, content: text };
+    if (role === "assistant") {
+      this.ensureLlmSpan(scope, record);
+      scope.openLlm?.output.push(msg);
+      this.noteLlmOutputTime(scope, record);
+    }
+    scope.conversationHistory.push(msg);
+  }
+
+  // Handle a reasoning item: it belongs to the current model call. Open the LLM
+  // span and, when the model exposed a readable reasoning summary, surface it as
+  // a `reasoning` item in the call's output and in the conversation history (so
+  // subsequent calls' input shows the prior thinking too). The full reasoning is
+  // encrypted, so items with no readable summary only open the span.
+  private handleReasoningItem(scope: TranscriptScope, record: TranscriptRecord): void {
+    this.ensureLlmSpan(scope, record);
+    // The reasoning item is a model output at this time, even when its summary is
+    // encrypted (empty) and adds nothing readable — so advance the span's end.
+    this.noteLlmOutputTime(scope, record);
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const summary = reasoningSummary(payload.summary);
+    if (summary === undefined) return;
+    const item: ReasoningItem = { type: "reasoning", summary };
+    scope.openLlm?.output.push(item);
+    scope.conversationHistory.push(item);
+  }
+
+  // Close the current LLM call on a token_count (the segment boundary): end the
+  // LLM span with token-usage metrics and its accumulated output. (Tool spans
+  // are opened/closed by their own records, so nothing to flush here.)
+  private closeLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    if (scope.openLlm === null) return;
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const info = (payload.info ?? {}) as Record<string, unknown>;
+    const usage = (info.last_token_usage ?? {}) as Record<string, unknown>;
+    const metrics = tokenMetrics(usage);
+    const { span, turnId, output, outputPreset, lastOutputTime } = scope.openLlm;
+    // End the span when the model last generated (its last output item), NOT at
+    // this token_count: Codex writes the token_count after the tool result (at
+    // the same instant as the tool output), so using it would stretch the span to
+    // swallow the tool's execution — making the llm span overlap the tool it
+    // called, which is causally impossible. The token_count only supplies metrics.
+    // Fall back to the token_count time if we somehow have no output time.
+    const endTime = lastOutputTime ?? isoToUnixSeconds(record.timestamp);
+    try {
+      span.log(outputPreset ? { metrics } : { output: llmOutput(output), metrics });
+      span.end(endTime !== undefined ? { endTime } : undefined);
+      // This LLM call is a child of its turn; advance the turn's boundary so the
+      // next child (LLM or tool) starts where this one ended (its last output).
+      this.noteChildEnded(scope, turnId, endTime);
+      this.logger.info("codex processor: ended llm span", { queueId: this.queueId });
+    } catch (err) {
+      this.logger.error("codex processor: failed to end llm span", {
+        queueId: this.queueId,
+        error: String(err),
+      });
+    } finally {
+      scope.openLlm = null;
     }
   }
 
-  // End the root span on the first Stop event. Subsequent Stops are ignored
-  // (the SDK keeps the first end time anyway).
-  private endRootSpan(): void {
+  // End any open LLM span without a closing token_count (e.g. on turn/root end).
+  // Prefer the model's last-output time (when generation actually ended) over the
+  // caller's fallback endTime (the turn/root end), so a trailing call isn't
+  // stretched to the turn boundary.
+  private endOpenLlmSpan(scope: TranscriptScope, endTime: number | undefined): void {
+    if (scope.openLlm === null) return;
+    const { span, turnId, output, lastOutputTime } = scope.openLlm;
+    const effectiveEnd = lastOutputTime ?? endTime;
+    try {
+      if (output.length > 0) span.log({ output: llmOutput(output) });
+      span.end(effectiveEnd !== undefined ? { endTime: effectiveEnd } : undefined);
+      this.noteChildEnded(scope, turnId, effectiveEnd);
+    } catch (err) {
+      this.logger.error("codex processor: failed to end dangling llm span", {
+        queueId: this.queueId,
+        error: String(err),
+      });
+    } finally {
+      scope.openLlm = null;
+    }
+  }
+
+  // Open a tool span (child of its turn) for a tool-call response_item, keyed by
+  // call_id so the matching *_output can close it. Each tool span is opened and
+  // closed by its own records (no cross-segment buffering), which keeps it robust
+  // against the transcript being read in pieces. Tool spans are ordered by their
+  // transcript start time, so they render between the LLM calls around them.
+  private startToolSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const callId = typeof payload.call_id === "string" ? payload.call_id : undefined;
+    // Most tool calls carry `name`; some (e.g. tool_search_call) don't, so fall
+    // back to the record subtype with the "_call" suffix stripped.
+    const ptype = typeof payload.type === "string" ? payload.type : undefined;
+    const name =
+      typeof payload.name === "string"
+        ? payload.name
+        : ptype !== undefined
+          ? ptype.replace(/_call$/, "")
+          : undefined;
+    const meta = (payload.metadata ?? {}) as Record<string, unknown>;
+    const turnId = typeof meta.turn_id === "string" ? meta.turn_id : this.latestOpenTurnId(scope);
+    // function_call uses `arguments` (a JSON string); custom_tool_call uses `input`.
+    const input = payload.arguments ?? payload.input;
+
+    // Record the tool call as an assistant message carrying a tool_call in the
+    // conversation history (input context for later calls). Done regardless of
+    // whether we end up creating a span, so reconstructed history stays faithful.
+    const argsString = typeof input === "string" ? input : JSON.stringify(input ?? {});
+    const toolCallMsg: ChatMessage = {
+      role: "assistant",
+      content: null,
+      tool_calls: [
+        {
+          id: callId ?? "",
+          type: "function",
+          function: { name: name ?? "tool", arguments: argsString },
+        },
+      ],
+    };
+    scope.conversationHistory.push(toolCallMsg);
+
+    if (callId === undefined) {
+      this.logger.warn("codex processor: tool call without call_id; skipping tool span", {
+        queueId: this.queueId,
+        turnId,
+      });
+      return;
+    }
+    if (turnId === undefined) {
+      this.logger.warn("codex processor: tool call without turn_id; skipping tool span", {
+        queueId: this.queueId,
+        callId,
+      });
+      return;
+    }
+    const turn = scope.openTurns.get(turnId);
+    if (turn === undefined) {
+      this.logger.warn("codex processor: tool call with no open turn span; skipping", {
+        queueId: this.queueId,
+        turnId,
+        callId,
+      });
+      return;
+    }
+    const turnSpan = turn.span;
+
+    // A tool call IS a model output, so it belongs to the current LLM call. Open
+    // the LLM span if one isn't already open (a call whose only output is a tool
+    // call — no preceding assistant message or readable reasoning — would
+    // otherwise leave the model's work, often many seconds, with no llm span,
+    // appearing as an unaccounted gap at the front of the turn). Record the tool
+    // call as this LLM call's output too. Done only once we know the tool call is
+    // valid (real open turn), so a tool call for an unknown turn opens nothing.
+    this.ensureLlmSpan(scope, record);
+    scope.openLlm?.output.push(toolCallMsg);
+    // The model emitted this tool call now — that's when its generation ended.
+    this.noteLlmOutputTime(scope, record);
+    if (scope.openTools.has(callId)) {
+      this.logger.warn("codex processor: duplicate call_id; keeping existing tool span", {
+        queueId: this.queueId,
+        callId,
+      });
+      return;
+    }
+    // If this call requested escalated permissions (Codex records the request
+    // inline in the arguments), surface it on the span as metadata and a tag.
+    const permission = permissionInfo(input);
+    const startTime = isoToUnixSeconds(record.timestamp);
+    const toolName = name ?? "tool";
+    try {
+      const toolSpan = this.trackSpan(
+        turnSpan.startSpan({
+          name: toolName,
+          type: "tool",
+          ...(startTime !== undefined ? { startTime } : {}),
+          event: {
+            input,
+            metadata: { tool_name: name, call_id: callId, turn_id: turnId, permission },
+            ...(permission !== undefined ? { tags: [PERMISSION_TAG] } : {}),
+          },
+        }),
+        toolName,
+        "tool",
+        startTime,
+      );
+      scope.openTools.set(callId, { span: toolSpan, turnId });
+      // Remember the TURN span that ran a spawn_agent tool (main scope only) so a
+      // subagent's root span can be parented as a sibling of the tool (both under
+      // this turn). The agent_id isn't in the transcript; it arrives on the
+      // spawn_agent PostToolUse hook, which we map to this turn by call_id.
+      if (scope.kind === "main" && name === SPAWN_AGENT_TOOL) {
+        this.spawnTurnSpansByCallId.set(callId, turnSpan);
+      }
+      this.logger.info("codex processor: opened tool span", {
+        queueId: this.queueId,
+        turnId,
+        callId,
+        spanId: toolSpan.id,
+      });
+    } catch (err) {
+      this.logger.error("codex processor: failed to open tool span", {
+        queueId: this.queueId,
+        turnId,
+        callId,
+        error: String(err),
+      });
+    }
+  }
+
+  // Close the tool span matching a *_output record's call_id, recording the
+  // output, and add the result to conversation history as a `tool` message.
+  private endToolSpan(scope: TranscriptScope, record: TranscriptRecord): void {
+    const payload = (record.payload ?? {}) as Record<string, unknown>;
+    const callId = typeof payload.call_id === "string" ? payload.call_id : undefined;
+    const output = payload.output ?? payload.result;
+    scope.conversationHistory.push({
+      role: "tool",
+      content: typeof output === "string" ? output : JSON.stringify(output ?? null),
+      tool_call_id: callId ?? "",
+    });
+    if (callId === undefined) {
+      this.logger.debug("codex processor: tool output without call_id", {
+        queueId: this.queueId,
+      });
+      return;
+    }
+    const entry = scope.openTools.get(callId);
+    if (entry === undefined) {
+      this.logger.debug("codex processor: tool output with no open tool span", {
+        queueId: this.queueId,
+        callId,
+      });
+      return;
+    }
+    const endTime = isoToUnixSeconds(record.timestamp);
+    try {
+      entry.span.log({ output });
+      entry.span.end(endTime !== undefined ? { endTime } : undefined);
+      // A tool is a child of its turn; advance the turn's boundary so a following
+      // LLM call (the model resuming after the tool result) starts here.
+      this.noteChildEnded(scope, entry.turnId, endTime);
+      this.logger.info("codex processor: ended tool span", { queueId: this.queueId, callId });
+    } catch (err) {
+      this.logger.error("codex processor: failed to end tool span", {
+        queueId: this.queueId,
+        callId,
+        error: String(err),
+      });
+    } finally {
+      scope.openTools.delete(callId);
+    }
+  }
+
+  // End any still-open tool spans owned by the given turn (e.g. a call whose
+  // output never arrived) when the turn ends, at the turn's end time.
+  private endOpenToolSpansForTurn(
+    scope: TranscriptScope,
+    turnId: string,
+    endTime: number | undefined,
+  ): void {
+    for (const [callId, entry] of scope.openTools) {
+      if (entry.turnId !== turnId) continue;
+      try {
+        entry.span.end(endTime !== undefined ? { endTime } : undefined);
+      } catch (err) {
+        this.logger.error("codex processor: failed to end dangling tool span", {
+          queueId: this.queueId,
+          turnId,
+          callId,
+          error: String(err),
+        });
+      } finally {
+        scope.openTools.delete(callId);
+      }
+    }
+  }
+
+  // Close every still-open tool span in a scope (used when ending a subagent or
+  // the root). Clears the scope's openTools.
+  private closeAllOpenTools(
+    scope: TranscriptScope,
+    endArgs: { endTime: number } | undefined,
+  ): void {
+    for (const [callId, entry] of scope.openTools) {
+      try {
+        entry.span.end(endArgs);
+      } catch (err) {
+        this.logger.error("codex processor: failed to end dangling tool span", {
+          queueId: this.queueId,
+          callId,
+          error: String(err),
+        });
+      }
+    }
+    scope.openTools.clear();
+  }
+
+  // The most recently opened turn that is still open in this scope (used to
+  // attach a user_message or an LLM span, which carry no turn_id).
+  private latestOpenTurn(scope: TranscriptScope): OpenTurn | undefined {
+    let last: OpenTurn | undefined;
+    for (const turn of scope.openTurns.values()) last = turn;
+    return last;
+  }
+
+  // The start time (Unix seconds) to use for an LLM span opening now under the
+  // given turn: the end of the turn's most recently-ended child if any, else the
+  // turn's own start. This makes the LLM span cover request->response (the
+  // transcript has no record at request-send time) and lets consecutive children
+  // tile the turn. Per-turn state, so concurrent subagents never interfere.
+  private llmStartTimeFor(turn: OpenTurn): number | undefined {
+    return turn.lastChildEndTime ?? turn.startTime;
+  }
+
+  // Record that a child of the given turn ended at `endTime`, advancing the
+  // turn's lastChildEndTime so the next LLM span starts where this child ended.
+  // No-op when endTime is unknown or the turn isn't open.
+  private noteChildEnded(
+    scope: TranscriptScope,
+    turnId: string | undefined,
+    endTime: number | undefined,
+  ): void {
+    if (turnId === undefined || endTime === undefined) return;
+    const turn = scope.openTurns.get(turnId);
+    if (turn === undefined) return;
+    if (turn.lastChildEndTime === undefined || endTime > turn.lastChildEndTime) {
+      turn.lastChildEndTime = endTime;
+    }
+  }
+
+  // The turn_id of the most recently opened still-open turn in this scope.
+  private latestOpenTurnId(scope: TranscriptScope): string | undefined {
+    let last: string | undefined;
+    for (const turnId of scope.openTurns.keys()) last = turnId;
+    return last;
+  }
+
+  // Number of turns (across all scopes) whose Stop has fired but which are still
+  // open (awaiting their task_complete). flush() polls only while this is > 0.
+  private countPendingTurns(): number {
+    let n = 0;
+    for (const scope of this.scopes.values()) {
+      for (const turnId of scope.turnsAwaitingCompletion) {
+        if (scope.openTurns.has(turnId)) n += 1;
+      }
+    }
+    return n;
+  }
+
+  // End the root span on the first main-session Stop. Subsequent Stops are
+  // ignored (the SDK records a span's end time once). `endTime` is the completing
+  // turn's end time so the root isn't stamped with a late wall-clock value; falls
+  // back to the SDK's now() if unknown. Any still-open spans in the main scope
+  // are closed at the same time so nothing dangles.
+  private endRootSpan(endTime: number | undefined): void {
     if (this.rootSpan === null || this.rootEnded) return;
     this.rootEnded = true;
+    this.rootEndTime = endTime;
+    const endArgs = endTime !== undefined ? { endTime } : undefined;
+    const scope = this.mainScope;
+    // Backstop: close any open LLM span, any still-open tool spans, then any
+    // still-open turn spans in the main scope, so nothing dangles.
+    if (scope !== null) {
+      this.endOpenLlmSpan(scope, endTime);
+      this.closeAllOpenTools(scope, endArgs);
+      for (const [turnId, turn] of scope.openTurns) {
+        try {
+          turn.span.end(endArgs);
+        } catch (err) {
+          this.logger.error("codex processor: failed to end dangling turn span on root end", {
+            queueId: this.queueId,
+            turnId,
+            error: String(err),
+          });
+        }
+      }
+      scope.openTurns.clear();
+    }
     try {
-      this.rootSpan.end();
+      this.rootSpan.end(endArgs);
       this.logger.info("codex processor: ended root span", { queueId: this.queueId });
     } catch (err) {
       this.logger.error("codex processor: failed to end root span", {
@@ -257,58 +1819,358 @@ export class CodexEventProcessor implements EventProcessor {
         error: String(err),
       });
     }
+    // NOTE: we deliberately do NOT delete the snapshot here. Ending the root span
+    // means "this turn's Stop fired", not "the session is over" — Codex has no
+    // session-end hook, and later turns attach as children of this same root. The
+    // snapshot must persist so a restart between turns can resume the session;
+    // stale ones are reclaimed by the store's age-based GC.
   }
 
-  private startRootSpan(event: EnqueueEvent): void {
-    if (this.rootSpan !== null) {
-      this.logger.warn("codex processor: duplicate SessionStart; keeping existing root span", {
+  async flush(): Promise<void> {
+    if (this.rootSpan === null) return;
+    // Final catch-up: a turn's task_complete can land slightly after its Stop
+    // hook fires, so the Stop's bounded wait may miss it and leave the turn open.
+    // flush() happens after the terminal Stop's /flush (and on idle/eviction).
+    // Poll every scope's transcript until every open turn has closed (or a
+    // bounded timeout elapses), processing new records as they appear. This
+    // processes the transcript (it does not unilaterally end spans), staying
+    // consistent with "the transcript is the truth".
+    await this.drainOpenTurns();
+    try {
+      await this.rootSpan.flush();
+      this.logger.debug("codex processor: flush ok", { queueId: this.queueId });
+    } catch (err) {
+      this.logger.error("codex processor: flush failed", {
         queueId: this.queueId,
+        error: String(err),
       });
-      return;
     }
+    // Persist state so a future server run can resume this session's trace. Done
+    // after the catch-up + flush above so the snapshot reflects the latest
+    // offsets and closed spans.
+    this.persistState();
+  }
 
-    const data = (event.eventData ?? {}) as Record<string, unknown>;
-    const model = typeof data.model === "string" ? data.model : undefined;
-    const cwd = typeof data.cwd === "string" ? data.cwd : undefined;
-    const source = typeof data.source === "string" ? data.source : undefined;
-    const permissionMode =
-      typeof data.permission_mode === "string" ? data.permission_mode : undefined;
-
-    // Name the root span after the directory Codex was launched from, e.g.
-    // "codex: myapp" for /whatever/myapp. Falls back to "codex session" when cwd
-    // is unknown.
-    const projectDir = projectDirName(cwd);
-    const spanName = projectDir ? `codex: ${projectDir}` : "codex session";
+  // ==========================================================================
+  // State persistence (resume support)
+  // ==========================================================================
 
+  // Persist this session's snapshot so a future server run can resume the trace.
+  // Best-effort: the store never throws. No-op without a store/session id, or
+  // before any root span exists (nothing worth resuming yet).
+  //
+  // We persist even once the root span has ended. Codex's "Stop" is per-TURN,
+  // not per-session (there is no session-end hook), and we end the root on the
+  // first Stop — but the session lives on and more turns can follow, each a
+  // child of that same (ended) root. So the snapshot must survive past root-end,
+  // or a server restart between turns would lose the thread and start a brand-new
+  // trace. Stale snapshots from sessions that truly never resume are reclaimed by
+  // the store's age-based GC, not by deleting here.
+  private persistState(): void {
+    if (this.snapshotStore === null || this.queueId === null) return;
+    if (this.rootSpan === null) return; // nothing worth resuming yet
     try {
-      this.rootSpan = this.spanFactory.startSpan({
-        name: spanName,
-        type: "task",
-        event: {
-          input: { model, cwd, source },
-          metadata: {
-            // User-provided extras first, so the standard keys below win on
-            // conflict.
-            ...this.reportingConfig?.additionalMetadata,
-            session_id: this.queueId,
-            model,
-            cwd,
-            source,
-            permission_mode: permissionMode,
-            event_source_version: event.eventSourceVersion,
-            project: this.reportingConfig?.project,
-          },
-        },
+      this.snapshotStore.write(this.queueId, this.serialize());
+    } catch (err) {
+      this.logger.error("codex processor: failed to serialize state", {
+        queueId: this.queueId,
+        error: String(err),
       });
-      this.logger.info("codex processor: opened root span", {
+    }
+  }
+
+  // One-time restore from a snapshot left by a previous server run. Runs lazily
+  // on the first event (after reporting is configured, so the span factory and
+  // tracing-enabled check are settled). Guarded so it happens at most once.
+  private ensureRestored(): void {
+    if (this.restoreAttempted) return;
+    this.restoreAttempted = true;
+    if (this.snapshotStore === null || this.queueId === null) return;
+    // If we've already built a root span this run, there's nothing to restore
+    // (we're the original processor, not a post-restart one).
+    if (this.rootSpan !== null) return;
+    const snapshot = this.snapshotStore.read(this.queueId);
+    if (snapshot === null) return;
+    if (!isCompatibleSnapshot(snapshot, PLUGIN_VERSION)) {
+      this.logger.info("codex processor: snapshot version mismatch; starting fresh", {
         queueId: this.queueId,
-        spanId: this.rootSpan.id,
+        snapshotVersion: snapshot.pluginVersion,
+        snapshotSchema: snapshot.schemaVersion,
+      });
+      this.snapshotStore.delete(this.queueId);
+      return;
+    }
+    try {
+      this.restore(snapshot);
+      this.logger.info("codex processor: restored session state from snapshot", {
+        queueId: this.queueId,
+        scopes: snapshot.scopes.length,
       });
     } catch (err) {
-      this.logger.error("codex processor: failed to open root span", {
+      this.logger.error("codex processor: failed to restore snapshot; starting fresh", {
         queueId: this.queueId,
         error: String(err),
       });
     }
   }
+
+  // Remember the name/type a span was created (or renamed) with, so it can be
+  // re-asserted on rehydration. Returns the span unchanged for call-site
+  // convenience.
+  private trackSpan(
+    span: Span,
+    name: string,
+    type: SpanRef["type"],
+    startTime: number | undefined,
+  ): Span {
+    this.spanAttrs.set(span.spanId, { name, type, startTime });
+    return span;
+  }
+
+  // Capture a span's identity plus its remembered name/type/start for the
+  // snapshot, so a rehydrated handle keeps them (and isn't re-stamped to the
+  // resume time).
+  private refFor(span: Span): SpanRef {
+    const attrs = this.spanAttrs.get(span.spanId);
+    return spanRef(span, attrs?.name, attrs?.type, attrs?.startTime);
+  }
+
+  // Build a serializable snapshot of all resumable state. Span handles are
+  // captured as identities (spanId/rootSpanId/parents) plus their name/type; the
+  // transcript on disk remains the source of truth for content, so nothing here
+  // re-derives spans.
+  private serialize(): CodexSnapshot {
+    const scopes: ScopeSnapshot[] = [];
+    for (const scope of this.scopes.values()) {
+      scopes.push({
+        path: scope.path,
+        kind: scope.kind,
+        offset: scope.offset,
+        openTurns: Array.from(scope.openTurns, ([turnId, turn]) => ({
+          turnId,
+          span: this.refFor(turn.span),
+          startTime: turn.startTime,
+          lastChildEndTime: turn.lastChildEndTime,
+        })),
+        turnsAwaitingCompletion: Array.from(scope.turnsAwaitingCompletion),
+        conversationHistory: toItemSnapshots(scope.conversationHistory),
+        openLlm:
+          scope.openLlm === null
+            ? null
+            : {
+                span: this.refFor(scope.openLlm.span),
+                turnId: scope.openLlm.turnId,
+                output: toItemSnapshots(scope.openLlm.output),
+                lastOutputTime: scope.openLlm.lastOutputTime,
+                outputPreset: scope.openLlm.outputPreset,
+              },
+        openTools: Array.from(scope.openTools, ([callId, entry]) => ({
+          callId,
+          span: this.refFor(entry.span),
+          turnId: entry.turnId,
+        })),
+        model: scope.model,
+        lastTurnEndTime: scope.lastTurnEndTime,
+        subagentRootSpan:
+          scope.subagentRootSpan === null ? null : this.refFor(scope.subagentRootSpan),
+        subagentEnded: scope.subagentEnded,
+        ...(scope.pendingSubagent !== undefined
+          ? {
+              pendingSubagent: {
+                agentId: scope.pendingSubagent.agentId,
+                agentType: scope.pendingSubagent.agentType,
+                parent: this.refFor(scope.pendingSubagent.parent),
+              },
+            }
+          : {}),
+      });
+    }
+    return {
+      pluginVersion: PLUGIN_VERSION,
+      schemaVersion: SNAPSHOT_SCHEMA_VERSION,
+      sessionId: this.queueId,
+      savedAt: Date.now(),
+      reportingConfig: redactReportingConfig(this.reportingConfig),
+      rootSpan: this.rootSpan === null ? null : this.refFor(this.rootSpan),
+      rootEnded: this.rootEnded,
+      rootEndTime: this.rootEndTime,
+      rootEnrichment: { ...this.rootEnrichment },
+      mainScopePath: this.mainScope?.path ?? null,
+      scopes,
+      // We deliberately do NOT persist the span-bearing side-maps
+      // (spawnTurnSpansBy*, compactionSpansByTurn). They retain references to
+      // spans that have since CLOSED, and rehydrating a closed span re-opens it
+      // (the SDK stamps a fresh start and no end), producing duplicate, never-
+      // closing spans on resume. The only things they enable across a restart are
+      // rare in-flight reconciliations (nesting a subagent spawned right before
+      // the restart; back-filling a compaction trigger whose hook lands after the
+      // restart) — acceptable to lose. The trigger map is plain strings (no span
+      // refs), so it's harmless to keep.
+      compactionTriggerByTurn: Array.from(this.compactionTriggerByTurn, ([key, value]) => ({
+        key,
+        value,
+      })),
+    };
+  }
+
+  // Rebuild live state from a snapshot. Span handles are recreated bound to their
+  // original ids (so further log()/end() merge into the same rows). Each scope's
+  // lazy parent-span resolver is reconstructed the same way newScope/
+  // handleSubagentStart do, so subsequent transcript records attach correctly.
+  private restore(snapshot: CodexSnapshot): void {
+    // Reporting config first: it determines the span factory (project/creds).
+    // The snapshot is the source of truth — we adopt it as-is rather than
+    // merging in this run's config, so a resume whose env/settings drifted from
+    // the original session can't clobber the snapshot's values. The one field
+    // the snapshot can't carry is the `apiKey` secret (intentionally never
+    // persisted), so we re-attach the live one resolved from this run's config
+    // event / env. This must happen before we touch `this.spanFactory`, which is
+    // built lazily from `this.reportingConfig`.
+    if (snapshot.reportingConfig !== undefined) {
+      this.reportingConfig = {
+        ...snapshot.reportingConfig,
+        apiKey: this.reportingConfig?.apiKey,
+      };
+    }
+
+    const factory = this.spanFactory;
+    // Rehydrate a handle and re-remember its name/type/start, so if this resumed
+    // processor is itself snapshotted again, the attributes survive the next hop.
+    const rehydrate = (ref: SpanRef): Span => {
+      const span = factory.rehydrateSpan(ref);
+      if (ref.name !== undefined && ref.type !== undefined) {
+        this.spanAttrs.set(span.spanId, {
+          name: ref.name,
+          type: ref.type,
+          startTime: ref.startTime,
+        });
+      }
+      return span;
+    };
+
+    this.rootSpan = snapshot.rootSpan === null ? null : rehydrate(snapshot.rootSpan);
+    this.rootEnded = snapshot.rootEnded;
+    this.rootEndTime = snapshot.rootEndTime;
+    this.rootEnrichment = { ...snapshot.rootEnrichment };
+    // Rehydrating the root necessarily re-emits its row with no end time. If the
+    // root was already ended before the restart, re-assert that end now so it
+    // stays closed (a later turn can still attach as a child of an ended root).
+    if (this.rootEnded && this.rootSpan !== null) {
+      try {
+        this.rootSpan.end(
+          this.rootEndTime !== undefined ? { endTime: this.rootEndTime } : undefined,
+        );
+      } catch (err) {
+        this.logger.error("codex processor: failed to re-end root span on restore", {
+          queueId: this.queueId,
+          error: String(err),
+        });
+      }
+    }
+
+    for (const s of snapshot.scopes) {
+      const isMain = s.path === snapshot.mainScopePath;
+      // The lazy parent resolver mirrors live construction: a main scope's turns
+      // hang under the root span; a subagent scope's under its own (restored)
+      // subagent root span.
+      const scope: TranscriptScope = {
+        path: s.path,
+        kind: s.kind,
+        parentSpan: isMain ? () => this.rootSpan : () => scope.subagentRootSpan,
+        offset: s.offset,
+        openTurns: new Map(
+          s.openTurns.map((t) => [
+            t.turnId,
+            {
+              span: rehydrate(t.span),
+              startTime: t.startTime,
+              lastChildEndTime: t.lastChildEndTime,
+            },
+          ]),
+        ),
+        turnsAwaitingCompletion: new Set(s.turnsAwaitingCompletion),
+        conversationHistory: fromItemSnapshots(s.conversationHistory),
+        openLlm:
+          s.openLlm === null
+            ? null
+            : {
+                span: rehydrate(s.openLlm.span),
+                turnId: s.openLlm.turnId,
+                output: fromItemSnapshots(s.openLlm.output),
+                lastOutputTime: s.openLlm.lastOutputTime,
+                outputPreset: s.openLlm.outputPreset,
+              },
+        openTools: new Map(
+          s.openTools.map((t) => [t.callId, { span: rehydrate(t.span), turnId: t.turnId }]),
+        ),
+        model: s.model,
+        lastTurnEndTime: s.lastTurnEndTime,
+        subagentRootSpan: s.subagentRootSpan === null ? null : rehydrate(s.subagentRootSpan),
+        subagentEnded: s.subagentEnded,
+        ...(s.pendingSubagent !== undefined
+          ? {
+              pendingSubagent: {
+                agentId: s.pendingSubagent.agentId,
+                agentType: s.pendingSubagent.agentType,
+                parent: rehydrate(s.pendingSubagent.parent),
+              },
+            }
+          : {}),
+      };
+      this.scopes.set(s.path, scope);
+      if (isMain) this.mainScope = scope;
+    }
+
+    // Span-bearing side-maps (spawnTurnSpansBy*, compactionSpansByTurn) are not
+    // persisted — see serialize() — so there is nothing to rehydrate for them,
+    // which is what keeps a resume from re-opening already-closed spans. Only the
+    // plain-string trigger map is restored.
+    for (const e of snapshot.compactionTriggerByTurn) {
+      this.compactionTriggerByTurn.set(e.key, e.value);
+    }
+  }
+
+  // Read and process new transcript records for every scope, retrying on a
+  // bounded interval, until no turn that has seen its Stop is still open (its
+  // task_complete can land seconds after the Stop hook) or the timeout elapses.
+  // Always does at least one read so a task_complete already on disk closes its
+  // turn immediately. Crucially, this does NOT wait for turns that are merely
+  // mid-progress (no Stop yet) — so an idle flush during an active turn returns
+  // promptly. On timeout, leaves whatever is still open for the backstops.
+  private async drainOpenTurns(): Promise<void> {
+    if (this.scopes.size === 0) return;
+    const deadline = Date.now() + FLUSH_TIMEOUT_MS;
+    for (;;) {
+      for (const scope of this.scopes.values()) {
+        // A finalized subagent scope is inert: re-reading it would re-open spans
+        // that were already closed (and the SDK would overwrite the closed spans
+        // with un-ended ones). Skip it.
+        if (scope.subagentEnded) continue;
+        try {
+          const { lines, offset } = this.transcriptReader.readFrom(scope.path, scope.offset);
+          scope.offset = offset;
+          this.consumeLines(scope, lines);
+        } catch (err) {
+          this.logger.error("codex processor: final catch-up read failed", {
+            queueId: this.queueId,
+            path: scope.path,
+            error: String(err),
+          });
+        }
+      }
+      // Only turns whose Stop has fired but whose task_complete hasn't arrived
+      // keep us waiting. (endTurnSpan removes them from both sets as they close.)
+      const pending = this.countPendingTurns();
+      if (pending === 0 || Date.now() >= deadline) {
+        if (pending > 0) {
+          this.logger.warn("codex processor: turns awaiting completion at flush timeout", {
+            queueId: this.queueId,
+            pending,
+          });
+        }
+        return;
+      }
+      await sleep(FLUSH_INTERVAL_MS);
+    }
+  }
 }
diff --git a/plugins/trace-codex/src/agents/codex/register.ts b/plugins/trace-codex/src/agents/codex/register.ts
index a905601..f4b194d 100644
--- a/plugins/trace-codex/src/agents/codex/register.ts
+++ b/plugins/trace-codex/src/agents/codex/register.ts
@@ -3,10 +3,12 @@
 // another agent means adding a sibling module that exports the same shape.
 
 import type { EventBuilder } from "../../client/client.ts";
+import type { Logger } from "../../log.ts";
 import type { EventProcessorFactory } from "../../processor/event-processor.ts";
 import { buildEnqueueEvents, CODEX_EVENT_SOURCE } from "./event-builder.ts";
 import { CodexEventProcessor } from "./event-processor.ts";
 import { applySettingsToEnv, loadSettingsFile, settingsFilePath } from "./settings.ts";
+import { SnapshotStore } from "./snapshot-store.ts";
 
 export interface Agent {
   /** Event source string this agent's events carry (matches the registry key). */
@@ -17,9 +19,11 @@ export interface Agent {
   buildEvents: EventBuilder;
   /**
    * Event names that terminate a turn/session. After enqueuing one of these,
-   * the hook asks the server to flush synchronously so the final spans are
-   * delivered before the process tree is torn down (e.g. in CI). Codex only
-   * exposes a per-turn "Stop" hook today; there is no session-end event.
+   * the hook asks the server to flush. By default the flush is fire-and-forget
+   * so the turn isn't stalled; setting BRAINTRUST_FLUSH_ON_TURN_END makes the
+   * hook block until the server confirms the final spans are delivered before
+   * the process tree is torn down (e.g. in CI). Codex only exposes a per-turn
+   * "Stop" hook today; there is no session-end event.
    */
   terminalEvents: string[];
   /**
@@ -31,10 +35,28 @@ export interface Agent {
   loadSettings: () => string[];
 }
 
+// One shared snapshot store for all Codex sessions, created on first use. It
+// persists each session's resumable state under PLUGIN_DATA/state so a restarted
+// server can rehydrate in-progress traces. Created lazily (reusing the first
+// session's logger) so a one-time GC of stale orphan snapshots runs the first
+// time we actually trace, and so this module stays side-effect-free on import.
+let sharedSnapshotStore: SnapshotStore | null = null;
+function snapshotStore(logger: Logger): SnapshotStore {
+  if (sharedSnapshotStore === null) {
+    sharedSnapshotStore = new SnapshotStore(logger);
+    // Reclaim stale snapshots. Snapshots intentionally outlive their root span
+    // (later turns resume onto the same root), so they are never self-deleted on
+    // root-end; the only cleanup is this age-based sweep of ones that never
+    // resumed.
+    sharedSnapshotStore.gcOlderThan();
+  }
+  return sharedSnapshotStore;
+}
+
 export const codexAgent: Agent = {
   eventSource: CODEX_EVENT_SOURCE,
   createProcessor: (queueId, logger, spanFactoryProvider) =>
-    new CodexEventProcessor(queueId, logger, spanFactoryProvider),
+    new CodexEventProcessor(queueId, logger, spanFactoryProvider, undefined, snapshotStore(logger)),
   buildEvents: buildEnqueueEvents,
   terminalEvents: ["Stop"],
   loadSettings: () => applySettingsToEnv(loadSettingsFile(settingsFilePath())),
diff --git a/plugins/trace-codex/src/agents/codex/resume.test.ts b/plugins/trace-codex/src/agents/codex/resume.test.ts
new file mode 100644
index 0000000..488bee4
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/resume.test.ts
@@ -0,0 +1,671 @@
+// Resume / state-persistence tests.
+//
+// When the event server shuts down mid-session (idle timeout) or the user closes
+// and later resumes, a fresh CodexEventProcessor is created for the same session
+// id. These tests verify that, given a persisted snapshot, the new processor
+// rehydrates the original spans and continues the SAME trace — producing the
+// same result as if one processor had handled the whole session — instead of
+// duplicating or orphaning spans.
+
+import { describe, expect, test } from "bun:test";
+import type { SpanFactory } from "../../braintrust/logger.ts";
+import {
+  type CapturedSpan,
+  createTestLogger,
+  diffSpan,
+  type ExpectedSpan,
+  spansToTree,
+  withCapturedTrace,
+} from "../../test-helpers.ts";
+import { CodexEventProcessor } from "./event-processor.ts";
+import { SnapshotStore } from "./snapshot-store.ts";
+import type { CodexSnapshot } from "./state-snapshot.ts";
+import {
+  compacted,
+  configEvent,
+  FakeTranscriptReader,
+  functionCall,
+  functionCallOutput,
+  preToolUse,
+  sessionMeta,
+  sessionStart,
+  stop,
+  type TraceEntry,
+  taskComplete,
+  taskStarted,
+  tokenCount,
+  transcript,
+  turnContext,
+  userMessage,
+} from "./test-helpers.ts";
+
+/** An in-memory snapshot store, so tests don't touch disk. Mirrors the disk
+ * store's surface (the parts the processor uses). */
+class MemorySnapshotStore extends SnapshotStore {
+  private readonly mem = new Map<string, CodexSnapshot>();
+
+  constructor() {
+    super(createTestLogger(), { dir: "/dev/null/unused" });
+  }
+
+  override read(sessionId: string): CodexSnapshot | null {
+    const snap = this.mem.get(sessionId);
+    // Deep-clone so callers can't mutate stored state (matches disk JSON).
+    return snap ? (JSON.parse(JSON.stringify(snap)) as CodexSnapshot) : null;
+  }
+
+  override write(sessionId: string, snapshot: CodexSnapshot): void {
+    this.mem.set(sessionId, JSON.parse(JSON.stringify(snapshot)) as CodexSnapshot);
+  }
+
+  override delete(sessionId: string): void {
+    this.mem.delete(sessionId);
+  }
+
+  has(sessionId: string): boolean {
+    return this.mem.has(sessionId);
+  }
+}
+
+const isTranscriptEntry = (e: TraceEntry): e is ReturnType<typeof transcript> =>
+  (e as { kind?: string }).kind === "transcript";
+
+/**
+ * Run a list of entries (hooks + transcript writes) through a processor, then
+ * flush. Shares the reader, factory, and store across calls so a second
+ * processor can resume the first's session. Returns nothing; spans accumulate in
+ * the shared captured trace.
+ */
+async function runThrough(
+  entries: TraceEntry[],
+  deps: {
+    queueId: string;
+    reader: FakeTranscriptReader;
+    factory: SpanFactory;
+    store: SnapshotStore;
+  },
+): Promise<void> {
+  const processor = new CodexEventProcessor(
+    deps.queueId,
+    createTestLogger(),
+    () => deps.factory,
+    deps.reader,
+    deps.store,
+  );
+  for (const entry of entries) {
+    if (isTranscriptEntry(entry)) {
+      deps.reader.append(JSON.stringify(entry.record));
+    } else {
+      await processor.process(entry);
+    }
+  }
+  await processor.flush();
+}
+
+/** Assert a captured trace matches an expected tree. */
+function expectTrace(spans: CapturedSpan[], expected: ExpectedSpan): void {
+  const tree = spansToTree(spans);
+  const diffs = diffSpan(tree, expected, "root");
+  if (diffs.length > 0) {
+    throw new Error(`trace does not match expected:\n${diffs.join("\n")}`);
+  }
+}
+
+describe("CodexEventProcessor: resume from snapshot", () => {
+  // The realistic mid-session restart: the server idle-shuts-down DURING a turn
+  // (before its Stop), then the next hook arrives on a new processor. The
+  // resumed processor must rehydrate the open root + turn and finish them,
+  // yielding one trace with the turn closed exactly once (no duplicates).
+  test("an idle restart mid-turn rehydrates open spans and finishes them", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-midturn";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      // Run 1: open the session and a turn, but NO Stop yet. The SessionStart
+      // and a PreToolUse hook drive transcript catch-up (in production every
+      // hook does). An idle flush then persists the open state. The config
+      // carries an apiKey to prove it's never written to the snapshot.
+      await runThrough(
+        [
+          configEvent({ session_id: queueId, apiKey: "sk-secret-do-not-persist" }),
+          sessionMeta({ id: queueId, cwd: "/tmp/proj" }),
+          turnContext({ model: "gpt-5.5" }),
+          taskStarted({ turn_id: "turn-1" }),
+          userMessage({ message: "hello" }),
+          sessionStart({ session_id: queueId, source: "startup" }),
+          preToolUse({ session_id: queueId }),
+        ],
+        deps,
+      );
+
+      // Mid-session: the root and turn are open, so a snapshot exists.
+      expect(store.has(queueId)).toBe(true);
+      const snap = store.read(queueId);
+      expect(snap?.rootEnded).toBe(false);
+      expect(snap?.scopes[0]?.openTurns.length).toBe(1);
+      // The persisted snapshot must never contain the apiKey (secret).
+      expect(JSON.stringify(snap)).not.toContain("apiKey");
+
+      // Run 2: a NEW processor (server restarted). The first event isn't a
+      // SessionStart — it's the Stop closing the turn. The resumed processor
+      // must restore the open root+turn and finish them.
+      await runThrough(
+        [
+          // No config event this run: the resumed processor uses the snapshot's
+          // reporting config. (In production a resume DOES re-send config, but
+          // restore must work without it too.)
+          taskComplete({ turn_id: "turn-1", last_agent_message: "all done" }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      // The resumed Stop ended the root, but the snapshot is kept (more turns
+      // could still follow — Codex's Stop is per-turn, not per-session). It's
+      // updated to reflect the now-ended root.
+      expect(store.has(queueId)).toBe(true);
+      expect(store.read(queueId)?.rootEnded).toBe(true);
+
+      const spans = await trace.drain();
+      // Exactly one root, one turn — no duplicates from the restart.
+      expectTrace(spans, {
+        span_attributes: { name: "codex: proj", type: "task" },
+        metadata: { session_id: queueId },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: turn-1", type: "task" },
+            input: "hello",
+            output: "all done",
+            ended: true,
+            children: [],
+          },
+        ],
+      });
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // The /shutdown-between-turns case: turn 1 completes (its Stop fires, which
+  // ends the root), the server is shut down, then the user sends another message.
+  // The root ending on the first Stop must NOT discard the snapshot — turn 2 has
+  // to resume under the SAME root, not start a brand-new trace.
+  test("a turn sent after a between-turns shutdown resumes under the same root", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-multiturn";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      // Run 1: a full first turn, including its Stop (which ends the root span).
+      // Then the server shuts down (we stop using processor A after its flush).
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          sessionMeta({ id: queueId, cwd: "/tmp/proj" }),
+          turnContext({ model: "gpt-5.5" }),
+          taskStarted({ turn_id: "turn-1" }),
+          userMessage({ message: "first" }),
+          taskComplete({ turn_id: "turn-1", last_agent_message: "did first" }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      // The root ended on turn 1's Stop — but the snapshot must SURVIVE, since
+      // more turns can still come (Codex's Stop is per-turn, not per-session).
+      expect(store.has(queueId)).toBe(true);
+      expect(store.read(queueId)?.rootEnded).toBe(true);
+
+      // Run 2: a NEW processor (server restarted after /shutdown). A second turn
+      // arrives. It must attach under the SAME restored root, not a new one.
+      await runThrough(
+        [
+          taskStarted({ turn_id: "turn-2" }),
+          userMessage({ message: "second" }),
+          taskComplete({ turn_id: "turn-2", last_agent_message: "did second" }),
+          stop({ turn_id: "turn-2" }),
+        ],
+        deps,
+      );
+
+      const spans = await trace.drain();
+      // One root, two turns under it — a single continuous trace.
+      expectTrace(spans, {
+        span_attributes: { name: "codex: proj", type: "task" },
+        metadata: { session_id: queueId },
+        children: [
+          {
+            span_attributes: { name: "turn: turn-1", type: "task" },
+            output: "did first",
+            ended: true,
+            children: [],
+          },
+          {
+            span_attributes: { name: "turn: turn-2", type: "task" },
+            input: "second",
+            output: "did second",
+            ended: true,
+            children: [],
+          },
+        ],
+      });
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // A deeper hierarchy (turn -> tool) open across the restart must rehydrate with
+  // the correct span names (not SDK-inferred ones) and close cleanly.
+  test("an open tool span survives a restart and closes with its original name", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-tool";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      // Run 1: a turn opens and calls a tool (function_call), but the tool's
+      // output hasn't arrived yet when the server idle-shuts-down.
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          sessionMeta({ id: queueId, cwd: "/tmp/proj" }),
+          turnContext({ model: "gpt-5.5" }),
+          taskStarted({ turn_id: "turn-1" }),
+          userMessage({ message: "run a tool" }),
+          functionCall({ turn_id: "turn-1", call_id: "call-1", name: "shell" }),
+          sessionStart({ session_id: queueId, source: "startup" }),
+          preToolUse({ session_id: queueId }),
+        ],
+        deps,
+      );
+
+      const snap = store.read(queueId);
+      expect(snap?.scopes[0]?.openTools.length).toBe(1);
+      expect(snap?.scopes[0]?.openTools[0]?.span.name).toBe("shell");
+
+      // Run 2: a fresh processor. The tool output and turn completion arrive, then
+      // the Stop. The rehydrated tool span must close under the rehydrated turn.
+      await runThrough(
+        [
+          functionCallOutput({ call_id: "call-1", output: "ok" }),
+          taskComplete({ turn_id: "turn-1", last_agent_message: "tool done" }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      const spans = await trace.drain();
+      expectTrace(spans, {
+        span_attributes: { name: "codex: proj", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: turn-1", type: "task" },
+            output: "tool done",
+            ended: true,
+            children: [
+              // The model call that emitted the tool call opens an llm span.
+              { span_attributes: { name: "gpt-5.5", type: "llm" }, ended: true },
+              {
+                span_attributes: { name: "shell", type: "tool" },
+                output: "ok",
+                ended: true,
+                children: [],
+              },
+            ],
+          },
+        ],
+      });
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // The per-turn timing used to place LLM spans (turn start + last-child-end)
+  // must survive a restart, so an LLM call that opens AFTER the restart still
+  // starts at the right place (here: the turn's start, since it's the first
+  // child) rather than at its post-restart output record.
+  test("turn timing survives a restart so a resumed llm span starts correctly", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-timing";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      // Run 1: the turn opens at :11 but the model hasn't produced output yet when
+      // the server shuts down (no llm span exists yet — only the open turn).
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          transcript({
+            timestamp: "2026-01-01T00:00:10Z",
+            type: "session_meta",
+            payload: { id: queueId, cwd: "/tmp/proj" },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:10Z",
+            type: "turn_context",
+            payload: { model: "gpt-5.5" },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:11Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-1" },
+          }),
+          sessionStart({ session_id: queueId, source: "startup" }),
+          preToolUse({ session_id: queueId }),
+        ],
+        deps,
+      );
+
+      // The open turn's start time was persisted.
+      const snap = store.read(queueId);
+      expect(snap?.scopes[0]?.openTurns[0]?.turnId).toBe("turn-1");
+      expect(snap?.scopes[0]?.openTurns[0]?.startTime).toBe(1767225611);
+
+      // Run 2: a fresh processor. The model output finally lands (:18) and the
+      // call closes (:18). The resumed llm span must start at the turn's start
+      // (:11, restored), not at its output record (:18).
+      await runThrough(
+        [
+          transcript({
+            timestamp: "2026-01-01T00:00:18Z",
+            type: "response_item",
+            payload: {
+              type: "message",
+              role: "assistant",
+              content: [{ type: "output_text", text: "hi" }],
+            },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:18Z",
+            type: "event_msg",
+            payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:19Z",
+            type: "event_msg",
+            payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "hi" },
+          }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      const spans = await trace.drain();
+      const tree = spansToTree(spans);
+      const turn = tree?.children.find((c) => c.name === "turn: turn-1");
+      const llm = turn?.children.find((c) => c.name === "gpt-5.5" || c.name === "llm");
+      // Started at the restored turn start (:11), ended at token_count (:18).
+      expect(llm?.metrics?.start).toBe(1767225611);
+      expect(llm?.metrics?.end).toBe(1767225618);
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  test("a version-incompatible snapshot is discarded and the session starts fresh", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-badver";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+
+      // Plant a snapshot with an incompatible plugin version.
+      store.write(queueId, {
+        pluginVersion: "0.0.0-incompatible",
+        schemaVersion: 1,
+        sessionId: queueId,
+        savedAt: Date.now(),
+        reportingConfig: { traceToBraintrust: true },
+        rootSpan: { spanId: "stale", rootSpanId: "stale", spanParents: [] },
+        rootEnded: false,
+        rootEndTime: undefined,
+        rootEnrichment: {},
+        mainScopePath: "/old.jsonl",
+        scopes: [],
+        compactionTriggerByTurn: [],
+      });
+
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          sessionMeta({ id: queueId, cwd: "/tmp/fresh" }),
+          turnContext({ model: "gpt-5.5" }),
+          taskStarted({ turn_id: "turn-1" }),
+          userMessage({ message: "hi" }),
+          taskComplete({ turn_id: "turn-1", last_agent_message: "ok" }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      const spans = await trace.drain();
+      // A fresh trace was built (the stale snapshot's "stale" span id is absent),
+      // and the incompatible snapshot was removed.
+      expect(spans.some((s) => s.span_id === "stale")).toBe(false);
+      expectTrace(spans, {
+        span_attributes: { name: "codex: fresh", type: "task" },
+        children: [{ span_attributes: { name: "turn: turn-1" }, output: "ok", children: [] }],
+      });
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // Regression for the duplicate/never-closing compaction span: a compaction that
+  // FINISHED before the restart must not be re-opened on resume. (Previously the
+  // compaction span lived forever in a side-map and was rehydrated — re-emitting
+  // it with a fresh start and no end.)
+  test("a completed compaction is not re-opened on resume", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-compact";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      // Run 1: a turn, then a compaction turn that completes, all before shutdown.
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          transcript({
+            timestamp: "2026-01-01T00:00:10Z",
+            type: "session_meta",
+            payload: { id: queueId, cwd: "/tmp/proj" },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:10Z",
+            type: "turn_context",
+            payload: { model: "gpt-5.5" },
+          }),
+          // A normal turn.
+          transcript({
+            timestamp: "2026-01-01T00:00:11Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-1" },
+          }),
+          userMessage({ message: "hi" }),
+          taskComplete({ turn_id: "turn-1", last_agent_message: "hello" }),
+          stop({ turn_id: "turn-1" }),
+          // A compaction turn that fully completes.
+          transcript({
+            timestamp: "2026-01-01T00:00:20Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-2" },
+          }),
+          compacted({
+            replacement_history: [{ type: "message", role: "user", content: [] }],
+            window_id: 1,
+          }),
+          tokenCount({ total_tokens: 5 }),
+          transcript({
+            timestamp: "2026-01-01T00:00:22Z",
+            type: "event_msg",
+            payload: { type: "task_complete", turn_id: "turn-2", last_agent_message: "" },
+          }),
+          // A non-terminal hook drives the catch-up that consumes the above.
+          preToolUse({ session_id: queueId }),
+        ],
+        deps,
+      );
+
+      // The compaction span finished in run 1.
+      let spans = await trace.drain();
+      const compaction1 = spans.find((s) => s.span_attributes?.name === "compaction");
+      expect(compaction1?.metrics?.end).toBeDefined();
+      const compactionId = compaction1?.span_id;
+
+      // Run 2: a new processor resumes and runs another turn.
+      await runThrough(
+        [
+          transcript({
+            timestamp: "2026-01-01T00:00:30Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-3" },
+          }),
+          userMessage({ message: "again" }),
+          taskComplete({ turn_id: "turn-3", last_agent_message: "ok" }),
+          stop({ turn_id: "turn-3" }),
+        ],
+        deps,
+      );
+
+      spans = await trace.drain();
+      // The compaction span must NOT have been re-emitted (reopened) on resume:
+      // run 2 produces no row for its span_id.
+      expect(spans.some((s) => s.span_id === compactionId)).toBe(false);
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // The root span, ended before a restart, must keep its original start AND end
+  // after resume — rehydrating it re-emits its row, so we re-assert the end.
+  test("a root ended before a restart keeps its original start and end", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-rootend";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      // Run 1: a full turn; its Stop ends the root at the turn's end (:13).
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          transcript({
+            timestamp: "2026-01-01T00:00:10Z",
+            type: "session_meta",
+            payload: { id: queueId, cwd: "/tmp/proj" },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:11Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-1" },
+          }),
+          userMessage({ message: "hi" }),
+          transcript({
+            timestamp: "2026-01-01T00:00:13Z",
+            type: "event_msg",
+            payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "hello" },
+          }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      const snap = store.read(queueId);
+      expect(snap?.rootEnded).toBe(true);
+      expect(snap?.rootEndTime).toBe(1767225613);
+
+      // Run 2: a new processor resumes (a second turn arrives).
+      await runThrough(
+        [
+          transcript({
+            timestamp: "2026-01-01T00:00:20Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-2" },
+          }),
+          userMessage({ message: "again" }),
+          taskComplete({ turn_id: "turn-2", last_agent_message: "ok" }),
+          stop({ turn_id: "turn-2" }),
+        ],
+        deps,
+      );
+
+      const spans = await trace.drain();
+      const tree = spansToTree(spans);
+      // The root kept its original start (:10) and end (:13) — not corrupted to
+      // the resume time, not left open.
+      expect(tree?.metrics?.start).toBe(1767225610);
+      expect(tree?.metrics?.end).toBe(1767225613);
+      // The second turn still attached under the same root.
+      expect(tree?.children.some((c) => c.name === "turn: turn-2")).toBe(true);
+    } finally {
+      trace.cleanup();
+    }
+  });
+
+  // An OPEN span spanning the restart must keep its original start (the SDK would
+  // otherwise stamp the rehydration time). Here the turn opens at :11 in run 1
+  // and is closed in run 2.
+  test("an open turn span keeps its original start across a restart", async () => {
+    const trace = withCapturedTrace();
+    try {
+      const queueId = "sess-openstart";
+      const reader = new FakeTranscriptReader();
+      const store = new MemorySnapshotStore();
+      const deps = { queueId, reader, factory: trace.spanFactory, store };
+
+      await runThrough(
+        [
+          configEvent({ session_id: queueId }),
+          transcript({
+            timestamp: "2026-01-01T00:00:10Z",
+            type: "session_meta",
+            payload: { id: queueId, cwd: "/tmp/proj" },
+          }),
+          transcript({
+            timestamp: "2026-01-01T00:00:11Z",
+            type: "event_msg",
+            payload: { type: "task_started", turn_id: "turn-1" },
+          }),
+          userMessage({ message: "hi" }),
+          sessionStart({ session_id: queueId, source: "startup" }),
+          preToolUse({ session_id: queueId }),
+        ],
+        deps,
+      );
+
+      await runThrough(
+        [
+          transcript({
+            timestamp: "2026-01-01T00:00:30Z",
+            type: "event_msg",
+            payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "done" },
+          }),
+          stop({ turn_id: "turn-1" }),
+        ],
+        deps,
+      );
+
+      const tree = spansToTree(await trace.drain());
+      const turn = tree?.children.find((c) => c.name === "turn: turn-1");
+      // Start preserved from run 1 (:11), end from run 2 (:30) — not the resume time.
+      expect(turn?.metrics?.start).toBe(1767225611);
+      expect(turn?.metrics?.end).toBe(1767225630);
+    } finally {
+      trace.cleanup();
+    }
+  });
+});
diff --git a/plugins/trace-codex/src/agents/codex/settings.test.ts b/plugins/trace-codex/src/agents/codex/settings.test.ts
index 1ace67b..068816c 100644
--- a/plugins/trace-codex/src/agents/codex/settings.test.ts
+++ b/plugins/trace-codex/src/agents/codex/settings.test.ts
@@ -70,6 +70,16 @@ describe("loadSettingsFile", () => {
     expect(loadSettingsFile(path)).toEqual({});
   });
 
+  test("parses flushOnTurnEnd boolean; ignores non-boolean", () => {
+    expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: true })))).toEqual({
+      flushOnTurnEnd: true,
+    });
+    expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: false })))).toEqual({
+      flushOnTurnEnd: false,
+    });
+    expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: "yes" })))).toEqual({});
+  });
+
   test("missing file returns {}", () => {
     expect(loadSettingsFile(join(dir, "absent.json"))).toEqual({});
   });
@@ -125,6 +135,13 @@ describe("applySettingsToEnv", () => {
     expect(env.TRACE_TO_BRAINTRUST).toBe("true");
     expect(env.BRAINTRUST_ADDITIONAL_METADATA).toBe('{"team":"platform"}');
   });
+
+  test("maps flushOnTurnEnd to BRAINTRUST_FLUSH_ON_TURN_END", () => {
+    const env: NodeJS.ProcessEnv = {};
+    const applied = applySettingsToEnv({ flushOnTurnEnd: true }, env);
+    expect(env.BRAINTRUST_FLUSH_ON_TURN_END).toBe("true");
+    expect(applied).toEqual(["flushOnTurnEnd"]);
+  });
 });
 
 describe("settingsFilePath", () => {
diff --git a/plugins/trace-codex/src/agents/codex/settings.ts b/plugins/trace-codex/src/agents/codex/settings.ts
index f1cc7f0..3373539 100644
--- a/plugins/trace-codex/src/agents/codex/settings.ts
+++ b/plugins/trace-codex/src/agents/codex/settings.ts
@@ -25,6 +25,13 @@ export interface Settings {
   project?: string;
   /** Master switch: when false or unset, no traces are reported to Braintrust. */
   traceToBraintrust?: boolean;
+  /**
+   * When true, the hook blocks at each turn's end (the Stop event) until the
+   * server confirms all events are processed and spans are flushed. Useful in
+   * programmatic/CI runs to guarantee traces are delivered before Codex exits.
+   * Defaults to false (fire-and-forget flush so the turn isn't stalled).
+   */
+  flushOnTurnEnd?: boolean;
   /** Extra metadata merged into the root span (standard keys win on conflict). */
   additionalMetadata?: Record<string, unknown>;
   /** If set, record every event to this NDJSON file (for replay). */
@@ -44,6 +51,7 @@ export const SETTINGS_TO_ENV: Record<keyof Settings, string> = {
   appUrl: "BRAINTRUST_APP_URL",
   project: "BRAINTRUST_PROJECT",
   traceToBraintrust: "TRACE_TO_BRAINTRUST",
+  flushOnTurnEnd: "BRAINTRUST_FLUSH_ON_TURN_END",
   additionalMetadata: "BRAINTRUST_ADDITIONAL_METADATA",
   recordFile: "BRAINTRUST_EVENT_SERVER_RECORD_FILE",
   port: "BRAINTRUST_EVENT_SERVER_PORT",
@@ -55,6 +63,8 @@ const SETTING_KEYS = Object.keys(SETTINGS_TO_ENV) as Array<keyof Settings>;
 
 const NUMBER_KEYS = new Set<keyof Settings>(["port", "idleTimeoutMs", "idleCheckIntervalMs"]);
 
+const BOOLEAN_KEYS = new Set<keyof Settings>(["traceToBraintrust", "flushOnTurnEnd"]);
+
 /**
  * The plugin's writable data directory, where config.json lives. Resolved
  * independently of the server Config layer so settings detection stays its own
@@ -104,8 +114,8 @@ export function loadSettingsFile(path: string): Settings {
       if (typeof value === "number" && Number.isFinite(value)) {
         (settings as Record<string, unknown>)[key] = value;
       }
-    } else if (key === "traceToBraintrust") {
-      if (typeof value === "boolean") settings.traceToBraintrust = value;
+    } else if (BOOLEAN_KEYS.has(key)) {
+      if (typeof value === "boolean") (settings as Record<string, unknown>)[key] = value;
     } else if (key === "additionalMetadata") {
       if (typeof value === "object" && !Array.isArray(value)) {
         settings.additionalMetadata = value as Record<string, unknown>;
diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts
new file mode 100644
index 0000000..2a85a85
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts
@@ -0,0 +1,103 @@
+import { afterEach, beforeEach, describe, expect, test } from "bun:test";
+import { mkdtempSync, rmSync, utimesSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { createTestLogger } from "../../test-helpers.ts";
+import { SnapshotStore } from "./snapshot-store.ts";
+import type { CodexSnapshot } from "./state-snapshot.ts";
+
+function makeSnapshot(sessionId: string): CodexSnapshot {
+  return {
+    pluginVersion: "9.9.9",
+    schemaVersion: 1,
+    sessionId,
+    savedAt: Date.now(),
+    reportingConfig: { project: "p", traceToBraintrust: true },
+    rootSpan: { spanId: "r", rootSpanId: "r", spanParents: [] },
+    rootEnded: false,
+    rootEndTime: undefined,
+    rootEnrichment: { source: "startup" },
+    mainScopePath: "/t.jsonl",
+    scopes: [],
+    compactionTriggerByTurn: [],
+  };
+}
+
+describe("SnapshotStore", () => {
+  let dir: string;
+  let store: SnapshotStore;
+
+  beforeEach(() => {
+    dir = mkdtempSync(join(tmpdir(), "snap-store-"));
+    store = new SnapshotStore(createTestLogger(), { dir });
+  });
+
+  afterEach(() => {
+    rmSync(dir, { recursive: true, force: true });
+  });
+
+  test("read returns null when no snapshot exists", () => {
+    expect(store.read("missing")).toBeNull();
+  });
+
+  test("write then read round-trips a snapshot", () => {
+    const snap = makeSnapshot("sess-1");
+    store.write("sess-1", snap);
+    const loaded = store.read("sess-1");
+    expect(loaded).toEqual(snap);
+  });
+
+  test("write is isolated per session id", () => {
+    store.write("a", makeSnapshot("a"));
+    store.write("b", makeSnapshot("b"));
+    expect(store.read("a")?.sessionId).toBe("a");
+    expect(store.read("b")?.sessionId).toBe("b");
+  });
+
+  test("delete removes a snapshot", () => {
+    store.write("sess-1", makeSnapshot("sess-1"));
+    store.delete("sess-1");
+    expect(store.read("sess-1")).toBeNull();
+  });
+
+  test("delete of a missing snapshot does not throw", () => {
+    expect(() => store.delete("nope")).not.toThrow();
+  });
+
+  test("read of a malformed snapshot file returns null (never throws)", () => {
+    store.write("bad", makeSnapshot("bad"));
+    // Corrupt the file on disk.
+    writeFileSync(join(dir, "state", "bad.json"), "{ not json");
+    expect(store.read("bad")).toBeNull();
+  });
+
+  test("a session id with path separators can't escape the state dir", () => {
+    const evil = "../../escape";
+    store.write(evil, makeSnapshot(evil));
+    // It round-trips by the same (sanitized) key...
+    expect(store.read(evil)?.sessionId).toBe(evil);
+    // ...and never created anything outside the state dir.
+    expect(() => rmSync(join(dir, "state"), { recursive: true })).not.toThrow();
+  });
+
+  test("gcOlderThan removes snapshots older than the ttl but keeps fresh ones", () => {
+    store.write("old", makeSnapshot("old"));
+    store.write("fresh", makeSnapshot("fresh"));
+    // Backdate "old" two hours.
+    const oldPath = join(dir, "state", "old.json");
+    const twoHoursAgo = new Date(Date.now() - 2 * 60 * 60 * 1000);
+    utimesSync(oldPath, twoHoursAgo, twoHoursAgo);
+
+    store.gcOlderThan(60 * 60 * 1000); // 1 hour ttl
+
+    expect(store.read("old")).toBeNull();
+    expect(store.read("fresh")?.sessionId).toBe("fresh");
+  });
+
+  test("gcOlderThan on a non-existent state dir does not throw", () => {
+    const empty = new SnapshotStore(createTestLogger(), {
+      dir: join(dir, "does-not-exist"),
+    });
+    expect(() => empty.gcOlderThan()).not.toThrow();
+  });
+});
diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.ts
new file mode 100644
index 0000000..625e23d
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/snapshot-store.ts
@@ -0,0 +1,161 @@
+// Per-session snapshot store: persists each Codex session's resumable processor
+// state to one JSON file under the plugin's writable data directory, so a
+// restarted server can rehydrate the session's in-progress trace.
+//
+// This is Codex-owned (not part of the generic event server): *where* a plugin
+// may persist state is the coding agent's call. Codex hands plugins a writable
+// PLUGIN_DATA directory; that's our base. Each agent that wants resume support
+// brings its own store with whatever location rules its host imposes.
+//
+// Like the event recorder and the span logger, this never throws: a persistence
+// failure must not be able to break a Codex turn. Every operation is wrapped and
+// failures are logged and swallowed. Writes are atomic (temp file + rename) so a
+// crash mid-write can't leave a half-written snapshot that fails to parse.
+
+import {
+  mkdirSync,
+  readdirSync,
+  readFileSync,
+  renameSync,
+  rmSync,
+  statSync,
+  writeFileSync,
+} from "node:fs";
+import { join } from "node:path";
+import type { Logger } from "../../log.ts";
+import type { CodexSnapshot } from "./state-snapshot.ts";
+
+/** Subdirectory of the data dir that holds per-session snapshots. */
+const STATE_SUBDIR = "state";
+
+/** Default age after which an orphaned snapshot is garbage-collected (7 days).
+ * Orphans happen when a server crashes before a session's root span ends (which
+ * is when the snapshot is normally deleted). */
+export const DEFAULT_SNAPSHOT_TTL_MS = 7 * 24 * 60 * 60 * 1000;
+
+/**
+ * Resolve the plugin's writable data directory. Precedence matches the rest of
+ * the plugin (server Config + settings): explicit override, then Codex's
+ * PLUGIN_DATA, then a temp fallback so the binary still runs standalone.
+ */
+function dataDir(env: NodeJS.ProcessEnv): string {
+  return (
+    env.BRAINTRUST_EVENT_SERVER_LOG_DIR ||
+    env.PLUGIN_DATA ||
+    `${env.TMPDIR || "/tmp"}/braintrust-event-server`
+  );
+}
+
+/**
+ * Map a session id to its snapshot filename. Session ids are UUIDs, but sanitize
+ * defensively so a stray id can never escape the state directory.
+ */
+function snapshotFileName(sessionId: string): string {
+  const safe = sessionId.replace(/[^A-Za-z0-9._-]/g, "_");
+  return `${safe}.json`;
+}
+
+export interface SnapshotStoreOptions {
+  /** Override the data dir base. Defaults to the resolved PLUGIN_DATA. */
+  dir?: string;
+  env?: NodeJS.ProcessEnv;
+}
+
+/**
+ * Reads and writes per-session {@link CodexSnapshot}s under
+ * `<dataDir>/state/<sessionId>.json`. All methods are best-effort and never
+ * throw.
+ */
+export class SnapshotStore {
+  private readonly dir: string;
+  private readonly logger: Logger;
+
+  constructor(logger: Logger, options: SnapshotStoreOptions = {}) {
+    this.logger = logger;
+    const base = options.dir ?? dataDir(options.env ?? process.env);
+    this.dir = join(base, STATE_SUBDIR);
+  }
+
+  private pathFor(sessionId: string): string {
+    return join(this.dir, snapshotFileName(sessionId));
+  }
+
+  /** Load a session's snapshot, or null if absent/unreadable/malformed. */
+  read(sessionId: string): CodexSnapshot | null {
+    let raw: string;
+    try {
+      raw = readFileSync(this.pathFor(sessionId), "utf8");
+    } catch {
+      return null; // absent is the common case
+    }
+    try {
+      return JSON.parse(raw) as CodexSnapshot;
+    } catch (err) {
+      this.logger.warn("snapshot store: malformed snapshot; ignoring", {
+        sessionId,
+        error: String(err),
+      });
+      return null;
+    }
+  }
+
+  /** Persist a session's snapshot atomically. Never throws. */
+  write(sessionId: string, snapshot: CodexSnapshot): void {
+    const finalPath = this.pathFor(sessionId);
+    const tmpPath = `${finalPath}.${process.pid}.tmp`;
+    try {
+      mkdirSync(this.dir, { recursive: true });
+      writeFileSync(tmpPath, JSON.stringify(snapshot));
+      renameSync(tmpPath, finalPath);
+    } catch (err) {
+      this.logger.error("snapshot store: write failed", {
+        sessionId,
+        error: String(err),
+      });
+      // Best-effort cleanup of the temp file so it doesn't linger.
+      try {
+        rmSync(tmpPath, { force: true });
+      } catch {
+        // ignore
+      }
+    }
+  }
+
+  /** Delete a session's snapshot (e.g. once its root span has ended). */
+  delete(sessionId: string): void {
+    try {
+      rmSync(this.pathFor(sessionId), { force: true });
+    } catch (err) {
+      this.logger.error("snapshot store: delete failed", {
+        sessionId,
+        error: String(err),
+      });
+    }
+  }
+
+  /**
+   * Remove snapshots older than `ttlMs` (by file mtime). Called at server
+   * startup to clean up orphans left by a previous crash. Never throws.
+   */
+  gcOlderThan(ttlMs: number = DEFAULT_SNAPSHOT_TTL_MS): void {
+    let entries: string[];
+    try {
+      entries = readdirSync(this.dir);
+    } catch {
+      return; // no state dir yet: nothing to GC
+    }
+    const cutoff = Date.now() - ttlMs;
+    for (const name of entries) {
+      if (!name.endsWith(".json")) continue;
+      const full = join(this.dir, name);
+      try {
+        if (statSync(full).mtimeMs < cutoff) {
+          rmSync(full, { force: true });
+          this.logger.debug("snapshot store: gc removed stale snapshot", { file: name });
+        }
+      } catch (err) {
+        this.logger.debug("snapshot store: gc skip", { file: name, error: String(err) });
+      }
+    }
+  }
+}
diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts
new file mode 100644
index 0000000..5498993
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts
@@ -0,0 +1,62 @@
+import { describe, expect, test } from "bun:test";
+import {
+  type CodexSnapshot,
+  isCompatibleSnapshot,
+  redactReportingConfig,
+  SNAPSHOT_SCHEMA_VERSION,
+} from "./state-snapshot.ts";
+
+function snapshot(overrides: Partial<CodexSnapshot> = {}): CodexSnapshot {
+  return {
+    pluginVersion: "1.2.3",
+    schemaVersion: SNAPSHOT_SCHEMA_VERSION,
+    sessionId: "s",
+    savedAt: 0,
+    reportingConfig: undefined,
+    rootSpan: null,
+    rootEnded: false,
+    rootEndTime: undefined,
+    rootEnrichment: {},
+    mainScopePath: null,
+    scopes: [],
+    compactionTriggerByTurn: [],
+    ...overrides,
+  };
+}
+
+describe("isCompatibleSnapshot", () => {
+  test("accepts a matching plugin + schema version", () => {
+    expect(isCompatibleSnapshot(snapshot(), "1.2.3")).toBe(true);
+  });
+
+  test("rejects a mismatched plugin version", () => {
+    expect(isCompatibleSnapshot(snapshot({ pluginVersion: "9.9.9" }), "1.2.3")).toBe(false);
+  });
+
+  test("rejects a mismatched schema version", () => {
+    expect(
+      isCompatibleSnapshot(snapshot({ schemaVersion: SNAPSHOT_SCHEMA_VERSION + 1 }), "1.2.3"),
+    ).toBe(false);
+  });
+});
+
+describe("redactReportingConfig", () => {
+  test("strips the apiKey but keeps everything else", () => {
+    const redacted = redactReportingConfig({
+      project: "p",
+      apiKey: "sk-secret",
+      apiUrl: "https://api",
+      traceToBraintrust: true,
+    });
+    expect(redacted).toEqual({
+      project: "p",
+      apiUrl: "https://api",
+      traceToBraintrust: true,
+    });
+    expect("apiKey" in (redacted ?? {})).toBe(false);
+  });
+
+  test("returns undefined for an undefined config", () => {
+    expect(redactReportingConfig(undefined)).toBeUndefined();
+  });
+});
diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.ts
new file mode 100644
index 0000000..0564ee4
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/state-snapshot.ts
@@ -0,0 +1,152 @@
+// Serializable snapshot of a CodexEventProcessor's resumable state.
+//
+// The processor builds a session's trace incrementally, holding all its
+// bookkeeping in memory: where it is in each transcript file (byte offsets),
+// which spans are still open (and their identities), the reconstructed
+// conversation history used to shape LLM-call inputs, and a few side-maps that
+// reconcile hook-only data with the transcript. The rollout transcript on disk
+// is the durable source of truth for span *content*; this snapshot captures only
+// the bookkeeping needed to pick up exactly where a previous run left off.
+//
+// The server is short-lived: it shuts down after an idle window (default 5 min)
+// or when the user closes the session, and a later hook (a "resume", or just the
+// next turn) spins up a fresh processor for the same session id. Without a
+// snapshot that processor would re-read the transcript from zero with brand-new
+// span ids, producing duplicate/orphaned spans. With one, it rehydrates the
+// original spans (via {@link SpanFactory.rehydrateSpan}) and continues, so every
+// further log()/end() merges into the same trace.
+//
+// This module defines the snapshot *shape* and pure mapping helpers only; the
+// processor owns turning its live state (which holds Span handles) into a
+// snapshot and back, since that needs the span factory. Span handles are stored
+// here as {@link SpanRef} identities (span_id/root_span_id/parents), captured
+// from the synchronous getters on a live span.
+
+import type { ReportingConfig, SpanRef } from "../../braintrust/logger.ts";
+
+/**
+ * Bumped whenever the snapshot shape changes incompatibly. A snapshot whose
+ * schemaVersion doesn't match is discarded on load (the session simply starts
+ * fresh, losing only resume continuity), so we never try to rehydrate state we
+ * can't interpret.
+ */
+export const SNAPSHOT_SCHEMA_VERSION = 4;
+
+/** A conversation item (chat message or reasoning), stored verbatim. These are
+ * plain JSON already, so they round-trip without transformation. */
+export type ConversationItemSnapshot = Record<string, unknown>;
+
+/** An open LLM span and the output items accumulated for it so far. */
+export interface OpenLlmSnapshot {
+  span: SpanRef;
+  turnId: string | undefined;
+  output: ConversationItemSnapshot[];
+  /** Time (Unix seconds) of the call's last model-output item; the span's end. */
+  lastOutputTime: number | undefined;
+  outputPreset?: boolean;
+}
+
+/** An open tool span and the turn it belongs to (keyed by call_id in the map). */
+export interface OpenToolSnapshot {
+  callId: string;
+  span: SpanRef;
+  turnId: string;
+}
+
+/** An open turn span (keyed by turn_id in the map), plus the timing used to
+ * place its child LLM spans. */
+export interface OpenTurnSnapshot {
+  turnId: string;
+  span: SpanRef;
+  /** Turn start (Unix seconds), used as the first child's start time. */
+  startTime: number | undefined;
+  /** Max end time (Unix seconds) among the turn's already-ended children. */
+  lastChildEndTime: number | undefined;
+}
+
+/** Pending-subagent info captured before its root span exists. The parent turn
+ * span is referenced by identity so it can be rehydrated independently. */
+export interface PendingSubagentSnapshot {
+  agentId: string;
+  agentType: string | undefined;
+  parent: SpanRef;
+}
+
+/** One transcript file's parsing state. Mirrors TranscriptScope, with Span
+ * handles replaced by SpanRef identities. */
+export interface ScopeSnapshot {
+  path: string;
+  kind: "main" | "subagent";
+  offset: number;
+  openTurns: OpenTurnSnapshot[];
+  turnsAwaitingCompletion: string[];
+  conversationHistory: ConversationItemSnapshot[];
+  openLlm: OpenLlmSnapshot | null;
+  openTools: OpenToolSnapshot[];
+  model: string | undefined;
+  lastTurnEndTime: number | undefined;
+  subagentRootSpan: SpanRef | null;
+  subagentEnded: boolean;
+  pendingSubagent?: PendingSubagentSnapshot;
+}
+
+/** A side-map entry pairing a key with a string value. */
+export interface StringMapEntry {
+  key: string;
+  value: string;
+}
+
+/** The full resumable state of a CodexEventProcessor. */
+export interface CodexSnapshot {
+  /** Plugin version that wrote this snapshot; discarded on mismatch. */
+  pluginVersion: string;
+  /** Snapshot shape version; discarded on mismatch. */
+  schemaVersion: number;
+  /** Session id (queueId) this snapshot belongs to. */
+  sessionId: string | null;
+  /** Wall-clock ms when written, for age-based GC of orphans. */
+  savedAt: number;
+
+  /**
+   * Reporting config, minus secrets. `apiKey` is intentionally never persisted
+   * (re-resolved from the env / config event on resume), mirroring how the event
+   * recorder redacts it.
+   */
+  reportingConfig: Omit<ReportingConfig, "apiKey"> | undefined;
+
+  rootSpan: SpanRef | null;
+  rootEnded: boolean;
+  /** When the root was ended (Unix seconds), if it has been; re-asserted on
+   * resume so the rehydrated root row stays closed. */
+  rootEndTime: number | undefined;
+  rootEnrichment: { source?: string; permissionMode?: string };
+
+  /** Path of the main scope within `scopes`, if it exists. */
+  mainScopePath: string | null;
+  scopes: ScopeSnapshot[];
+
+  /**
+   * Compaction triggers ("manual"/"auto") keyed by turn_id. Plain strings (no
+   * span references), so safe to carry across a restart. The span-bearing
+   * side-maps (spawn-agent turn spans, compaction spans) are deliberately NOT
+   * persisted: they hold references to spans that may have closed, and
+   * rehydrating a closed span re-opens it.
+   */
+  compactionTriggerByTurn: StringMapEntry[];
+}
+
+/** A parsed snapshot is usable only if it matches our plugin + schema version. */
+export function isCompatibleSnapshot(snapshot: CodexSnapshot, pluginVersion: string): boolean {
+  return (
+    snapshot.schemaVersion === SNAPSHOT_SCHEMA_VERSION && snapshot.pluginVersion === pluginVersion
+  );
+}
+
+/** Strip the apiKey from a reporting config before persisting it. */
+export function redactReportingConfig(
+  config: ReportingConfig | undefined,
+): Omit<ReportingConfig, "apiKey"> | undefined {
+  if (config === undefined) return undefined;
+  const { apiKey: _apiKey, ...rest } = config;
+  return rest;
+}
diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts
index eaa0e57..e85d729 100644
--- a/plugins/trace-codex/src/agents/codex/test-helpers.ts
+++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts
@@ -1,7 +1,13 @@
-// Codex-specific test helpers: event builders that mimic Codex hook payloads,
-// and assertProducesTrace, which runs events through the CodexEventProcessor and
-// checks the resulting Braintrust trace. Generic trace/span helpers are imported
-// from the shared test-helpers.
+// Codex-specific test helpers. Spans are built from transcript records; hooks
+// drive lifecycle (config, root enrichment, the Stop sentinel). A test expresses
+// both sources in one ordered list. transcript(...) (and the typed builders that
+// wrap it) represent lines being appended to the transcript file; hook builders
+// (sessionStart, stop, configEvent) represent hook events arriving.
+//
+// Semantics: a transcript entry buffers a line. The NEXT hook triggers a
+// catch-up that reads all buffered-but-unread lines, then handles the hook —
+// modeling how the real processor catches up on each hook. A Stop hook waits for
+// the turn's task_complete to be present before catching up.
 
 import type { EnqueueEvent } from "../../server/routes.ts";
 import {
@@ -13,32 +19,68 @@ import {
 } from "../../test-helpers.ts";
 import { CODEX_CONFIG_EVENT } from "./event-builder.ts";
 import { CodexEventProcessor } from "./event-processor.ts";
+import type {
+  TranscriptReader,
+  TranscriptReadResult,
+  TranscriptWaitResult,
+  WaitForOptions,
+} from "./transcript-reader.ts";
+
+/** Synthetic transcript path attached to hook events so catch-up reads run. */
+export const TEST_TRANSCRIPT_PATH = "/test/transcript.jsonl";
+
+/** Monotonic default timestamp so transcript records sort in creation order. */
+let tsCounter = 0;
+function nextTimestamp(): string {
+  tsCounter += 1;
+  return new Date(Date.UTC(2026, 0, 1, 0, 0, tsCounter)).toISOString();
+}
 
 // ============================================================================
-// Event builders
+// Hook event builders
 // ============================================================================
 
-function codexEvent(eventName: string, eventData: Record<string, unknown>): EnqueueEvent {
+function codexHook(eventName: string, eventData: Record<string, unknown>): EnqueueEvent {
   const queueId = typeof eventData.session_id === "string" ? eventData.session_id : "session-1";
   return {
     queueId,
     eventSource: "codex-hook",
     eventSourceVersion: null,
     eventName,
-    eventData: { hook_event_name: eventName, ...eventData },
+    eventData: { hook_event_name: eventName, transcript_path: TEST_TRANSCRIPT_PATH, ...eventData },
   };
 }
 
 export function sessionStart(data: Record<string, unknown> = {}): EnqueueEvent {
-  return codexEvent("SessionStart", { session_id: "session-1", ...data });
+  return codexHook("SessionStart", { session_id: "session-1", ...data });
 }
 
-export function userPromptSubmit(data: Record<string, unknown> = {}): EnqueueEvent {
-  return codexEvent("UserPromptSubmit", { session_id: "session-1", ...data });
+export function stop(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("Stop", { session_id: "session-1", ...data });
 }
 
-export function stop(data: Record<string, unknown> = {}): EnqueueEvent {
-  return codexEvent("Stop", { session_id: "session-1", ...data });
+export function subagentStart(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("SubagentStart", { session_id: "session-1", ...data });
+}
+
+export function subagentStop(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("SubagentStop", { session_id: "session-1", ...data });
+}
+
+export function preToolUse(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("PreToolUse", { session_id: "session-1", ...data });
+}
+
+export function preCompact(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("PreCompact", { session_id: "session-1", ...data });
+}
+
+export function postCompact(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("PostCompact", { session_id: "session-1", ...data });
+}
+
+export function postToolUse(data: Record<string, unknown> = {}): EnqueueEvent {
+  return codexHook("PostToolUse", { session_id: "session-1", ...data });
 }
 
 /** A config event enabling tracing; pass extra reporting config via `data`. */
@@ -53,25 +95,307 @@ export function configEvent(data: Record<string, unknown> = {}): EnqueueEvent {
 }
 
 // ============================================================================
-// assertProducesTrace: run events through a CodexEventProcessor and assert the
-// resulting trace. One session => one trace.
+// Transcript entry builders
+// ============================================================================
+
+/** Marks a transcript line being written to the (fake) transcript file. */
+export interface TranscriptEntry {
+  kind: "transcript";
+  record: Record<string, unknown>;
+}
+
+/**
+ * Build a raw transcript entry from a full record envelope
+ * `{ timestamp, type, payload }`. `timestamp` defaults to a monotonic value.
+ */
+export function transcript(record: Record<string, unknown> = {}): TranscriptEntry {
+  return {
+    kind: "transcript",
+    record: { timestamp: nextTimestamp(), ...record },
+  };
+}
+
+/** session_meta record: opens the root span (cwd, session id, cli version). */
+export function sessionMeta(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({ type: "session_meta", payload: { id: "session-1", ...payload } });
+}
+
+/** turn_context record: carries the model for a turn. */
+export function turnContext(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({ type: "turn_context", payload });
+}
+
+/** task_started event: opens a turn span (by turn_id). */
+export function taskStarted(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({ type: "event_msg", payload: { type: "task_started", ...payload } });
+}
+
+/** user_message event: the prompt; sets the open turn's input. */
+export function userMessage(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({ type: "event_msg", payload: { type: "user_message", ...payload } });
+}
+
+/** task_complete event: closes a turn span (output = last_agent_message). */
+export function taskComplete(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({ type: "event_msg", payload: { type: "task_complete", ...payload } });
+}
+
+/** function_call response_item: opens a tool span (by call_id, under turn). */
+export function functionCall(payload: Record<string, unknown> = {}): TranscriptEntry {
+  const { turn_id, metadata, ...rest } = payload as {
+    turn_id?: string;
+    metadata?: Record<string, unknown>;
+  } & Record<string, unknown>;
+  return transcript({
+    type: "response_item",
+    payload: {
+      type: "function_call",
+      ...(turn_id !== undefined ? { metadata: { turn_id, ...metadata } } : { metadata }),
+      ...rest,
+    },
+  });
+}
+
+/** function_call_output response_item: closes a tool span (by call_id). */
+export function functionCallOutput(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({
+    type: "response_item",
+    payload: { type: "function_call_output", ...payload },
+  });
+}
+
+/** custom_tool_call response_item (e.g. apply_patch): opens a tool span. */
+export function customToolCall(payload: Record<string, unknown> = {}): TranscriptEntry {
+  const { turn_id, metadata, ...rest } = payload as {
+    turn_id?: string;
+    metadata?: Record<string, unknown>;
+  } & Record<string, unknown>;
+  return transcript({
+    type: "response_item",
+    payload: {
+      type: "custom_tool_call",
+      ...(turn_id !== undefined ? { metadata: { turn_id, ...metadata } } : { metadata }),
+      ...rest,
+    },
+  });
+}
+
+/** custom_tool_call_output response_item: closes a tool span (by call_id). */
+export function customToolCallOutput(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({
+    type: "response_item",
+    payload: { type: "custom_tool_call_output", ...payload },
+  });
+}
+
+/** A message response_item with the given role and text. */
+export function message(role: string, text: string): TranscriptEntry {
+  const itemType = role === "assistant" ? "output_text" : "input_text";
+  return transcript({
+    type: "response_item",
+    payload: { type: "message", role, content: [{ type: itemType, text }] },
+  });
+}
+
+/** assistant message response_item (model output; opens/feeds an llm span). */
+export function assistantMessage(text: string): TranscriptEntry {
+  return message("assistant", text);
+}
+
+/** user message response_item (input context). */
+export function userMessageItem(text: string): TranscriptEntry {
+  return message("user", text);
+}
+
+/**
+ * reasoning response_item. Pass summary strings to surface readable reasoning
+ * ("thinking"); they're wrapped in Codex's real `{type:"summary_text", text}`
+ * shape. With no strings, only `encrypted_content` is present (the common case),
+ * so nothing readable is surfaced.
+ */
+export function reasoning(summary: string[] = []): TranscriptEntry {
+  return transcript({
+    type: "response_item",
+    payload: {
+      type: "reasoning",
+      summary: summary.map((text) => ({ type: "summary_text", text })),
+      encrypted_content: "opaque",
+    },
+  });
+}
+
+/**
+ * token_count event: closes the current llm span. `usage` maps the Codex token
+ * keys (input_tokens, output_tokens, total_tokens, ...) onto last_token_usage.
+ */
+export function tokenCount(usage: Record<string, number> = {}): TranscriptEntry {
+  return transcript({
+    type: "event_msg",
+    payload: { type: "token_count", info: { last_token_usage: usage } },
+  });
+}
+
+/**
+ * compacted record: marks the current turn as a context compaction.
+ * `replacement_history` is the summarized history that replaces prior context.
+ */
+export function compacted(payload: Record<string, unknown> = {}): TranscriptEntry {
+  return transcript({ type: "compacted", payload });
+}
+
+/** One entry in a mixed trace list: a hook event or a transcript write. */
+export type TraceEntry = EnqueueEvent | TranscriptEntry;
+
+function isTranscriptEntry(entry: TraceEntry): entry is TranscriptEntry {
+  return (entry as TranscriptEntry).kind === "transcript";
+}
+
+// ============================================================================
+// FakeTranscriptReader: an in-memory transcript fed by transcript(...) entries.
+// ============================================================================
+
+/**
+ * Models the transcript file as a growing byte buffer. `append` writes a
+ * complete line; `readFrom` returns complete lines from the given offset.
+ * `waitFor` resolves immediately from the current buffer (the harness has
+ * already written everything available before the hook runs), so tests stay
+ * synchronous and don't actually sleep.
+ */
+export class FakeTranscriptReader implements TranscriptReader {
+  private buffer = "";
+  /** Offsets passed to readFrom/waitFor, in call order (for assertions). */
+  readonly readOffsets: number[] = [];
+
+  append(line: string): void {
+    this.buffer += `${line}\n`;
+  }
+
+  readFrom(_path: string, offset: number): TranscriptReadResult {
+    this.readOffsets.push(offset);
+    const from = offset > this.buffer.length ? 0 : offset;
+    const slice = this.buffer.slice(from);
+    const lines: string[] = [];
+    let consumed = 0;
+    let lineStart = 0;
+    for (let i = 0; i < slice.length; i++) {
+      if (slice[i] === "\n") {
+        lines.push(slice.slice(lineStart, i));
+        lineStart = i + 1;
+        consumed = lineStart;
+      }
+    }
+    return { lines, offset: from + consumed };
+  }
+
+  async waitFor(
+    path: string,
+    offset: number,
+    predicate: (line: string) => boolean,
+    _opts: WaitForOptions,
+  ): Promise<TranscriptWaitResult> {
+    const { lines, offset: next } = this.readFrom(path, offset);
+    const sentinelFound = lines.some(predicate);
+    return { lines, offset: next, sentinelFound };
+  }
+}
+
+/**
+ * A transcript reader backed by multiple in-memory files, keyed by path. Used by
+ * subagent tests, where the main session and each subagent write to separate
+ * transcript files but are read by one processor. `append(path, line)` writes to
+ * a specific file; reads/waits operate per path with independent offsets.
+ */
+export class MultiFileTranscriptReader implements TranscriptReader {
+  private readonly buffers = new Map<string, string>();
+  /** Per-path withheld trailing line that only appears after N more reads. */
+  private readonly withheldByPath = new Map<string, { line: string; afterReads: number }>();
+
+  append(path: string, line: string): void {
+    this.buffers.set(path, `${this.buffers.get(path) ?? ""}${line}\n`);
+  }
+
+  /**
+   * Append a line to `path` that only becomes visible after `afterReads` more
+   * reads of that path. Models a record Codex writes slightly later, so it isn't
+   * seen by the catch-up that's running now but is by a subsequent one.
+   */
+  withhold(path: string, line: string, afterReads: number): void {
+    this.withheldByPath.set(path, { line, afterReads });
+  }
+
+  readFrom(path: string, offset: number): TranscriptReadResult {
+    const pending = this.withheldByPath.get(path);
+    if (pending !== undefined) {
+      if (pending.afterReads <= 0) {
+        this.append(path, pending.line);
+        this.withheldByPath.delete(path);
+      } else {
+        pending.afterReads -= 1;
+      }
+    }
+    const buffer = this.buffers.get(path) ?? "";
+    const from = offset > buffer.length ? 0 : offset;
+    const slice = buffer.slice(from);
+    const lines: string[] = [];
+    let consumed = 0;
+    let lineStart = 0;
+    for (let i = 0; i < slice.length; i++) {
+      if (slice[i] === "\n") {
+        lines.push(slice.slice(lineStart, i));
+        lineStart = i + 1;
+        consumed = lineStart;
+      }
+    }
+    return { lines, offset: from + consumed };
+  }
+
+  async waitFor(
+    path: string,
+    offset: number,
+    predicate: (line: string) => boolean,
+    _opts: WaitForOptions,
+  ): Promise<TranscriptWaitResult> {
+    const { lines, offset: next } = this.readFrom(path, offset);
+    const sentinelFound = lines.some(predicate);
+    return { lines, offset: next, sentinelFound };
+  }
+}
+
+// ============================================================================
+// assertProducesTrace: run a mixed list of hook events and transcript writes
+// through a CodexEventProcessor and assert the resulting trace.
 // ============================================================================
 
 export async function assertProducesTrace(
-  events: EnqueueEvent[],
+  entries: TraceEntry[],
   expected: ExpectedSpan,
-  opts: { queueId?: string } = {},
+  opts: { queueId?: string; reader?: FakeTranscriptReader } = {},
 ): Promise<void> {
-  const queueId = opts.queueId ?? events[0]?.queueId ?? "session-1";
+  const hookEvents = entries.filter((e): e is EnqueueEvent => !isTranscriptEntry(e));
+  const queueId = opts.queueId ?? hookEvents[0]?.queueId ?? "session-1";
+
   // Tracing is off by default; prepend a tracing-enabled config event unless the
   // caller already provided one, so trace assertions exercise the span path.
-  const hasConfig = events.some((e) => e.eventName === CODEX_CONFIG_EVENT);
-  const toProcess = hasConfig ? events : [configEvent({ session_id: queueId }), ...events];
+  const hasConfig = hookEvents.some((e) => e.eventName === CODEX_CONFIG_EVENT);
+  const toRun: TraceEntry[] = hasConfig
+    ? entries
+    : [configEvent({ session_id: queueId }), ...entries];
+
+  const reader = opts.reader ?? new FakeTranscriptReader();
   const trace = withCapturedTrace();
   try {
-    const processor = new CodexEventProcessor(queueId, createTestLogger(), () => trace.spanFactory);
-    for (const event of toProcess) {
-      await processor.process(event);
+    const processor = new CodexEventProcessor(
+      queueId,
+      createTestLogger(),
+      () => trace.spanFactory,
+      reader,
+    );
+    for (const entry of toRun) {
+      if (isTranscriptEntry(entry)) {
+        reader.append(JSON.stringify(entry.record));
+      } else {
+        await processor.process(entry);
+      }
     }
     await processor.flush();
 
diff --git a/plugins/trace-codex/src/agents/codex/transcript-reader.ts b/plugins/trace-codex/src/agents/codex/transcript-reader.ts
new file mode 100644
index 0000000..ef2ad70
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/transcript-reader.ts
@@ -0,0 +1,155 @@
+// Reads new content from a Codex transcript file starting at a byte offset.
+//
+// The processor "catches up" on the transcript whenever a hook arrives: it asks
+// the reader for everything appended since the last read, parses the new lines,
+// and advances its offset. This interface is the seam that keeps the processor
+// free of direct filesystem coupling — production reads the real file, while
+// unit tests inject a fake fed by `transcript(...)` test entries.
+//
+// On a terminal hook (Stop) the processor must not race the transcript writer:
+// the turn's closing record may not be flushed to disk yet. `waitFor` polls the
+// file, bounded by a timeout, until a sentinel line appears (or it gives up).
+// This is the one deliberate blocking point, and it always returns rather than
+// hanging the turn.
+
+import { readFileSync, statSync } from "node:fs";
+import type { Logger } from "../../log.ts";
+
+/** Result of a read: the complete new lines and the advanced byte offset. */
+export interface TranscriptReadResult {
+  /** Complete lines (newline-terminated in the file) read since `offset`. */
+  lines: string[];
+  /**
+   * The new offset to read from next time. A partial trailing line (no
+   * newline yet) is NOT consumed and not advanced past, so it is re-read once
+   * it is fully written.
+   */
+  offset: number;
+}
+
+/** Options bounding a `waitFor` poll. */
+export interface WaitForOptions {
+  /** Give up after this many milliseconds total. */
+  timeoutMs: number;
+  /** Delay between polls. */
+  intervalMs: number;
+}
+
+/** Result of waiting: the lines read so far and whether the sentinel was seen. */
+export interface TranscriptWaitResult extends TranscriptReadResult {
+  /** True if `predicate` matched one of the read lines before the timeout. */
+  sentinelFound: boolean;
+}
+
+export interface TranscriptReader {
+  /**
+   * Read complete lines from `path` starting at byte `offset`. Returns the new
+   * lines plus the offset just past the last newline consumed. Must never
+   * throw: on a missing/unreadable file it returns no lines and the same
+   * offset, so the next hook simply tries again.
+   */
+  readFrom(path: string, offset: number): TranscriptReadResult;
+
+  /**
+   * Read from `offset`, then keep polling until `predicate` matches one of the
+   * cumulative lines read or the timeout elapses. Returns all lines read across
+   * polls, the advanced offset, and whether the sentinel was found. Never
+   * throws and always returns within the timeout, so the caller can fail soft
+   * (emit whatever exists) rather than stall the turn.
+   */
+  waitFor(
+    path: string,
+    offset: number,
+    predicate: (line: string) => boolean,
+    opts: WaitForOptions,
+  ): Promise<TranscriptWaitResult>;
+}
+
+/**
+ * Split a buffer into complete lines (those ending in "\n") and report how many
+ * bytes were consumed. A trailing fragment without a newline is left for the
+ * next read.
+ */
+function splitCompleteLines(buf: Buffer): { lines: string[]; consumed: number } {
+  const lines: string[] = [];
+  let lineStart = 0;
+  let consumed = 0;
+  for (let i = 0; i < buf.length; i++) {
+    if (buf[i] === 0x0a /* \n */) {
+      lines.push(buf.toString("utf8", lineStart, i));
+      lineStart = i + 1;
+      consumed = lineStart;
+    }
+  }
+  return { lines, consumed };
+}
+
+const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
+
+/**
+ * Production reader: reads the file, slices from `offset`, and returns complete
+ * lines plus the advanced offset. Reads the whole file and slices for
+ * simplicity; transcripts are modest and this runs off the hook hot path.
+ */
+export function createFileTranscriptReader(diagLogger?: Logger): TranscriptReader {
+  function readFrom(path: string, offset: number): TranscriptReadResult {
+    try {
+      const size = statSync(path).size;
+      // File shrank or truncated (e.g. replaced): reset to start.
+      const from = offset > size ? 0 : offset;
+      if (from >= size) {
+        return { lines: [], offset: from };
+      }
+      const buf = readFileSync(path);
+      const slice = buf.subarray(from);
+      const { lines, consumed } = splitCompleteLines(slice);
+      return { lines, offset: from + consumed };
+    } catch (err) {
+      diagLogger?.debug("transcript reader: read failed; skipping", {
+        path,
+        offset,
+        error: String(err),
+      });
+      return { lines: [], offset };
+    }
+  }
+
+  async function waitFor(
+    path: string,
+    offset: number,
+    predicate: (line: string) => boolean,
+    opts: WaitForOptions,
+  ): Promise<TranscriptWaitResult> {
+    const deadline = Date.now() + opts.timeoutMs;
+    const allLines: string[] = [];
+    let cursor = offset;
+    let sentinelFound = false;
+
+    // Poll until the sentinel appears or we run out of time. Always does at
+    // least one read so a sentinel already on disk is found immediately.
+    for (;;) {
+      const { lines, offset: next } = readFrom(path, cursor);
+      cursor = next;
+      for (const line of lines) {
+        allLines.push(line);
+        if (predicate(line)) sentinelFound = true;
+      }
+      if (sentinelFound || Date.now() >= deadline) break;
+      await sleep(opts.intervalMs);
+    }
+
+    if (!sentinelFound) {
+      diagLogger?.debug("transcript reader: sentinel not found before timeout", {
+        path,
+        offset,
+        timeoutMs: opts.timeoutMs,
+      });
+    }
+    return { lines: allLines, offset: cursor, sentinelFound };
+  }
+
+  return { readFrom, waitFor };
+}
+
+/** Default production reader (no diagnostics). */
+export const defaultTranscriptReader: TranscriptReader = createFileTranscriptReader();
diff --git a/plugins/trace-codex/src/agents/codex/transcript.test.ts b/plugins/trace-codex/src/agents/codex/transcript.test.ts
new file mode 100644
index 0000000..c49404b
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/transcript.test.ts
@@ -0,0 +1,47 @@
+import { describe, expect, test } from "bun:test";
+import { parseTranscriptLine } from "./transcript.ts";
+
+describe("parseTranscriptLine", () => {
+  test("parses a well-formed record envelope", () => {
+    const line = JSON.stringify({
+      timestamp: "2026-06-22T04:50:25.725Z",
+      type: "response_item",
+      payload: { type: "message", role: "assistant", content: "hi" },
+    });
+    const record = parseTranscriptLine(line);
+    expect(record).not.toBeNull();
+    expect(record?.type).toBe("response_item");
+    expect(record?.payload?.type).toBe("message");
+    expect(record?.timestamp).toBe("2026-06-22T04:50:25.725Z");
+  });
+
+  test("preserves unknown record types (forward compatible)", () => {
+    const line = JSON.stringify({ type: "some_future_type", payload: { type: "whatever" } });
+    const record = parseTranscriptLine(line);
+    expect(record?.type).toBe("some_future_type");
+    expect(record?.payload?.type).toBe("whatever");
+  });
+
+  test("preserves extra top-level fields", () => {
+    const line = JSON.stringify({ type: "compacted", payload: {}, window_id: 42 });
+    const record = parseTranscriptLine(line);
+    expect((record as Record<string, unknown>)?.window_id).toBe(42);
+  });
+
+  test("returns null for a blank line", () => {
+    expect(parseTranscriptLine("")).toBeNull();
+    expect(parseTranscriptLine("   ")).toBeNull();
+  });
+
+  test("returns null (no throw) for malformed JSON", () => {
+    expect(parseTranscriptLine('{"type":"response_item"')).toBeNull();
+    expect(parseTranscriptLine("not json at all")).toBeNull();
+  });
+
+  test("returns null for JSON that isn't an object", () => {
+    expect(parseTranscriptLine("[1,2,3]")).toBeNull();
+    expect(parseTranscriptLine("42")).toBeNull();
+    expect(parseTranscriptLine('"a string"')).toBeNull();
+    expect(parseTranscriptLine("null")).toBeNull();
+  });
+});
diff --git a/plugins/trace-codex/src/agents/codex/transcript.ts b/plugins/trace-codex/src/agents/codex/transcript.ts
new file mode 100644
index 0000000..e1d1a76
--- /dev/null
+++ b/plugins/trace-codex/src/agents/codex/transcript.ts
@@ -0,0 +1,81 @@
+// Pure parser/types for Codex transcript ("rollout") records.
+//
+// Codex writes an append-only JSONL transcript per session at the path every
+// hook payload reports as `transcript_path`. Each line is one record with a
+// stable envelope: `{ timestamp, type, payload }`. The transcript is the only
+// source for LLM calls, token usage, and the true execution order of LLM and
+// tool calls within a turn.
+//
+// This module only parses lines into typed records; it builds no spans and has
+// no Braintrust dependency. The format is explicitly NOT a stable interface
+// (Codex may change it across versions), so parsing is permissive and
+// fail-soft: unknown record types are preserved as-is, and a malformed line
+// yields null rather than throwing.
+
+/**
+ * The common envelope every transcript line shares. `payload` is intentionally
+ * loose: we narrow it per `type`/`payload.type` only where we need to, and we
+ * tolerate shapes we don't recognize.
+ */
+export interface TranscriptRecord {
+  /** ISO-8601 timestamp Codex wrote the record (its clock). */
+  timestamp?: string;
+  /** Top-level record kind, e.g. "response_item", "event_msg", "turn_context". */
+  type?: string;
+  /** Record-specific body. Narrow via `payload.type` for response_item/event_msg. */
+  payload?: TranscriptPayload;
+  /** Anything else Codex includes; preserved for forward compatibility. */
+  [key: string]: unknown;
+}
+
+/** Loose payload shape. `type` discriminates response_item / event_msg subtypes. */
+export interface TranscriptPayload {
+  type?: string;
+  [key: string]: unknown;
+}
+
+// Known top-level record types. Kept as constants (not a closed union) so
+// unknown types still parse and flow through as plain records.
+export const RECORD_SESSION_META = "session_meta";
+export const RECORD_TURN_CONTEXT = "turn_context";
+export const RECORD_RESPONSE_ITEM = "response_item";
+export const RECORD_EVENT_MSG = "event_msg";
+export const RECORD_COMPACTED = "compacted";
+
+// Known response_item payload subtypes.
+export const ITEM_MESSAGE = "message";
+export const ITEM_REASONING = "reasoning";
+export const ITEM_FUNCTION_CALL = "function_call";
+export const ITEM_FUNCTION_CALL_OUTPUT = "function_call_output";
+export const ITEM_CUSTOM_TOOL_CALL = "custom_tool_call";
+export const ITEM_CUSTOM_TOOL_CALL_OUTPUT = "custom_tool_call_output";
+export const ITEM_TOOL_SEARCH_CALL = "tool_search_call";
+export const ITEM_TOOL_SEARCH_OUTPUT = "tool_search_output";
+
+// Known event_msg payload subtypes.
+export const EVT_TOKEN_COUNT = "token_count";
+export const EVT_AGENT_MESSAGE = "agent_message";
+export const EVT_USER_MESSAGE = "user_message";
+export const EVT_TASK_STARTED = "task_started";
+export const EVT_TASK_COMPLETE = "task_complete";
+export const EVT_CONTEXT_COMPACTED = "context_compacted";
+
+/**
+ * Parse a single transcript line into a TranscriptRecord. Returns null for
+ * blank lines and for anything that isn't a JSON object, so callers can skip it
+ * without breaking the session. Never throws.
+ */
+export function parseTranscriptLine(line: string): TranscriptRecord | null {
+  const trimmed = line.trim();
+  if (trimmed.length === 0) return null;
+  try {
+    const parsed = JSON.parse(trimmed) as unknown;
+    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+      return null;
+    }
+    return parsed as TranscriptRecord;
+  } catch {
+    // Malformed JSON (e.g. a partially-written line): skip it.
+    return null;
+  }
+}
diff --git a/plugins/trace-codex/src/braintrust/logger.ts b/plugins/trace-codex/src/braintrust/logger.ts
index a31710f..4b95023 100644
--- a/plugins/trace-codex/src/braintrust/logger.ts
+++ b/plugins/trace-codex/src/braintrust/logger.ts
@@ -13,9 +13,64 @@ import type { Logger } from "../log.ts";
 
 export type { Span, StartSpanArgs };
 
+/**
+ * The minimal, serializable identity of a span: enough to recreate a handle
+ * bound to the same row after a restart, so further log()/end() calls merge into
+ * the original span server-side (rows are keyed by span_id). Captured from a live
+ * span's `spanId`/`rootSpanId`/`spanParents` getters (all synchronous).
+ */
+export interface SpanRef {
+  spanId: string;
+  rootSpanId: string;
+  /** Parent span ids (empty for a root span). */
+  spanParents: string[];
+  /**
+   * The span's name/type at capture time. Re-asserted on rehydration so the
+   * merged row keeps the original attributes — without them the SDK would infer
+   * a (wrong) name from the call stack when the rehydrated handle is created.
+   */
+  name?: string;
+  type?: StartSpanArgs["type"];
+  /**
+   * The span's original start time (Unix seconds). Re-asserted on rehydration so
+   * the merged row keeps its original start — without it the SDK stamps the start
+   * to the moment of rehydration (the resume time), corrupting the span's timing.
+   */
+  startTime?: number;
+}
+
+/**
+ * Capture a live span's identity for later rehydration. `name`/`type`/`startTime`
+ * are the attributes the span was created with (the SDK doesn't expose them as
+ * getters), passed by the caller so they can be re-asserted on resume.
+ */
+export function spanRef(
+  span: Span,
+  name?: string,
+  type?: StartSpanArgs["type"],
+  startTime?: number,
+): SpanRef {
+  return {
+    spanId: span.spanId,
+    rootSpanId: span.rootSpanId,
+    spanParents: span.spanParents,
+    ...(name !== undefined ? { name } : {}),
+    ...(type !== undefined ? { type } : {}),
+    ...(startTime !== undefined ? { startTime } : {}),
+  };
+}
+
 /** Creates root spans and can flush buffered events. */
 export interface SpanFactory {
   startSpan(args: StartSpanArgs): Span;
+  /**
+   * Recreate a handle bound to an existing span's identity (see {@link SpanRef}).
+   * The returned span carries the original span_id/root_span_id/parents, so
+   * subsequent log()/end() calls merge into that row rather than creating a new
+   * one. Used to resume a session's trace after the server restarted. The
+   * returned handle is "naked": it logs nothing until the caller does.
+   */
+  rehydrateSpan(ref: SpanRef): Span;
   flush(): Promise<void>;
 }
 
@@ -79,6 +134,14 @@ export function createSpanFactory(config?: ReportingConfig, diagLogger?: Logger)
   });
   return {
     startSpan: (args) => logger.startSpan(args),
+    rehydrateSpan: (ref) =>
+      logger.startSpan({
+        spanId: ref.spanId,
+        parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId },
+        ...(ref.name !== undefined ? { name: ref.name } : {}),
+        ...(ref.type !== undefined ? { type: ref.type } : {}),
+        ...(ref.startTime !== undefined ? { startTime: ref.startTime } : {}),
+      }),
     flush: async () => {
       try {
         await logger.flush();
diff --git a/plugins/trace-codex/src/client/client.ts b/plugins/trace-codex/src/client/client.ts
index f4b1735..264306b 100644
--- a/plugins/trace-codex/src/client/client.ts
+++ b/plugins/trace-codex/src/client/client.ts
@@ -34,11 +34,19 @@ async function readStdin(): Promise<string> {
 export interface HookClientOptions {
   /**
    * Event names that terminate a turn/session. After enqueuing one of these,
-   * the client asks the server to flush synchronously (POST /flush) so the
-   * final spans are delivered before this process — and any background server —
-   * is torn down. Defaults to none (fire-and-forget).
+   * the client asks the server to flush (POST /flush) so buffered spans are
+   * delivered. Defaults to none (fire-and-forget).
    */
   terminalEvents?: readonly string[];
+  /**
+   * Whether to block on a terminal event until the server confirms all events
+   * are processed and spans are delivered. Defaults to false. When false the
+   * client does nothing on a terminal event — the long-lived server flushes on
+   * its own when the queue goes idle. Set true (via BRAINTRUST_FLUSH_ON_TURN_END)
+   * to guarantee delivery before Codex exits, e.g. in programmatic/CI runs where
+   * the process tree is torn down before the idle flush can fire.
+   */
+  flushOnTurnEnd?: boolean;
 }
 
 export async function runHookClient(
@@ -79,11 +87,14 @@ export async function runHookClient(
     }
   }
 
-  // On a terminal event, block until the server confirms the queue has drained
-  // and buffered spans are flushed. Without this, a CI job (or any short-lived
-  // host) can end right after the agent's last turn — before the background
-  // server's idle timeout fires — and lose the final spans.
-  if (sawTerminal) {
+  // On a terminal event, only flush when flushOnTurnEnd is set: block until the
+  // server confirms the queue has drained and buffered spans are delivered —
+  // important for a CI job (or any short-lived host) that ends right after the
+  // agent's last turn, before the background server's idle timeout fires, which
+  // would otherwise lose the final spans. Otherwise do nothing: the long-lived
+  // server flushes on its own when the queue goes idle, so the turn isn't
+  // stalled by a flush request.
+  if (sawTerminal && (options.flushOnTurnEnd ?? false)) {
     const flushed = await postFlush(config, logger);
     logger.debug("flush requested on terminal event", { flushed });
   }
diff --git a/plugins/trace-codex/src/index.ts b/plugins/trace-codex/src/index.ts
index d5be092..81853fe 100644
--- a/plugins/trace-codex/src/index.ts
+++ b/plugins/trace-codex/src/index.ts
@@ -22,6 +22,13 @@ function agentFactories(): Map<string, EventProcessorFactory> {
   return new Map(AGENTS.map((a) => [a.eventSource, a.createProcessor]));
 }
 
+/** Parse a boolean env var: true only for "true"/"1" (case-insensitive). */
+function parseBoolEnv(value: string | undefined): boolean {
+  if (!value) return false;
+  const v = value.trim().toLowerCase();
+  return v === "true" || v === "1";
+}
+
 async function main(): Promise<void> {
   const mode = process.argv[2] ?? "hook";
 
@@ -64,6 +71,8 @@ async function main(): Promise<void> {
   try {
     await runHookClient(config, logger, agent.buildEvents, {
       terminalEvents: agent.terminalEvents,
+      // Settings (above) have mapped BRAINTRUST_FLUSH_ON_TURN_END into env.
+      flushOnTurnEnd: parseBoolEnv(process.env.BRAINTRUST_FLUSH_ON_TURN_END),
     });
   } catch (err) {
     logger.error("unexpected hook client error", { error: String(err) });
diff --git a/plugins/trace-codex/src/server/enqueue-client.ts b/plugins/trace-codex/src/server/enqueue-client.ts
index a947c0e..e00f680 100644
--- a/plugins/trace-codex/src/server/enqueue-client.ts
+++ b/plugins/trace-codex/src/server/enqueue-client.ts
@@ -31,13 +31,14 @@ export async function postEnqueue(
 }
 
 /**
- * Ask the server to process everything enqueued so far and flush buffered spans
- * to the backend, then wait for it to confirm. Returns true on a 2xx response.
- * Never throws.
+ * Ask the server to flush, blocking until it confirms. Returns true on a 2xx
+ * response. Never throws.
  *
- * The hook calls this after a terminal event so the final spans are delivered
- * before the process tree is torn down (e.g. when a CI job ends right after the
- * agent's last turn, before the background server's idle timeout fires).
+ * The server waits until everything enqueued so far has been processed and
+ * buffered spans have reached the backend before responding. The hook uses this
+ * after a terminal event (only when block-on-stop is configured) so the final
+ * spans are delivered before the process tree is torn down (e.g. a CI job ending
+ * right after the agent's last turn, before the idle timeout fires).
  */
 export async function postFlush(
   config: Pick<Config, "host" | "port">,
@@ -46,8 +47,8 @@ export async function postFlush(
   try {
     const res = await fetch(`${serverBaseUrl(config)}/flush`, {
       method: "POST",
-      // The server bounds its own wait (FLUSH_TIMEOUT_MS); allow a little more
-      // here so we receive its response rather than aborting the request.
+      // The server bounds its wait by FLUSH_TIMEOUT_MS; allow a little more here
+      // so we receive its response rather than aborting first.
       signal: AbortSignal.timeout(12_000),
     });
     if (!res.ok) {
diff --git a/plugins/trace-codex/src/server/routes.ts b/plugins/trace-codex/src/server/routes.ts
index 3ae78bf..74ce636 100644
--- a/plugins/trace-codex/src/server/routes.ts
+++ b/plugins/trace-codex/src/server/routes.ts
@@ -83,11 +83,13 @@ export async function handleRequest(req: Request, deps: RouteDeps): Promise<Resp
   }
 
   // POST /flush
-  // Wait until everything enqueued so far has been processed and buffered spans
-  // have been flushed to the backend, then respond. Used by the hook client on
-  // terminal events (e.g. Codex "Stop") so the final spans are delivered before
-  // the process tree is torn down — important in short-lived environments like
-  // CI where the background server won't survive to flush on idle.
+  //
+  // Wait until everything enqueued so far has been processed AND buffered spans
+  // have been flushed to the backend, then respond. Used on terminal events /
+  // shutdown in short-lived environments (e.g. CI) where the background server
+  // won't survive to flush on idle, so the final spans must be delivered before
+  // the process tree is torn down. In normal interactive use the client doesn't
+  // call this — the server flushes on its own when the queue goes idle.
   if (path === "/flush" && req.method === "POST") {
     const flushed = await Promise.race([
       queue.drained().then(() => true),
diff --git a/plugins/trace-codex/src/server/server.test.ts b/plugins/trace-codex/src/server/server.test.ts
index e26689c..f85f690 100644
--- a/plugins/trace-codex/src/server/server.test.ts
+++ b/plugins/trace-codex/src/server/server.test.ts
@@ -127,4 +127,53 @@ describe("startServer", () => {
       await server.stop();
     }
   });
+
+  test("/flush waits for the queue to drain and processors to flush", async () => {
+    // /flush must return only after every enqueued event has been processed AND
+    // processors have been flushed (the idle flush runs before drained()
+    // resolves). Records ordering to prove flush happened before the response.
+    const events: string[] = [];
+    const factory: EventProcessorFactory = () => ({
+      process: async () => {
+        await sleep(20);
+        events.push("process");
+      },
+      flush: () => {
+        events.push("flush");
+      },
+    });
+
+    const server = startServer(
+      testConfig({ idleTimeoutMs: 60_000, idleCheckIntervalMs: 60_000 }),
+      new Map([["test", factory]]),
+      createTestLogger(),
+    );
+
+    try {
+      const enqueue = await fetch(`http://127.0.0.1:${server.port}/enqueue`, {
+        method: "POST",
+        headers: { "content-type": "application/json" },
+        body: JSON.stringify({
+          queueId: "s1",
+          eventSource: "test",
+          eventSourceVersion: null,
+          eventName: "Stop",
+          eventData: {},
+        }),
+      });
+      expect(enqueue.status).toBe(200);
+
+      const flushRes = await fetch(`http://127.0.0.1:${server.port}/flush`, {
+        method: "POST",
+      });
+      expect(flushRes.status).toBe(200);
+      expect(await flushRes.json()).toEqual({ ok: true });
+
+      // By the time /flush returns, the event was processed and the idle flush
+      // ran.
+      expect(events).toEqual(["process", "flush"]);
+    } finally {
+      await server.stop();
+    }
+  });
 });
diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts
index 639d4a4..2f99eeb 100644
--- a/plugins/trace-codex/src/test-helpers.ts
+++ b/plugins/trace-codex/src/test-helpers.ts
@@ -1,7 +1,7 @@
 // Shared test helpers.
 
 import { _exportsForTestingOnly, initLogger } from "braintrust";
-import type { Span, SpanFactory, StartSpanArgs } from "./braintrust/logger.ts";
+import type { Span, SpanFactory, SpanRef, StartSpanArgs } from "./braintrust/logger.ts";
 import type { Logger } from "./log.ts";
 
 /** A no-op logger for tests; never touches the filesystem. */
@@ -53,6 +53,25 @@ export function createFakeSpanFactory(): FakeSpanFactory {
         },
       } as unknown as Span;
     },
+    rehydrateSpan(ref: SpanRef): Span {
+      const fake: FakeSpan = {
+        id: ref.spanId,
+        startArgs: {},
+        flushCount: 0,
+        endCount: 0,
+      };
+      spans.push(fake);
+      return {
+        id: fake.id,
+        flush: async () => {
+          fake.flushCount += 1;
+        },
+        end: () => {
+          fake.endCount += 1;
+          return 0;
+        },
+      } as unknown as Span;
+    },
     flush: async () => {
       factory.factoryFlushCount += 1;
     },
@@ -85,7 +104,8 @@ export interface CapturedSpan {
   input?: unknown;
   output?: unknown;
   metadata?: Record<string, unknown>;
-  metrics?: { start?: number; end?: number };
+  tags?: string[];
+  metrics?: { start?: number; end?: number } & Record<string, number | undefined>;
 }
 
 export interface CapturedTrace {
@@ -109,6 +129,14 @@ export function withCapturedTrace(): CapturedTrace {
   return {
     spanFactory: {
       startSpan: (args) => logger.startSpan(args),
+      rehydrateSpan: (ref) =>
+        logger.startSpan({
+          spanId: ref.spanId,
+          parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId },
+          ...(ref.name !== undefined ? { name: ref.name } : {}),
+          ...(ref.type !== undefined ? { type: ref.type } : {}),
+          ...(ref.startTime !== undefined ? { startTime: ref.startTime } : {}),
+        }),
       flush: () => logger.flush(),
     },
     drain: async () => {
@@ -131,12 +159,51 @@ export interface SpanTree {
   input?: unknown;
   output?: unknown;
   metadata?: Record<string, unknown>;
-  metrics?: { start?: number; end?: number };
+  tags?: string[];
+  metrics?: { start?: number; end?: number } & Record<string, number | undefined>;
   children: SpanTree[];
 }
 
+/**
+ * Merge captured span rows that share a span_id into one logical span, the way
+ * Braintrust merges rows server-side. Multiple rows for one span_id occur when a
+ * span is logged across several calls (e.g. start, then output, then end) and
+ * especially when a span is rehydrated after a server restart: the resumed
+ * handle re-emits rows under the original id. Later non-empty fields win;
+ * metadata and span_attributes are shallow-merged so partial updates accumulate.
+ */
+export function mergeCapturedSpans(spans: CapturedSpan[]): CapturedSpan[] {
+  const byId = new Map<string, CapturedSpan>();
+  const order: string[] = [];
+  for (const span of spans) {
+    const existing = byId.get(span.span_id);
+    if (existing === undefined) {
+      byId.set(span.span_id, { ...span });
+      order.push(span.span_id);
+      continue;
+    }
+    // Merge: later rows override, but don't clobber an existing value with
+    // undefined (a partial row that omits a field shouldn't erase it).
+    const merged: CapturedSpan = { ...existing };
+    for (const [key, value] of Object.entries(span) as [keyof CapturedSpan, unknown][]) {
+      if (value === undefined) continue;
+      if (key === "metadata" || key === "span_attributes" || key === "metrics") {
+        merged[key] = {
+          ...(existing[key] as Record<string, unknown> | undefined),
+          ...(value as Record<string, unknown>),
+        } as never;
+      } else {
+        merged[key] = value as never;
+      }
+    }
+    byId.set(span.span_id, merged);
+  }
+  return order.map((id) => byId.get(id) as CapturedSpan);
+}
+
 /** Build a single-rooted tree from flat captured spans (via span_parents). */
-export function spansToTree(spans: CapturedSpan[]): SpanTree | null {
+export function spansToTree(rawSpans: CapturedSpan[]): SpanTree | null {
+  const spans = mergeCapturedSpans(rawSpans);
   if (spans.length === 0) return null;
 
   const root = spans.find(
@@ -171,6 +238,7 @@ export function spansToTree(spans: CapturedSpan[]): SpanTree | null {
       input: span.input,
       output: span.output,
       metadata: span.metadata,
+      tags: span.tags,
       metrics: span.metrics,
       children,
     };
@@ -188,8 +256,12 @@ export interface ExpectedSpan {
   input?: unknown;
   output?: unknown;
   metadata?: Record<string, unknown>;
+  /** If set, assert each listed tag is present on the span. */
+  tags?: string[];
   /** If set, assert whether the span has an end time (true) or not (false). */
   ended?: boolean;
+  /** If set, assert exact start/end metric values (Unix seconds). */
+  metrics?: { start?: number; end?: number } & Record<string, number | undefined>;
   /** Exact list of children (length and order are checked). */
   children?: ExpectedSpan[];
 }
@@ -198,6 +270,23 @@ function nameMatches(actual: string | undefined, expected: string | RegExp): boo
   return expected instanceof RegExp ? expected.test(actual ?? "") : actual === expected;
 }
 
+/** JSON.stringify with object keys sorted recursively, for order-insensitive
+ * deep equality of plain data (objects/arrays/primitives). */
+function canonicalJson(value: unknown): string {
+  const sort = (v: unknown): unknown => {
+    if (Array.isArray(v)) return v.map(sort);
+    if (v !== null && typeof v === "object") {
+      const out: Record<string, unknown> = {};
+      for (const key of Object.keys(v as Record<string, unknown>).sort()) {
+        out[key] = sort((v as Record<string, unknown>)[key]);
+      }
+      return out;
+    }
+    return v;
+  };
+  return JSON.stringify(sort(value));
+}
+
 export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: string): string[] {
   const diffs: string[] = [];
   if (actual === null) {
@@ -233,17 +322,37 @@ export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path:
   }
   if (expected.metadata !== undefined) {
     for (const [key, value] of Object.entries(expected.metadata)) {
-      const a = JSON.stringify(actual.metadata?.[key]);
-      const e = JSON.stringify(value);
+      // Compare with sorted keys so nested-object key order doesn't matter (the
+      // SDK may reorder keys, e.g. when metadata is merged across log() calls).
+      const a = canonicalJson(actual.metadata?.[key]);
+      const e = canonicalJson(value);
       if (a !== e) diffs.push(`${path}.metadata.${key}: expected ${e}, got ${a}`);
     }
   }
+  if (expected.tags !== undefined) {
+    for (const tag of expected.tags) {
+      if (!actual.tags?.includes(tag)) {
+        diffs.push(
+          `${path}.tags: expected to include "${tag}", got ${JSON.stringify(actual.tags)}`,
+        );
+      }
+    }
+  }
   if (expected.ended !== undefined) {
     const isEnded = actual.metrics?.end !== undefined;
     if (isEnded !== expected.ended) {
       diffs.push(`${path}.ended: expected ${expected.ended}, got ${isEnded}`);
     }
   }
+  if (expected.metrics !== undefined) {
+    for (const [key, value] of Object.entries(expected.metrics)) {
+      if (value === undefined) continue;
+      const actualValue = actual.metrics?.[key];
+      if (actualValue !== value) {
+        diffs.push(`${path}.metrics.${key}: expected ${value}, got ${String(actualValue)}`);
+      }
+    }
+  }
   if (expected.children !== undefined) {
     if (actual.children.length !== expected.children.length) {
       diffs.push(