diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 57dc956..da62abb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,7 +7,7 @@ on: jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 44aee2c..fec606b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -17,7 +17,9 @@ env: jobs: release: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + outputs: + version: ${{ steps.vars.outputs.version }} steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -95,3 +97,13 @@ jobs: "$PLUGIN_DIR"/bin/codex-hook-darwin-x64 \ "$PLUGIN_DIR"/bin/codex-hook-linux-x64 \ "$PLUGIN_DIR"/bin/codex-hook-linux-arm64 + + # After a successful publish, run the end-to-end smoke test (macOS + Linux) + # against the just-released version. + smoke: + needs: release + uses: ./.github/workflows/smoke.yaml + with: + version: ${{ needs.release.outputs.version }} + secrets: + openai_api_key: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml new file mode 100644 index 0000000..ea66015 --- /dev/null +++ b/.github/workflows/smoke.yaml @@ -0,0 +1,81 @@ +name: Smoke trace-codex + +# End-to-end smoke test for the trace-codex plugin against a published release. +# Installs Codex + the plugin from the marketplace, runs a real `codex exec` +# session with tracing pointed at a local mock Braintrust collector, and asserts +# at least one trace row was reported. +# +# Two entry points: +# - workflow_call: invoked by release.yaml after a successful release, with the +# version it just published. +# - workflow_dispatch: run manually from the Actions tab against any published +# release version (e.g. to retest an older release). + +on: + workflow_call: + inputs: + version: + description: "Published release version to test, e.g. 0.0.1 (with or without leading v)." + required: true + type: string + secrets: + openai_api_key: + description: "OpenAI API key used by codex exec." + required: true + workflow_dispatch: + inputs: + version: + description: "Published release version to test, e.g. 0.0.1 (with or without leading v)." + required: true + type: string + +permissions: + contents: read + +env: + PLUGIN_DIR: plugins/trace-codex + +jobs: + smoke: + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-24.04 + label: linux-x64 + - runner: macos-15 + label: darwin-arm64 + - runner: macos-15-intel + label: darwin-x64 + runs-on: ${{ matrix.runner }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Install Bun + uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2 + with: + bun-version: latest + + - name: Install plugin dependencies + working-directory: ${{ env.PLUGIN_DIR }} + run: bun install --frozen-lockfile + + - name: Install Codex CLI + run: npm install -g @openai/codex + + - name: Install trace-codex plugin from the marketplace + env: + # The release repo is private; the launcher downloads the binary using + # this token. `codex plugin` also uses gh credentials for the clone. + GH_TOKEN: ${{ github.token }} + run: | + codex plugin marketplace add braintrustdata/braintrust-codex-plugin + codex plugin add trace-codex@braintrust-codex-plugins + + - name: Run smoke test (${{ matrix.label }}) + working-directory: ${{ env.PLUGIN_DIR }} + env: + OPENAI_API_KEY: ${{ secrets.openai_api_key || secrets.OPENAI_API_KEY }} + GH_TOKEN: ${{ github.token }} + SMOKE_RELEASE_VERSION: ${{ inputs.version }} + run: sh scripts/smoke-test.sh diff --git a/AGENTS.md b/AGENTS.md index d2016b0..2e4bf58 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,8 +17,9 @@ Key files for the MCP/skills plugin: - `plugins/braintrust-codex-plugin/.mcp.json` — MCP server definition - `plugins/braintrust-codex-plugin/skills/braintrust/` — agent skills exposed through the plugin -Key files for the tracing plugin: +Key files for the tracing plugin (see [`plugins/trace-codex/AGENTS.md`](plugins/trace-codex/AGENTS.md) for its architecture): +- `plugins/trace-codex/AGENTS.md` — architecture and contributor guide for this plugin - `plugins/trace-codex/.codex-plugin/plugin.json` — plugin manifest - `plugins/trace-codex/hooks/hooks.json` — lifecycle hook config - `plugins/trace-codex/src/` — the hook client + event server (compiled to `bin/codex-hook`) diff --git a/plugins/trace-codex/AGENTS.md b/plugins/trace-codex/AGENTS.md new file mode 100644 index 0000000..1c0edc1 --- /dev/null +++ b/plugins/trace-codex/AGENTS.md @@ -0,0 +1,195 @@ +# AGENTS.md — trace-codex + +Guidelines for AI agents working in the `trace-codex` plugin. This is the +opt-in plugin that traces Codex sessions to Braintrust. It is **independent** +from the MCP/skills plugin (`braintrust-codex-plugin`) — do not merge behavior +between them. See the [repo AGENTS.md](../../AGENTS.md) for the monorepo +overview. + +## What this plugin is + +A long-lived, local **event server** plus a short-lived **hook client**, both +compiled into a single `codex-hook` binary. Codex fires lifecycle hooks; each +hook invocation runs the client, which forwards the event to the server, which +turns the stream of events into Braintrust spans (session → turn → tool). + +## Architecture + +Request/data flow: + +``` +Codex hook → bin/codex-hook.sh → codex-hook (client mode) + → POST /enqueue → in-memory FIFO queue + → ProcessorRegistry (one EventProcessor per session) + → Braintrust spans +``` + +Key pieces: + +- `src/index.ts` — entry point. One binary, three modes: `serve` (the + background server), `hook` (default; read one event from stdin, enqueue it), + and `replay` (re-POST a recorded NDJSON session). +- `src/client/` — the hook client. Ensures a server is up (booting a detached + one if needed), POSTs events, and on a terminal event (`Stop`) calls `/flush` + so final spans are delivered before the process tree is torn down. +- `src/server/` — the HTTP server, the FIFO `EventQueue`, routes + (`/enqueue`, `/flush`, `/health`, `/shutdown`), and the idle watchdog. +- `src/processor/` — the generic `ProcessorRegistry` (LRU map of per-session + processors) and the `EventProcessor` interface. +- `src/agents/codex/` — Codex-specific: translate hook payloads into + `EnqueueEvent`s (`event-builder.ts`), build spans (`event-processor.ts`), and + read user settings (`settings.ts`). +- `src/braintrust/` — thin wrapper over the Braintrust SDK (span factory). + +The server is **single-version**: a client whose version doesn't match a running +server bails rather than talking to it. It shuts itself down after an idle +window (default 5 min) or on `/shutdown`. + +The design has three deliberate properties worth understanding before changing +anything: + +### 1. Agent-agnostic "event server" + +The core (`src/server/`, `src/processor/`, `src/braintrust/`) knows nothing +about Codex. It speaks a generic `EnqueueEvent` shape and routes events to a +per-session processor. All Codex-specific knowledge lives in `src/agents/codex/` +and is registered into the generic core at startup. Adding another agent (Claude +Code, opencode, etc.) means adding a sibling `src/agents//` module that +exports the same `Agent` shape — the server, queue, and registry should not need +to change. + +Keep this boundary clean: nothing under `src/server/`, `src/processor/`, or +`src/braintrust/` should reference Codex by name. The generic event server may +eventually be extracted into its own package, so treat any Codex-specific leak +into the generic layers as a bug, not a shortcut. + +### 2. Bun-compiled, cross-platform binaries + +The hook command in `hooks.json` is a fixed, platform-agnostic string that +invokes `bin/codex-hook.sh` (or `.cmd` on Windows), **not** the binary directly. +That launcher resolves and runs the real, platform-specific `codex-hook` binary. + +- `scripts/build.ts` compiles `src/index.ts` with `bun build --compile` for each + target (`darwin-arm64/x64`, `linux-x64/arm64`; Windows slots in but isn't + built yet). `BUILD_HOST_ONLY=1` builds just the host target (used by the + repo-root `install.sh` for local dev installs). +- The compiled binaries are **large and not committed**. The launcher downloads + the matching binary from the plugin's GitHub release on first use and caches + it at `$PLUGIN_ROOT/bin/codex-hook`. Codex wipes `$PLUGIN_ROOT` on every + install/upgrade, so the cache self-invalidates and the next hook re-downloads + the right version. +- Plugin version is the single source of truth in + `.codex-plugin/plugin.json`; the launcher reads it to construct the release + tag (`trace-codex-v`). + +**Hard rule:** the hook must never fail the Codex turn. The launcher and client +log to stderr and exit 0 on any error; the server swallows errors so a turn is +never blocked by tracing. + +### 3. Hooks are non-blocking by default + +Tracing must not degrade the experience of using the coding agent. Every hook +fires synchronously in Codex's path, so blocking work on a hook directly adds +latency to the user's session. The design therefore makes the common path +fire-and-forget: the client POSTs to `/enqueue` and returns immediately, while +the background server does the slow work (Braintrust SDK calls, flushes) off the +critical path. + +The terminal event is the one place where blocking is even an option, and it's +**opt-in**: + +- On a **terminal event** (`Stop`), the client calls `/flush` **only** when + `BRAINTRUST_FLUSH_ON_TURN_END` is set, blocking until the server confirms the + queue has drained and buffered spans reached Braintrust. The blocking mode + exists for short-lived hosts (e.g. a CI job that ends right after the last turn) + that would otherwise tear the process tree down before the final spans are + delivered. When enabled, the wait is bounded by a timeout (`/flush` gives up + rather than hanging the turn). By default the client does nothing on `Stop`: the + long-lived server flushes on its own when the queue goes idle, so in normal + interactive use no spans are lost and the turn isn't stalled by a flush request. + +When adding behavior, prefer enqueue-and-return. Reach for a blocking +request/await only when data would otherwise be lost, keep it off the +per-hook hot path where possible, and always bound it so a slow or stuck backend +can't stall the agent. + +### 4. Sessions can outlive the server (resume) + +The server is short-lived (idle shutdown after ~5 min, or the user closes and +later resumes). A Codex session, though, can span that gap. To avoid dropping +the tail of a trace — or worse, re-emitting it as duplicate/orphaned spans — the +Codex processor **persists its resumable state** and rehydrates it when a new +processor is created for the same session id. + +How it works: + +- The transcript (rollout JSONL) on disk is the durable source of truth for span + *content*. What a restart loses is the processor's in-memory bookkeeping: + transcript byte offsets, which spans are still open (and their identities), + the reconstructed conversation history, and the subagent/compaction side-maps. +- On every `flush()` (idle drain, eviction, terminal Stop, shutdown), the + processor writes a JSON snapshot of that bookkeeping to + `PLUGIN_DATA/state/.json` (`src/agents/codex/snapshot-store.ts`). + Span handles are stored as identities (`span_id`/`root_span_id`/`parents` plus + name/type), captured from the SDK's synchronous span getters. +- On its first event, a processor attempts a one-time restore for its session + id. If a compatible snapshot exists, it recreates each span handle bound to the + original id via `SpanFactory.rehydrateSpan` (Braintrust merges rows by + `span_id`, so further `log()`/`end()` calls continue the same trace). Restore + runs **before** the tracing-enabled master switch, because the snapshot also + carries the session's reporting config (a bare mid-session restart may have no + leading config event). +- The snapshot is **not** deleted when the root span ends. Codex's `Stop` hook is + per-**turn**, not per-session (there is no session-end hook), and we end the + root on the first `Stop` — but later turns still attach as children of that same + root, so a restart *between* turns must be able to resume. The snapshot + therefore persists past root-end; stale ones (sessions that never resume) are + reclaimed by a startup GC sweep that removes snapshots older than a TTL. + +Invariants to preserve when changing this: + +- **Persistence is Codex-owned**, not part of the generic event server: *where* a + plugin may persist state is the host agent's call (Codex gives us + `PLUGIN_DATA`). Keep the store and snapshot shape under `src/agents/codex/`. + The only generic addition is `SpanFactory.rehydrateSpan` (pure SDK behavior). +- **Never persist secrets.** The `apiKey` is stripped from the snapshot's + reporting config and re-resolved from env / the config event on resume (mirrors + the event recorder's redaction). +- **Never throw.** The store swallows and logs all I/O errors; a persistence or + restore failure must not break a turn — it just falls back to starting fresh. +- **Version-gate snapshots.** Each carries `pluginVersion` + a schema version; + a mismatch discards the snapshot. Bump `SNAPSHOT_SCHEMA_VERSION` in + `state-snapshot.ts` whenever the shape changes incompatibly. + +## Making changes + +- **Generic vs. agent code**: put agent-specific logic in `src/agents//`; + keep the server/processor/braintrust layers agent-agnostic. +- **Config**: two layers. The agent-specific layer (`src/agents/codex/settings.ts`) + reads the user's `config.json` from `PLUGIN_DATA`, maps its friendly camelCase + keys onto `BRAINTRUST_*` / `BRAINTRUST_EVENT_SERVER_*` env vars (env wins over + the file), and is run by the hook client before it boots the server — so the + spawned server inherits the resolved env. The generic layer (`src/config.ts`) + then reads **only** those env vars and never touches `config.json`. Keep it + that way: config-file parsing is deliberately agent-specific. User-facing + settings are documented in the [README](./README.md); update it (and the + `Settings` map in `settings.ts`) when adding one. +- **Hooks**: `hooks/hooks.json` lists the lifecycle events wired to the client. + Most are forwarded but not yet turned into spans (the processor no-ops the + ones it doesn't handle). +- **Build**: edit `scripts/build.ts` for targets/output; the launcher scripts + (`bin/codex-hook.sh`, `bin/codex-hook.cmd`) for download/exec behavior. + +## Commands + +Run from `plugins/trace-codex/`: + +- `bun test` — run the test suite (tests live next to sources as `*.test.ts`). +- `bun run typecheck` — `tsc --noEmit`. +- `bun run lint` / `bun run check` — Biome lint / lint+format. +- `bun run build` — compile all target binaries (`BUILD_HOST_ONLY=1` for host + only). +- `bun run dev` — run the server in watch mode. + +Add or update tests alongside any behavior change; keep typecheck and lint +clean. diff --git a/plugins/trace-codex/Makefile b/plugins/trace-codex/Makefile index 0f20746..ead81c7 100644 --- a/plugins/trace-codex/Makefile +++ b/plugins/trace-codex/Makefile @@ -9,6 +9,14 @@ # make build Compile the host binary (and per-platform binaries). # make integration Build, boot the server via the hook binary, health-check, # assert the version, then shut it down. +# make smoke End-to-end: run a real `codex exec` session with the +# installed plugin, traced to a local mock collector, and +# assert >=1 trace row. Requires codex, bun, and +# OPENAI_API_KEY; optionally SMOKE_VERSION to pin a release. +# make token-proxy Run the token-counting proxy (debugging aid) that sits +# between Codex and OpenAI and prints each call's token usage +# for comparison against the trace. Requires OPENAI_API_KEY. +# See scripts/token-proxy.ts for the config.toml snippet. # make clean Remove build artifacts. SHELL := /bin/bash @@ -24,7 +32,7 @@ EXPECTED_VERSION := $(shell sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^" BIN := bin/codex-hook BASE_URL := http://127.0.0.1:$(TEST_PORT) -.PHONY: test lint typecheck unit build integration clean +.PHONY: test lint typecheck unit build integration smoke token-proxy clean test: lint typecheck unit integration @@ -81,6 +89,20 @@ integration: build echo " -> server stopped cleanly"; \ echo "PASS: integration test" +# End-to-end smoke test. Assumes the plugin is installed in Codex (run +# `../../install.sh trace-codex` first for a local dev install). Set +# SMOKE_VERSION to pin the release whose binary the launcher downloads; leave it +# unset to use the installed plugin manifest version / local dev binary. +smoke: + @echo "==> Smoke test" + SMOKE_RELEASE_VERSION="$(SMOKE_VERSION)" sh scripts/smoke-test.sh + +# Token-counting proxy for verifying trace-codex's token accounting. Run it, then +# point Codex at it via a custom model_provider (see scripts/token-proxy.ts). +token-proxy: + @echo "==> Token proxy (Ctrl-C for the session total)" + bun run scripts/token-proxy.ts + clean: @echo "==> Cleaning" rm -rf bin diff --git a/plugins/trace-codex/README.md b/plugins/trace-codex/README.md index 6371b6e..b2d059c 100644 --- a/plugins/trace-codex/README.md +++ b/plugins/trace-codex/README.md @@ -12,6 +12,50 @@ codex plugin add trace-codex@braintrust-codex-plugins TRACE_TO_BRAINTRUST=true BRAINTRUST_PROJECT=my-coding-agent codex ``` +## Using the plugin in CI + +Tracing `codex exec` runs in CI works. A few notes: + +- **Block on flush.** Set `BRAINTRUST_FLUSH_ON_TURN_END=true` so the final spans are delivered before the job exits (a short-lived CI job can otherwise tear down before the background server's idle-drain flush fires). +- **Trust the hooks non-interactively.** Pass `codex exec --dangerously-bypass-hook-trust` to skip the one-time interactive hook-trust prompt. (Only for plugins you trust.) + +GitHub Actions example: + +```yaml +name: codex-traced +on: [workflow_dispatch] + +jobs: + run: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + + - name: Install Codex CLI + run: npm install -g @openai/codex # or your preferred install method + + - name: Install the trace-codex plugin (pinned to a release tag) + # TODO: replace with desired version ---------------------------------vvvvvvvvvvvvvvvvvv + run: | + codex plugin marketplace add braintrustdata/braintrust-codex-plugin@trace-codex-v0.0.X + codex plugin add trace-codex@braintrust-codex-plugins + + - name: Run a traced Codex session + env: + TRACE_TO_BRAINTRUST: "true" + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + BRAINTRUST_PROJECT: my-coding-agent # <--- TODO: replace with your project name + BRAINTRUST_FLUSH_ON_TURN_END: "true" + run: | + codex exec \ + --skip-git-repo-check \ + --dangerously-bypass-hook-trust \ + --sandbox read-only \ + "summarize the changes in this repo" +``` + +If your plugin release repo is private, also expose a token the launcher can use to download the binary (`GH_TOKEN` or `GITHUB_TOKEN`, or an authenticated `gh`). + ## Configuration There are two ways to configure the plugin. **Environment variables always win over the config file** @@ -30,8 +74,22 @@ Every setting can be provided as a `config.json` key or as an environment variab | `project` | `BRAINTRUST_PROJECT` | _(unset)_ | Project to log traces into. | | `apiUrl` | `BRAINTRUST_API_URL` | api.braintrust.dev | Braintrust API URL. | | `additionalMetadata` | `BRAINTRUST_ADDITIONAL_METADATA` | _(unset)_ | JSON object of extra metadata merged into the root span. Standard keys (`session_id`, `model`, `project`, etc.) take precedence on conflict. | +| `flushOnTurnEnd` | `BRAINTRUST_FLUSH_ON_TURN_END` | `false` | When `true`, the hook blocks at each turn's end (the `Stop` event) until the server confirms all spans are flushed. Use in programmatic/CI runs to guarantee traces are delivered before Codex exits. | | `recordFile` | `BRAINTRUST_EVENT_SERVER_RECORD_FILE` | _(unset)_ | If set, record every event to this NDJSON file (for `replay`). | +### Resuming sessions + +Note that when resuming a session, the original session's options will remain in effect. + +For example: + +```sh +TRACE_TO_BRAINTRUST=true codex # first session enables braintrust +TRACE_TO_BRAINTRUST=false codex resume abcde # the resumed session will still be traced because the original session was +``` + +The trace itself also survives the background server stopping. The server shuts down after the idle window (or on an explicit shutdown), but a session can outlive it — you might leave it open past the timeout, send another message after the server stopped, or `codex resume` later. The plugin snapshots each session's in-progress trace state under `$PLUGIN_DATA/state/` and restores it when the session continues, so later turns keep landing in the same trace instead of starting a new one. Stale snapshots age out automatically, and secrets (your API key) are never written to them. + ### Advanced Options Advanced plugin settings for debugging or developing the plugin diff --git a/plugins/trace-codex/bin/codex-hook.sh b/plugins/trace-codex/bin/codex-hook.sh index eaed11e..102c3d9 100755 --- a/plugins/trace-codex/bin/codex-hook.sh +++ b/plugins/trace-codex/bin/codex-hook.sh @@ -51,8 +51,17 @@ suffix="$os_name-$arch_name" # Read the plugin version (single source of truth) from the manifest. Only on # this slow path, so no dependency (jq) and no parsing on the hot path. +# +# TRACE_CODEX_RELEASE_VERSION overrides the manifest version. This is intended +# for testing (e.g. the smoke test pointing at an arbitrary published release); +# normal installs leave it unset and use the manifest. manifest="$ROOT/.codex-plugin/plugin.json" -version=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$manifest" 2>/dev/null | head -1) +version="${TRACE_CODEX_RELEASE_VERSION:-}" +if [ -z "$version" ]; then + version=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$manifest" 2>/dev/null | head -1) +fi +# Allow an optional leading 'v' on the override (v0.0.1 or 0.0.1). +version="${version#v}" if [ -z "$version" ]; then log "could not read plugin version from $manifest; tracing disabled this session" exit 0 diff --git a/plugins/trace-codex/config.json.example b/plugins/trace-codex/config.json.example index 8f05530..b9b0d93 100644 --- a/plugins/trace-codex/config.json.example +++ b/plugins/trace-codex/config.json.example @@ -6,6 +6,7 @@ "appUrl": "https://www.braintrust.dev", "project": "my-codex-project", "additionalMetadata": { "team": "platform", "env": "dev" }, + "flushOnTurnEnd": false, "recordFile": "", "port": 52734, "idleTimeoutMs": 300000, diff --git a/plugins/trace-codex/scripts/mock-collector.ts b/plugins/trace-codex/scripts/mock-collector.ts new file mode 100644 index 0000000..51c29c8 --- /dev/null +++ b/plugins/trace-codex/scripts/mock-collector.ts @@ -0,0 +1,116 @@ +// Mock Braintrust collector for the smoke test. +// +// The smoke test runs a real Codex session with the trace plugin enabled, but +// points the Braintrust SDK at this local server instead of the real backend. +// It implements just enough of the Braintrust HTTP API for a span flush to +// succeed, and counts the trace rows it receives so the smoke test can assert +// that tracing actually reported something. +// +// The Braintrust SDK (v3.x) makes these calls during initLogger + flush: +// POST {BRAINTRUST_APP_URL}/api/apikey/login -> org_info handshake +// POST {BRAINTRUST_APP_URL}/api/project/register -> resolves the project id +// GET {BRAINTRUST_API_URL}/version -> payload-limit probe (tolerated) +// POST {BRAINTRUST_API_URL}/logs3 -> the actual span data +// Pointing both BRAINTRUST_APP_URL and BRAINTRUST_API_URL at this one origin +// captures the whole flow. +// +// On shutdown (SIGTERM/SIGINT) it writes a JSON summary to MOCK_COLLECTOR_OUT: +// { "logs3Requests": , "totalRows": } +// so the smoke test can read the result after the Codex session ends. + +import { writeFileSync } from "node:fs"; + +const PORT = Number(process.env.MOCK_COLLECTOR_PORT || "53999"); +const OUT = process.env.MOCK_COLLECTOR_OUT || ""; + +let logs3Requests = 0; +let totalRows = 0; + +function log(message: string): void { + process.stderr.write(`mock-collector: ${message}\n`); +} + +function json(body: unknown, status = 200): Response { + return new Response(JSON.stringify(body), { + status, + headers: { "content-type": "application/json" }, + }); +} + +/** Count the trace rows in a /logs3 body: { rows: [...], api_version: 2 }. */ +function countRows(body: unknown): number { + if ( + typeof body === "object" && + body !== null && + Array.isArray((body as { rows?: unknown[] }).rows) + ) { + return (body as { rows: unknown[] }).rows.length; + } + return 0; +} + +const server = Bun.serve({ + port: PORT, + hostname: "127.0.0.1", + async fetch(req) { + const { pathname } = new URL(req.url); + + // Login handshake. api_url/proxy_url are echoed back, but the SDK uses + // BRAINTRUST_API_URL (set by the smoke test) for the data plane regardless. + if (req.method === "POST" && pathname === "/api/apikey/login") { + const self = `http://127.0.0.1:${PORT}`; + return json({ + org_info: [{ id: "smoke-org", name: "smoke", api_url: self, proxy_url: self }], + }); + } + + // Project registration -> returns a project id the rows reference. + if (req.method === "POST" && pathname === "/api/project/register") { + return json({ project: { id: "00000000-0000-0000-0000-000000000000", name: "smoke" } }); + } + + // Payload-limit probe. Tolerated by the SDK; return no limit. + if (req.method === "GET" && pathname === "/version") { + return json({ logs3_payload_max_bytes: null }); + } + + // The actual span data. Count the rows and acknowledge. + if (req.method === "POST" && (pathname === "/logs3" || pathname === "/logs3/overflow")) { + let rows = 0; + try { + rows = countRows(await req.json()); + } catch (err) { + log(`failed to parse ${pathname} body: ${String(err)}`); + } + logs3Requests += 1; + totalRows += rows; + log(`received ${pathname}: ${rows} row(s) (total ${totalRows})`); + return json({}); + } + + // Anything else: 200 with an empty object so the SDK never hard-fails. + log(`unhandled ${req.method} ${pathname}`); + return json({}); + }, +}); + +log(`listening on http://127.0.0.1:${server.port}`); + +function writeSummaryAndExit(): void { + const summary = { logs3Requests, totalRows }; + if (OUT) { + try { + writeFileSync(OUT, JSON.stringify(summary)); + log(`wrote summary to ${OUT}: ${JSON.stringify(summary)}`); + } catch (err) { + log(`failed to write summary to ${OUT}: ${String(err)}`); + } + } else { + log(`summary: ${JSON.stringify(summary)}`); + } + server.stop(true); + process.exit(0); +} + +process.on("SIGTERM", writeSummaryAndExit); +process.on("SIGINT", writeSummaryAndExit); diff --git a/plugins/trace-codex/scripts/smoke-test.sh b/plugins/trace-codex/scripts/smoke-test.sh new file mode 100755 index 0000000..d91d646 --- /dev/null +++ b/plugins/trace-codex/scripts/smoke-test.sh @@ -0,0 +1,176 @@ +#!/bin/sh +# End-to-end smoke test: run a real Codex session with the trace plugin enabled, +# pointed at a local mock Braintrust collector, and assert at least one trace row +# was reported. +# +# Usage: +# scripts/smoke-test.sh [VERSION] +# +# VERSION (optional; or the SMOKE_RELEASE_VERSION env var) pins the release whose +# codex-hook binary the plugin launcher downloads, e.g. "0.0.1" or "v0.0.1". When +# unset, the launcher uses the installed plugin manifest version (the normal +# behavior). CI passes the version it just released; locally you can point this +# at any published release. +# +# Prerequisites (the caller is responsible for these): +# - `codex` is on PATH and authenticated for `codex exec` (we pass +# CODEX_API_KEY from OPENAI_API_KEY below). +# - The trace-codex plugin is installed and enabled in Codex (the CI workflow +# installs it from the published release; locally use ./install.sh). +# - Bun is on PATH (used to run the mock collector). +# - OPENAI_API_KEY is set. +# - For a private release repo, GH_TOKEN or GITHUB_TOKEN (or an authenticated +# `gh`) so the launcher can download the binary. +# +# Exit 0 on success (>=1 trace row received), non-zero otherwise. + +set -eu + +# Resolve the plugin dir from this script's location so it works from anywhere. +SCRIPT_DIR=$(CDPATH= cd "$(dirname "$0")" && pwd) +PLUGIN_DIR=$(dirname "$SCRIPT_DIR") + +# Target release version: positional arg wins, then env, else empty (= manifest). +RELEASE_VERSION="${1:-${SMOKE_RELEASE_VERSION:-}}" + +PORT="${MOCK_COLLECTOR_PORT:-53999}" +BASE_URL="http://127.0.0.1:$PORT" + +log() { printf 'smoke: %s\n' "$1" >&2; } +fail() { printf 'smoke: FAIL: %s\n' "$1" >&2; exit 1; } + +# Redact the API key from any text we print (defense-in-depth for the public +# repo: GitHub already masks the secret, but we never want to echo subprocess +# output that might contain it, transformed or otherwise). Reads stdin, writes +# scrubbed text to stderr. A no-op if the key is empty. +scrub() { + if [ -n "${OPENAI_API_KEY:-}" ]; then + sed "s/${OPENAI_API_KEY}/[REDACTED]/g" 2>/dev/null || cat + else + cat + fi +} + +# --- Preconditions --------------------------------------------------------- + +[ -n "${OPENAI_API_KEY:-}" ] || fail "OPENAI_API_KEY is not set" +command -v codex >/dev/null 2>&1 || fail "codex is not on PATH" +command -v bun >/dev/null 2>&1 || fail "bun is not on PATH" + +# Belt-and-suspenders: explicitly mask the key in GitHub Actions logs. This +# covers any reformatted appearance beyond GitHub's automatic secret masking. +# Harmless (and silent) outside Actions. +if [ -n "${GITHUB_ACTIONS:-}" ]; then + printf '::add-mask::%s\n' "$OPENAI_API_KEY" +fi + +WORK=$(mktemp -d) +SUMMARY="$WORK/summary.json" +COLLECTOR_LOG="$WORK/collector.log" +CODEX_LOG="$WORK/codex.log" + +COLLECTOR_PID="" +cleanup() { + # Ask the collector to flush its summary and stop; fall back to KILL. + if [ -n "$COLLECTOR_PID" ] && kill -0 "$COLLECTOR_PID" 2>/dev/null; then + kill -TERM "$COLLECTOR_PID" 2>/dev/null || true + # Give it a moment to write the summary. + for _ in 1 2 3 4 5 6 7 8 9 10; do + kill -0 "$COLLECTOR_PID" 2>/dev/null || break + sleep 0.2 + done + kill -KILL "$COLLECTOR_PID" 2>/dev/null || true + fi +} +trap cleanup EXIT INT TERM + +# --- Start the mock collector ---------------------------------------------- + +log "starting mock collector on $BASE_URL" +MOCK_COLLECTOR_PORT="$PORT" MOCK_COLLECTOR_OUT="$SUMMARY" \ + bun run "$PLUGIN_DIR/scripts/mock-collector.ts" >"$COLLECTOR_LOG" 2>&1 & +COLLECTOR_PID=$! + +# Wait for it to accept connections. +ready=0 +for _ in $(seq 1 50); do + if curl -fsS -o /dev/null --max-time 1 "$BASE_URL/version" 2>/dev/null; then + ready=1 + break + fi + kill -0 "$COLLECTOR_PID" 2>/dev/null || break + sleep 0.2 +done +if [ "$ready" != "1" ]; then + scrub <"$COLLECTOR_LOG" >&2 || true + fail "mock collector never became ready on $BASE_URL" +fi +log "collector ready" + +# --- Run a real Codex session with tracing pointed at the collector -------- + +# These BRAINTRUST_* vars are read by the hook client (inherited from this +# process via Codex) and bundled into the session's reporting config. Pointing +# both URLs at the collector captures the login handshake and the data POST. +export TRACE_TO_BRAINTRUST=true +export BRAINTRUST_API_KEY=smoke-key +export BRAINTRUST_API_URL="$BASE_URL" +export BRAINTRUST_APP_URL="$BASE_URL" +export BRAINTRUST_PROJECT=trace-codex-smoke +# Keep the background event server short-lived so it doesn't linger after CI. +export BRAINTRUST_EVENT_SERVER_IDLE_TIMEOUT_MS=15000 +export BRAINTRUST_EVENT_SERVER_IDLE_CHECK_INTERVAL_MS=1000 +# Block on the terminal-event flush so the final spans are delivered before +# `codex exec` returns. Without this the hook just enqueues and relies on the +# server's idle-drain flush, which may not fire before this short-lived run tears +# the process tree down — making the assertion below flaky. +export BRAINTRUST_FLUSH_ON_TURN_END=true + +# Pin the release the launcher downloads, when requested. The launcher strips an +# optional leading 'v', so either form is fine here. +if [ -n "$RELEASE_VERSION" ]; then + export TRACE_CODEX_RELEASE_VERSION="$RELEASE_VERSION" + log "pinning plugin binary to release $RELEASE_VERSION" +else + log "using installed plugin manifest version (no pin)" +fi + +log "running codex exec (say hi)" +# CODEX_API_KEY is supported only by `codex exec` and is set inline for just +# this invocation. --dangerously-bypass-hook-trust runs the plugin's hooks +# without the interactive /hooks trust prompt (one-off automation escape hatch). +set +e +CODEX_API_KEY="$OPENAI_API_KEY" codex exec \ + --skip-git-repo-check \ + --dangerously-bypass-hook-trust \ + --sandbox read-only \ + "say hi" >"$CODEX_LOG" 2>&1 +codex_status=$? +set -e +log "codex exec exited with status $codex_status" +if [ "$codex_status" -ne 0 ]; then + log "--- codex output (key redacted) ---" + scrub <"$CODEX_LOG" >&2 || true +fi + +# With BRAINTRUST_FLUSH_ON_TURN_END=true (set above), the hook client blocks +# on a synchronous flush at session end, so by the time codex exec returns the +# trace should already be delivered. Stop the collector (via cleanup) to make it +# write its summary, then assert. +cleanup +COLLECTOR_PID="" # already stopped; avoid double-kill in the EXIT trap + +log "--- collector log (key redacted) ---" +scrub <"$COLLECTOR_LOG" >&2 || true + +[ -f "$SUMMARY" ] || fail "collector wrote no summary file" +total_rows=$(sed -n 's/.*"totalRows"[[:space:]]*:[[:space:]]*\([0-9]*\).*/\1/p' "$SUMMARY") +[ -n "$total_rows" ] || fail "could not parse totalRows from summary: $(cat "$SUMMARY")" +log "summary: $(cat "$SUMMARY")" + +if [ "$total_rows" -ge 1 ]; then + log "PASS: received $total_rows trace row(s)" + exit 0 +fi + +fail "expected >=1 trace row, got $total_rows" diff --git a/plugins/trace-codex/scripts/token-proxy.ts b/plugins/trace-codex/scripts/token-proxy.ts new file mode 100644 index 0000000..3344402 --- /dev/null +++ b/plugins/trace-codex/scripts/token-proxy.ts @@ -0,0 +1,372 @@ +// Token-counting proxy for verifying trace-codex's token accounting. +// +// Codex has no per-session token readout we can compare against, so this is a +// standalone debugging tool: a local HTTP proxy that sits between Codex and the +// real OpenAI API. It forwards every request through unchanged and tees the +// streaming response, parsing each model call's `usage` straight off the wire. +// You then compare these ground-truth numbers against what trace-codex records +// in Braintrust. +// +// It is NOT part of either plugin's runtime — purely a manual verification aid. +// +// ── Usage ────────────────────────────────────────────────────────────────── +// 1. Run the proxy: +// bun run scripts/token-proxy.ts +// It listens on http://127.0.0.1:53800 and, by default, forwards to the +// ChatGPT backend (https://chatgpt.com) — for ChatGPT-login Codex, the +// common case. It logs every request it receives. +// +// 2. Point Codex at it via a custom provider in ~/.codex/config.toml. These +// keys are USER-LEVEL only and MUST go ABOVE any [table] headers (in TOML, +// keys after a `[table]` header belong to that table), e.g. at the very top +// of the file: +// +// For ChatGPT login (auth_mode=chatgpt — check ~/.codex/auth.json): +// model_provider = "token-proxy" +// [model_providers.token-proxy] +// name = "token-proxy" +// # base_url must contain "/backend-api" so Codex uses ChatGPT path style; +// # the proxy forwards this exact path to https://chatgpt.com. +// base_url = "http://127.0.0.1:53800/backend-api/codex" +// requires_openai_auth = true # relay the ChatGPT token + account headers +// wire_api = "responses" +// +// For API-key login instead: run the proxy with +// TOKEN_PROXY_UPSTREAM=https://api.openai.com bun run scripts/token-proxy.ts +// and use: +// model_provider = "token-proxy" +// [model_providers.token-proxy] +// name = "token-proxy" +// base_url = "http://127.0.0.1:53800/v1" +// env_key = "OPENAI_API_KEY" +// wire_api = "responses" +// +// 3. Run a Codex session as usual. Each model call prints a usage line, and a +// running session total is kept. A machine-readable NDJSON log is written to +// TOKEN_PROXY_OUT (default: token-proxy.ndjson) — one line per call plus a +// final totals line — so you can diff against the trace. +// +// ── How usage is read ──────────────────────────────────────────────────────── +// Codex uses the Responses API with SSE streaming. The terminal +// `response.completed` event carries `response.usage`: +// { input_tokens, output_tokens, total_tokens, +// input_tokens_details: { cached_tokens }, +// output_tokens_details: { reasoning_tokens } } +// We pass the stream through byte-for-byte to Codex while parsing a copy, so the +// proxy is transparent. Non-streaming JSON responses are handled too (we read +// `usage` from the parsed body) for completeness. + +import { appendFileSync, writeFileSync } from "node:fs"; + +const PORT = Number(process.env.TOKEN_PROXY_PORT || "53800"); +// Upstream ORIGIN the proxy forwards to (scheme://host, no path). The proxy is a +// transparent tunnel: it forwards the exact request path Codex builds onto this +// origin. Default is the ChatGPT host Codex uses when logged in via ChatGPT +// (auth_mode=chatgpt) — the common case. For an API-key login, set +// TOKEN_PROXY_UPSTREAM=https://api.openai.com (and use the api-key provider). +const UPSTREAM = ( + process.env.TOKEN_PROXY_UPSTREAM || + process.env.OPENAI_BASE_URL || + "https://chatgpt.com" +).replace(/\/+$/, ""); +const OUT = process.env.TOKEN_PROXY_OUT || "/tmp/token-proxy.ndjson"; +// When set, dump the raw upstream SSE event blocks to TOKEN_PROXY_DEBUG_OUT so we +// can inspect the exact event shape (the ChatGPT backend may differ from the +// public Responses API). +const DEBUG = process.env.TOKEN_PROXY_DEBUG === "1"; +const DEBUG_OUT = process.env.TOKEN_PROXY_DEBUG_OUT || "token-proxy-raw.log"; + +/** Per-call usage, normalized to the keys we care about. */ +interface Usage { + input_tokens: number; + cached_input_tokens: number; + output_tokens: number; + reasoning_output_tokens: number; + total_tokens: number; +} + +const ZERO_USAGE: Usage = { + input_tokens: 0, + cached_input_tokens: 0, + output_tokens: 0, + reasoning_output_tokens: 0, + total_tokens: 0, +}; + +// Running totals across the proxy's lifetime (one session, typically). +let callCount = 0; +const sessionTotal: Usage = { ...ZERO_USAGE }; + +function log(message: string): void { + process.stderr.write(`token-proxy: ${message}\n`); +} + +// Truncate the NDJSON log on startup so each run is a fresh capture. +try { + writeFileSync(OUT, ""); +} catch (err) { + log(`could not open ${OUT}: ${String(err)} (file logging disabled)`); +} + +if (DEBUG) { + try { + writeFileSync(DEBUG_OUT, ""); + log(`DEBUG on: dumping raw upstream SSE blocks to ${DEBUG_OUT}`); + } catch { + // ignore + } +} + +function record(line: unknown): void { + try { + appendFileSync(OUT, `${JSON.stringify(line)}\n`); + } catch { + // best-effort; the stderr line is the primary output + } +} + +function num(v: unknown): number { + return typeof v === "number" && Number.isFinite(v) ? v : 0; +} + +/** + * Pull usage out of a Responses API `response` object (the shape under both the + * `response.completed` SSE event and a non-streaming JSON body). Returns null if + * there is no usage block. + */ +function usageFromResponse(response: unknown): Usage | null { + if (response === null || typeof response !== "object") return null; + const usage = (response as { usage?: unknown }).usage; + if (usage === null || typeof usage !== "object") return null; + const u = usage as Record; + const inputDetails = (u.input_tokens_details ?? {}) as Record; + const outputDetails = (u.output_tokens_details ?? {}) as Record; + return { + input_tokens: num(u.input_tokens), + cached_input_tokens: num(inputDetails.cached_tokens), + output_tokens: num(u.output_tokens), + reasoning_output_tokens: num(outputDetails.reasoning_tokens), + total_tokens: num(u.total_tokens), + }; +} + +/** Record a call's usage: bump totals, print a line, append to the NDJSON log. */ +function reportUsage(usage: Usage, model: string | undefined): void { + callCount += 1; + sessionTotal.input_tokens += usage.input_tokens; + sessionTotal.cached_input_tokens += usage.cached_input_tokens; + sessionTotal.output_tokens += usage.output_tokens; + sessionTotal.reasoning_output_tokens += usage.reasoning_output_tokens; + sessionTotal.total_tokens += usage.total_tokens; + + log( + `call #${callCount}${model ? ` (${model})` : ""}: ` + + `input=${usage.input_tokens} (cached=${usage.cached_input_tokens}) ` + + `output=${usage.output_tokens} (reasoning=${usage.reasoning_output_tokens}) ` + + `total=${usage.total_tokens} || session total=${sessionTotal.total_tokens}`, + ); + record({ kind: "call", call: callCount, model, usage, sessionTotal: { ...sessionTotal } }); +} + +/** + * A TransformStream that passes bytes through untouched while scanning the SSE + * text for the terminal usage. Codex receives the exact upstream bytes; we just + * watch a copy. We look for `response.completed` (and tolerate `response.usage` + * appearing on other terminal events) and parse the embedded JSON object's + * usage. The model id is pulled from the same event when present. + */ +function makeUsageSniffer(): TransformStream { + const decoder = new TextDecoder(); + let buffer = ""; + let reported = false; + + const tryParseEvent = (block: string): void => { + if (block.trim().length === 0) return; + if (DEBUG) { + try { + appendFileSync(DEBUG_OUT, `${block}\n---\n`); + } catch { + // ignore + } + } + // An SSE event block is lines like "event: ...\ndata: {json}". Concatenate + // all data: lines (the Responses API sends single-line JSON, but be safe). + const dataLines: string[] = []; + for (const line of block.split("\n")) { + const trimmed = line.replace(/\r$/, ""); + if (trimmed.startsWith("data:")) dataLines.push(trimmed.slice(5).trimStart()); + } + if (dataLines.length === 0) return; + const data = dataLines.join(""); + if (data === "[DONE]" || data.length === 0) return; + let parsed: unknown; + try { + parsed = JSON.parse(data); + } catch { + return; // not JSON we can read; ignore + } + if (parsed === null || typeof parsed !== "object") return; + const obj = parsed as Record; + // Find a usage block wherever it lives: nested under `response` (the + // completed/incomplete events) or at the top level. Report each one we see — + // a single model call's stream carries usage once, so the last one wins if + // there are several. (Don't short-circuit on the first: some backends emit a + // partial usage earlier in the stream.) + const response = (obj.response ?? obj) as Record; + const usage = usageFromResponse(response) ?? usageFromResponse(obj); + if (usage === null) return; + const model = + typeof response.model === "string" + ? response.model + : typeof obj.model === "string" + ? obj.model + : undefined; + reportUsage(usage, model); + reported = true; + }; + + // Whole-body accumulator for the non-SSE JSON fallback, capped so a huge + // response can't balloon memory (usage lives in the small completed event / + // top-level object, so a few MB is plenty). + let fullBody = ""; + const FULL_BODY_CAP = 4_000_000; + + // Last-resort: try to parse the entire body as one JSON object and pull usage + // from it (covers non-streaming JSON responses). + const tryParseWholeBody = (): void => { + if (reported || fullBody.length === 0) return; + let parsed: unknown; + try { + parsed = JSON.parse(fullBody); + } catch { + return; + } + if (parsed === null || typeof parsed !== "object") return; + const obj = parsed as Record; + const response = (obj.response ?? obj) as Record; + const usage = usageFromResponse(response) ?? usageFromResponse(obj); + if (usage === null) return; + const model = + typeof response.model === "string" + ? response.model + : typeof obj.model === "string" + ? obj.model + : undefined; + reportUsage(usage, model); + reported = true; + }; + + return new TransformStream({ + transform(chunk, controller) { + controller.enqueue(chunk); // pass through immediately, unmodified + const text = decoder.decode(chunk, { stream: true }); + buffer += text; + if (fullBody.length < FULL_BODY_CAP) fullBody += text; + // SSE events are separated by a blank line. Process complete blocks. + let sep = buffer.indexOf("\n\n"); + while (sep !== -1) { + const block = buffer.slice(0, sep); + buffer = buffer.slice(sep + 2); + tryParseEvent(block); + sep = buffer.indexOf("\n\n"); + } + }, + flush() { + if (buffer.length > 0) tryParseEvent(buffer); + // SSE parsing found nothing — try the whole body as a single JSON object. + tryParseWholeBody(); + if (!reported) { + log(`call completed with no usage found in response (path may be non-model)`); + } + }, + }); +} + +/** Build the upstream URL: UPSTREAM + the part of the path after our `/v1`. */ +function upstreamUrl(reqUrl: string): string { + // Transparent tunnel: forward the exact path + query Codex built onto the + // upstream origin. (The provider's base_url is configured so Codex builds the + // correct downstream path; the proxy never rewrites it.) + const { pathname, search } = new URL(reqUrl); + return `${UPSTREAM}${pathname}${search}`; +} + +const server = Bun.serve({ + port: PORT, + hostname: "127.0.0.1", + // Generous: model calls can stream for minutes. + idleTimeout: 0, + async fetch(req) { + const target = upstreamUrl(req.url); + const path = new URL(req.url).pathname; + // Log every request so it's obvious when Codex is (or isn't) hitting us. + log(`${req.method} ${path} -> ${target}`); + + // Forward the request verbatim (method, headers, body). Codex already set + // the Authorization header (ChatGPT token or API key) and the account/session + // headers, so we just relay them. Drop `host` so fetch derives the correct + // upstream host (otherwise it carries our 127.0.0.1:PORT and breaks routing). + const headers = new Headers(req.headers); + headers.delete("host"); + let upstream: Response; + try { + upstream = await fetch(target, { + method: req.method, + headers, + body: req.body, + // `duplex: "half"` is required to stream a request body; not in the DOM + // RequestInit type, so widen via a cast. + ...({ duplex: "half" } as Record), + redirect: "manual", + }); + } catch (err) { + log(`upstream fetch failed for ${req.method} ${path}: ${String(err)}`); + return new Response(JSON.stringify({ error: "proxy_upstream_failed" }), { + status: 502, + headers: { "content-type": "application/json" }, + }); + } + + const contentType = upstream.headers.get("content-type") ?? ""; + log(` <- ${upstream.status} ${contentType || "(no content-type)"}`); + + // Always tee the body through the usage sniffer. We do NOT gate on + // content-type: the ChatGPT backend streams SSE with no (or an unexpected) + // content-type header, so matching "text/event-stream" misses it. The sniffer + // passes bytes through unchanged and looks for usage in either SSE `data:` + // events or a whole-body JSON object, so it's safe for any response. + if (upstream.body) { + const teed = upstream.body.pipeThrough(makeUsageSniffer()); + return new Response(teed, { + status: upstream.status, + statusText: upstream.statusText, + headers: upstream.headers, + }); + } + + // No body (e.g. a HEAD or 204): pass straight through. + return new Response(upstream.body, { + status: upstream.status, + statusText: upstream.statusText, + headers: upstream.headers, + }); + }, +}); + +log(`listening on http://127.0.0.1:${server.port} -> ${UPSTREAM}`); +log(`writing per-call usage to ${OUT}`); + +function summarize(): void { + log( + `SESSION TOTAL over ${callCount} call(s): ` + + `input=${sessionTotal.input_tokens} (cached=${sessionTotal.cached_input_tokens}) ` + + `output=${sessionTotal.output_tokens} (reasoning=${sessionTotal.reasoning_output_tokens}) ` + + `total=${sessionTotal.total_tokens}`, + ); + record({ kind: "session_total", calls: callCount, sessionTotal: { ...sessionTotal } }); + server.stop(true); + process.exit(0); +} + +process.on("SIGTERM", summarize); +process.on("SIGINT", summarize); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index ab63e57..7f5da2f 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -1,175 +1,1392 @@ import { describe, expect, test } from "bun:test"; import type { ReportingConfig, SpanFactory } from "../../braintrust/logger.ts"; import type { EnqueueEvent } from "../../server/routes.ts"; -import { createTestLogger, withCapturedTrace } from "../../test-helpers.ts"; +import { createTestLogger, spansToTree, withCapturedTrace } from "../../test-helpers.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; import { assertProducesTrace, + assistantMessage, + compacted, configEvent, + customToolCall, + customToolCallOutput, + FakeTranscriptReader, + functionCall, + functionCallOutput, + MultiFileTranscriptReader, + postCompact, + postToolUse, + preCompact, + preToolUse, + reasoning, + sessionMeta, sessionStart, stop, - userPromptSubmit, + subagentStart, + subagentStop, + TEST_TRANSCRIPT_PATH, + taskComplete, + taskStarted, + tokenCount, + transcript, + turnContext, + userMessage, + userMessageItem, } from "./test-helpers.ts"; +import type { + TranscriptReader, + TranscriptReadResult, + TranscriptWaitResult, + WaitForOptions, +} from "./transcript-reader.ts"; -describe("CodexEventProcessor", () => { - test("SessionStart produces a root span", async () => { +// Helpers below build a session as an ordered list of transcript writes and +// hooks. The final `stop()` hook triggers the catch-up that turns the buffered +// transcript records into spans (waiting for the task_complete sentinel). + +describe("CodexEventProcessor: root span", () => { + test("session_meta opens a root span named after the cwd basename", async () => { + await assertProducesTrace( + [ + sessionStart({ source: "startup" }), + sessionMeta({ cwd: "/whatever/myapp", id: "session-1" }), + turnContext({ model: "gpt-5.5" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: myapp", type: "task" }, + metadata: { + session_id: "session-1", + model: "gpt-5.5", + source: "startup", + transcript_path: TEST_TRANSCRIPT_PATH, + }, + children: [], + }, + ); + }); + + test("root span name handles a trailing slash in cwd", async () => { + await assertProducesTrace( + [sessionStart(), sessionMeta({ cwd: "/whatever/myapp/" }), stop({ turn_id: "t1" })], + { span_attributes: { name: "codex: myapp", type: "task" }, children: [] }, + ); + }); + + test("root span falls back to 'codex session' when cwd is missing", async () => { + await assertProducesTrace([sessionStart(), sessionMeta({}), stop({ turn_id: "t1" })], { + span_attributes: { name: "codex session", type: "task" }, + children: [], + }); + }); + + test("model is backfilled from turn_context", async () => { + await assertProducesTrace( + [sessionStart(), sessionMeta({ cwd: "/work" }), turnContext({ model: "gpt-5.5" }), stop({})], + { + span_attributes: { name: "codex: work", type: "task" }, + metadata: { model: "gpt-5.5" }, + children: [], + }, + ); + }); + + test("root carries source and permission_mode from the SessionStart hook", async () => { + await assertProducesTrace( + [ + sessionStart({ source: "resume", permission_mode: "acceptEdits" }), + sessionMeta({ cwd: "/work" }), + stop({}), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + metadata: { source: "resume", permission_mode: "acceptEdits" }, + children: [], + }, + ); + }); + + test("duplicate session_meta still yields a single root span", async () => { + await assertProducesTrace( + [sessionStart(), sessionMeta({ cwd: "/work" }), sessionMeta({ cwd: "/work" }), stop({})], + { span_attributes: { name: "codex: work", type: "task" }, children: [] }, + ); + }); + + test("Stop ends the root span", async () => { + await assertProducesTrace([sessionStart(), sessionMeta({ cwd: "/work" }), stop({})], { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [], + }); + }); +}); + +describe("CodexEventProcessor: turn spans", () => { + test("task_started/task_complete become a turn span with prompt input and assistant output", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "what's your name?" }), + taskComplete({ turn_id: "t1", last_agent_message: "I'm Codex." }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + input: "what's your name?", + output: "I'm Codex.", + metadata: { turn_id: "t1" }, + ended: true, + children: [], + }, + ], + }, + ); + }); + + test("two turns become two ordered child spans", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "one" }), + taskComplete({ turn_id: "t1", last_agent_message: "first" }), + taskStarted({ turn_id: "t2" }), + userMessage({ message: "two" }), + taskComplete({ turn_id: "t2", last_agent_message: "second" }), + stop({ turn_id: "t2" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true }, + { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true }, + ], + }, + ); + }); + + test("a turn whose task_complete lands after its Stop is closed by the final flush catch-up", async () => { + // Regression: the last turn's task_complete can be written just after its + // Stop hook fires, so the Stop's bounded wait misses it. The final flush + // read must still pick it up and close the turn. Here task_complete is + // ordered AFTER stop() in the list (so it is only present at flush time). + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "one" }), + taskComplete({ turn_id: "t1", last_agent_message: "first" }), + stop({ turn_id: "t1" }), + taskStarted({ turn_id: "t2" }), + userMessage({ message: "two" }), + // Tool call/output present at Stop time, but task_complete is not yet. + functionCall({ turn_id: "t2", name: "exec_command", call_id: "c2" }), + functionCallOutput({ call_id: "c2", output: "ok" }), + stop({ turn_id: "t2" }), + // task_complete written after the Stop — only seen by the flush catch-up. + taskComplete({ turn_id: "t2", last_agent_message: "second" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true }, + { + span_attributes: { name: "turn: t2", type: "task" }, + output: "second", + ended: true, + children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { span_attributes: { name: "exec_command", type: "tool" }, ended: true }, + ], + }, + ], + }, + ); + }); + + test("each turn closes when its own Stop arrives (per-turn Stops)", async () => { + // Mirrors real Codex: Stop fires per turn. Turn 1 is processed/closed on its + // own Stop; turn 2 opens after and closes on the second Stop. Regression for + // the bug where the second turn never closed. + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "one" }), + taskComplete({ turn_id: "t1", last_agent_message: "first" }), + stop({ turn_id: "t1" }), + taskStarted({ turn_id: "t2" }), + userMessage({ message: "two" }), + taskComplete({ turn_id: "t2", last_agent_message: "second" }), + stop({ turn_id: "t2" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true }, + { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true }, + ], + }, + ); + }); + + test("a turn still open when the root ends is closed by the root-end backstop", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "hi" }), + // No task_complete; the Stop hook ends the root and closes the turn. + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, input: "hi", ended: true }, + ], + }, + ); + }); +}); + +describe("CodexEventProcessor: span timing (from transcript timestamps)", () => { + // :10Z=1767225610, :11=...611, :12=...612, :13=...613, :14=...614, :15=...615. + test("an llm span starts at its turn's start (first child), not its output record", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // LLM call: the assistant message lands at (:12) — but the model's work + // for this call began at the turn's start (:11), since this is the first + // child. It requests a tool (:13) — that's its last output, so the llm + // span ends there. The tool result arrives (:14) and the token_count + // (:15) supplies metrics only (it does NOT define the llm span's end). + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:12Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "go" }], + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:13Z", + payload: { + type: "function_call", + name: "exec_command", + call_id: "c1", + metadata: { turn_id: "t1" }, + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:14Z", + payload: { type: "function_call_output", call_id: "c1", output: "ok" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:15Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:16Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + metrics: { start: 1767225610, end: 1767225616 }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + metrics: { start: 1767225611, end: 1767225616 }, + ended: true, + children: [ + { + // LLM call: starts at the turn's start (:11, first child), not + // the message record (:12); ENDS when it emits the tool call (:13), + // its last output — NOT at token_count (:15). This is the fix for + // the llm span swallowing the tool's execution. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225613 }, + ended: true, + }, + { + // Tool span uses its own record timestamps: function_call (:13) + // to function_call_output (:14). Contiguous with the llm (no + // overlap): llm ends at :13, tool starts at :13. + span_attributes: { name: "exec_command", type: "tool" }, + metrics: { start: 1767225613, end: 1767225614 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // When an llm call is the only child of its turn, its span should span almost + // the whole turn (start at turn start, end at token_count just before + // task_complete) — not appear near-instant. + test("a lone llm span fills its turn", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // The model "thinks" for several seconds; its output only lands at :15. + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:15Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "answer" }], + }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:15Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:16Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "answer" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + metrics: { start: 1767225611, end: 1767225616 }, + children: [ + { + // Starts at turn start (:11), not the output record (:15). + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225615 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // Regression for the "front-of-turn gap" bug: a model call whose ONLY output is + // a tool call (no assistant message or readable reasoning before it) must still + // open an llm span, so the model's work (often many seconds of thinking before + // it emits the tool call) is covered rather than leaving an unaccounted gap + // between the turn start and the first child. + test("a tool-only model call still opens an llm span filling the front of the turn", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // The model "thinks" for ~20s, then its first (and only) output for this + // call is a tool call at :30 — no preceding message or reasoning. + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:30Z", + payload: { + type: "function_call", + name: "exec_command", + call_id: "c1", + metadata: { turn_id: "t1" }, + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:32Z", + payload: { type: "function_call_output", call_id: "c1", output: "ok" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:32Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:33Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + metrics: { start: 1767225611, end: 1767225633 }, + children: [ + { + // The llm span covers the model's work from the turn start (:11) + // until it emits the tool call (:30, its only output) — filling + // the ~20s "thinking" gap. It does NOT extend to the token_count + // (:32); that would overlap the tool, which is impossible. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225630 }, + ended: true, + }, + { + // The tool runs :30 -> :32, contiguous with (not inside) the llm. + span_attributes: { name: "exec_command", type: "tool" }, + metrics: { start: 1767225630, end: 1767225632 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // Two model calls separated by a tool: the second call must start where the + // tool ended (the model's most recent sibling), so the spans tile the turn + // contiguously instead of each looking instant. + test("a later llm span starts where the prior sibling (a tool) ended", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // Call 1: opens at turn start (:11) on its first output item (an + // assistant message at :11), then emits a tool call (:12, its last + // output → llm-1 ends there). The token_count (:13) is metrics-only. + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:11Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "let me check" }], + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:12Z", + payload: { + type: "function_call", + name: "exec_command", + call_id: "c1", + metadata: { turn_id: "t1" }, + }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:13Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + // Tool runs :12 -> :14 (output lands after call 1's token_count). + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:14Z", + payload: { type: "function_call_output", call_id: "c1", output: "ok" }, + }), + // Call 2: the model resumes after the tool result. Its output lands at + // :18, but it began working when the tool returned (:14). + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:18Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "done" }], + }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:18Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:19Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + children: [ + { + // Call 1: turn start (:11) -> its last output, the tool call + // (:12). Contiguous with the tool; does NOT extend to :13/:14. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225612 }, + ended: true, + }, + { + // Tool: :12 -> :14. + span_attributes: { name: "exec_command", type: "tool" }, + metrics: { start: 1767225612, end: 1767225614 }, + ended: true, + }, + { + // Call 2: starts at the tool's end (:14), the most recent sibling + // end — NOT its own output record (:18). Ends at its last output, + // the message (:18). The token_count (:18) is metrics-only. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225614, end: 1767225618 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); +}); + +describe("CodexEventProcessor: tool spans", () => { + test("a function_call/output pair becomes a tool span under its turn", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "run it" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "call_1", + arguments: '{"cmd":"ls"}', + }), + functionCallOutput({ call_id: "call_1", output: "file.txt" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + output: "done", + ended: true, + children: [ + // The model call that emitted the tool call (its only output) opens + // an llm span, so the model's work isn't an unaccounted gap. + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { + span_attributes: { name: "exec_command", type: "tool" }, + input: '{"cmd":"ls"}', + output: "file.txt", + metadata: { tool_name: "exec_command", call_id: "call_1", turn_id: "t1" }, + ended: true, + children: [], + }, + ], + }, + ], + }, + ); + }); + + test("custom_tool_call (apply_patch) becomes a tool span using the raw name", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + customToolCall({ + turn_id: "t1", + name: "apply_patch", + call_id: "call_2", + input: "*** Begin Patch", + }), + customToolCallOutput({ call_id: "call_2", output: "Success" }), + taskComplete({ turn_id: "t1", last_agent_message: "ok" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { + span_attributes: { name: "apply_patch", type: "tool" }, + input: "*** Begin Patch", + output: "Success", + ended: true, + }, + ], + }, + ], + }, + ); + }); + + test("multiple tool calls in one turn become ordered sibling tool spans", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1" }), + functionCallOutput({ call_id: "c1", output: "a" }), + customToolCall({ turn_id: "t1", name: "apply_patch", call_id: "c2" }), + customToolCallOutput({ call_id: "c2", output: "b" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + // Both tool calls come from one model call, so a single llm span + // opens (on the first tool call) and closes at turn end. + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { span_attributes: { name: "exec_command", type: "tool" }, output: "a", ended: true }, + { span_attributes: { name: "apply_patch", type: "tool" }, output: "b", ended: true }, + ], + }, + ], + }, + ); + }); + + test("an unpaired tool call (no output) is closed when its turn ends", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1", arguments: "{}" }), + // No output for c1; the turn completes anyway. + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { + span_attributes: { name: "exec_command", type: "tool" }, + input: "{}", + ended: true, + children: [], + }, + ], + }, + ], + }, + ); + }); + + test("a tool call with no matching open turn span is skipped", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + // turn_id "tX" was never opened; the tool span must be skipped. + functionCall({ turn_id: "tX", name: "exec_command", call_id: "c1" }), + functionCallOutput({ call_id: "c1", output: "x" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [] }, + ], + }, + ); + }); +}); + +describe("CodexEventProcessor: permissions", () => { + test("an escalated tool call is annotated with permission metadata and a tag", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "c1", + arguments: JSON.stringify({ + cmd: "curl example.com", + sandbox_permissions: "require_escalated", + justification: "Need network access", + prefix_rule: ["curl"], + }), + }), + functionCallOutput({ call_id: "c1", output: "ok" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { + permission: { + sandbox_permissions: "require_escalated", + justification: "Need network access", + prefix_rule: ["curl"], + }, + }, + tags: ["permission-request"], + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // An escalation is a failed sandboxed attempt followed by an escalated retry. + // Both are separate tool spans (distinct call_ids); only the retry carries the + // permission annotation. + test("a sandboxed attempt then escalated retry: only the retry is annotated", async () => { await assertProducesTrace( - [sessionStart({ model: "gpt-5.5", cwd: "/work", source: "startup" })], + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "attempt", + arguments: JSON.stringify({ cmd: "curl example.com" }), + }), + functionCallOutput({ call_id: "attempt", output: "Error: network blocked" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "retry", + arguments: JSON.stringify({ + cmd: "curl example.com", + sandbox_permissions: "require_escalated", + justification: "Need network access", + }), + }), + functionCallOutput({ call_id: "retry", output: "ok" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], { span_attributes: { name: "codex: work", type: "task" }, - input: { model: "gpt-5.5", cwd: "/work", source: "startup" }, - metadata: { session_id: "session-1", model: "gpt-5.5" }, - children: [], + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + // Both attempts come from one model call, so a single llm span opens. + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { + // The failed sandboxed attempt: no permission annotation. + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { call_id: "attempt", permission: undefined }, + output: "Error: network blocked", + ended: true, + }, + { + // The escalated retry: annotated. + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { + call_id: "retry", + permission: { + sandbox_permissions: "require_escalated", + justification: "Need network access", + }, + }, + tags: ["permission-request"], + output: "ok", + ended: true, + }, + ], + }, + ], }, ); }); - test("root span is named after the project directory (basename of cwd)", async () => { - await assertProducesTrace([sessionStart({ cwd: "/whatever/myapp" })], { - span_attributes: { name: "codex: myapp", type: "task" }, - children: [], - }); - }); - - test("root span name handles a trailing slash in cwd", async () => { - await assertProducesTrace([sessionStart({ cwd: "/whatever/myapp/" })], { - span_attributes: { name: "codex: myapp", type: "task" }, - children: [], - }); + test("a non-escalated tool call has no permission metadata or tag", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "c1", + arguments: JSON.stringify({ cmd: "ls" }), + }), + functionCallOutput({ call_id: "c1", output: "file.txt" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { permission: undefined }, + ended: true, + }, + ], + }, + ], + }, + ); }); +}); - test("root span falls back to 'codex session' when cwd is missing", async () => { - await assertProducesTrace([sessionStart({})], { - span_attributes: { name: "codex session", type: "task" }, - children: [], - }); +describe("CodexEventProcessor: compaction", () => { + // A compaction is its own turn containing a `compacted` record. That turn span + // is relabeled "compaction", tagged, annotated, and given an llm child span + // showing what the compaction did (input = prior history; output = the + // replacement history). The trigger comes from the compact hooks, and the turn + // is terminated by PostCompact (a compaction turn gets no Stop of its own). + test("a compaction turn becomes a compaction span with an llm child", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "tc" }), + preCompact({ turn_id: "tc", trigger: "manual" }), + compacted({ + message: "", + replacement_history: [ + { type: "message", role: "user", content: [{ type: "input_text", text: "hi" }] }, + { type: "compaction", encrypted_content: "opaque" }, + ], + window_id: 1, + }), + tokenCount({ total_tokens: 6451 }), + taskComplete({ turn_id: "tc", last_agent_message: null }), + postCompact({ turn_id: "tc", trigger: "manual" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "compaction", type: "task" }, + metadata: { + compaction: { trigger: "manual", replaced_message_count: 2, window_id: 1 }, + }, + tags: ["compaction"], + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + metadata: { compaction: true }, + // Output is shaped as the kept context + a clear summary marker + // (the real summary is encrypted), not the raw replacement list. + output: { + summary: "[summary unavailable — encrypted by Codex]", + kept_messages: [ + { + type: "message", + role: "user", + content: [{ type: "input_text", text: "hi" }], + }, + ], + }, + metrics: { tokens: 6451 }, + ended: true, + }, + ], + }, + ], + }, + ); }); - test("a session with no Stop stays active (no end time)", async () => { + // The trigger must be captured whether the hook arrives before or after the + // transcript's compacted record. Here the hook comes AFTER (e.g. a resumed + // session whose compacted record is read during SessionStart's catch-up); the + // trigger is back-filled onto the already-built span. + test("trigger is back-filled when the compact hook arrives after the record", async () => { await assertProducesTrace( - [sessionStart({ model: "gpt-5.5", source: "startup" }), userPromptSubmit({ prompt: "hi" })], + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "tc" }), + compacted({ replacement_history: [{ type: "compaction" }], window_id: 2 }), + tokenCount({ total_tokens: 10 }), + taskComplete({ turn_id: "tc", last_agent_message: null }), + // Stop processes the compacted record (trigger not yet known)... + stop({ turn_id: "tc" }), + // ...then the PostCompact hook reveals the trigger, back-filled onto the span. + postCompact({ turn_id: "tc", trigger: "auto" }), + ], { - span_attributes: { name: "codex session", type: "task" }, - ended: false, - children: [], + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "compaction", type: "task" }, + metadata: { + compaction: { trigger: "auto", replaced_message_count: 1, window_id: 2 }, + }, + tags: ["compaction"], + }, + ], }, ); }); - test("Stop ends the root span", async () => { + // Regression for the live bug: a compaction turn gets no Stop, and PostCompact + // fires BEFORE Codex writes the turn's task_complete. PostCompact must do the + // same bounded wait Stop does, so the compaction span closes within the hook + // instead of lingering open until idle eviction (where it may never flush). + test("PostCompact waits for a task_complete that lands after it, closing the span", async () => { + class DelayedRevealReader implements TranscriptReader { + private lines: string[] = []; + private withheld: string | null = null; + private readsUntilReveal = 0; + + append(line: string): void { + this.lines.push(line); + } + + withhold(line: string, afterReads: number): void { + this.withheld = line; + this.readsUntilReveal = afterReads; + } + + readFrom(_path: string, offset: number): TranscriptReadResult { + if (this.withheld !== null) { + if (this.readsUntilReveal <= 0) { + this.lines.push(this.withheld); + this.withheld = null; + } else { + this.readsUntilReveal -= 1; + } + } + const buf = this.lines.length > 0 ? `${this.lines.join("\n")}\n` : ""; + const from = offset > buf.length ? 0 : offset; + const slice = buf.slice(from); + const out: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + out.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines: out, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + return { lines, offset: next, sentinelFound: lines.some(predicate) }; + } + } + + const reader = new DelayedRevealReader(); + const write = (record: Record) => + reader.append(JSON.stringify({ timestamp: "2026-01-01T00:00:10Z", ...record })); + write({ type: "session_meta", payload: { id: "s", cwd: "/work" } }); + write({ type: "event_msg", payload: { type: "task_started", turn_id: "tc" } }); + write({ + timestamp: "2026-01-01T00:00:11Z", + type: "compacted", + payload: { replacement_history: [{ type: "compaction" }], window_id: 1 }, + }); + write({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + // task_complete is withheld until after PostCompact's initial read, so only + // its bounded waitFor reveals it. + reader.withhold( + JSON.stringify({ + timestamp: "2026-01-01T00:00:15Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "tc", last_agent_message: null }, + }), + 1, + ); + + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + await processor.process(sessionStart({ session_id: "s" })); + await processor.process(preCompact({ session_id: "s", turn_id: "tc", trigger: "manual" })); + // PostCompact's initial read won't see task_complete; its bounded wait must. + await processor.process(postCompact({ session_id: "s", turn_id: "tc", trigger: "manual" })); + + const tree = spansToTree(await trace.drain()); + const compaction = tree?.children.find((c) => c.name === "compaction"); + expect(compaction).toBeDefined(); + // The span closed within the hook (end time from the late task_complete). + expect(compaction?.metrics?.end).toBe(1767225615); // 2026-01-01T00:00:15Z + } finally { + trace.cleanup(); + } + }); +}); + +describe("CodexEventProcessor: llm spans", () => { + test("a model call becomes an llm span (output + token metrics) under its turn", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "hi" }), - stop({}), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "yo!" }), + reasoning(), + assistantMessage("Hey. What are we working on?"), + tokenCount({ input_tokens: 100, output_tokens: 14, total_tokens: 114 }), + taskComplete({ turn_id: "t1", last_agent_message: "Hey. What are we working on?" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, - children: [], + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "Hey. What are we working on?" }, + // Codex token keys mapped to Braintrust standard metric names. + metrics: { prompt_tokens: 100, completion_tokens: 14, tokens: 114 }, + ended: true, + }, + ], + }, + ], }, ); }); - test("ends on the first Stop; later events do not reopen it", async () => { + // A reasoning item with a readable summary is surfaced as a `reasoning` entry + // in the llm output (Braintrust's OpenAI Responses shape), interleaved before + // the assistant message. + test("a reasoning summary is captured as a reasoning item in the llm output", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "hi", turn_id: "t1" }), - stop({ turn_id: "t1", last_assistant_message: "hello" }), - userPromptSubmit({ prompt: "another thing", turn_id: "t2" }), - stop({ turn_id: "t2", last_assistant_message: "ok" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessageItem("solve it"), + reasoning(["**Inspecting the problem**", "**Deriving the formula**"]), + assistantMessage("The answer is 42."), + tokenCount({ input_tokens: 10, output_tokens: 5, total_tokens: 15 }), + taskComplete({ turn_id: "t1", last_agent_message: "The answer is 42." }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, children: [ - { span_attributes: { name: "turn: t1", type: "task" } }, - { span_attributes: { name: "turn: t2", type: "task" } }, + { + span_attributes: { name: "turn: t1", type: "task" }, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: [ + { + type: "reasoning", + summary: [ + { type: "summary_text", text: "**Inspecting the problem**" }, + { type: "summary_text", text: "**Deriving the formula**" }, + ], + }, + { role: "assistant", content: "The answer is 42." }, + ], + ended: true, + }, + ], + }, ], }, ); }); - test("a turn becomes a child span with prompt input and assistant output", async () => { + // A reasoning item with only encrypted content (no readable summary) opens the + // llm span but adds nothing to the output. + test("an encrypted-only reasoning item adds no reasoning to the output", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "what's your name?", turn_id: "t1" }), - stop({ turn_id: "t1", last_assistant_message: "I'm Codex." }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessageItem("hi"), + reasoning(), // no summary + assistantMessage("Hello."), + tokenCount({ total_tokens: 5 }), + taskComplete({ turn_id: "t1", last_agent_message: "Hello." }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, - ended: true, + span_attributes: { name: "codex: work", type: "task" }, children: [ { span_attributes: { name: "turn: t1", type: "task" }, - input: "what's your name?", - output: "I'm Codex.", - metadata: { turn_id: "t1" }, - ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + // Single assistant message only — no reasoning entry. + output: { role: "assistant", content: "Hello." }, + }, + ], }, ], }, ); }); - test("an open turn with no matching Stop stays active", async () => { + test("llm input is reconstructed from prior conversation messages", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "hi", turn_id: "t1" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + // The prompt arrives as a response_item user message (history source). + userMessageItem("yo!"), + assistantMessage("Hey."), + tokenCount({ input_tokens: 10, output_tokens: 2, total_tokens: 12 }), + taskComplete({ turn_id: "t1", last_agent_message: "Hey." }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, - ended: false, + span_attributes: { name: "codex: work", type: "task" }, + ended: true, children: [ { span_attributes: { name: "turn: t1", type: "task" }, - input: "hi", - ended: false, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + // Input = the user prompt seen before the call opened (chat msgs). + input: [{ role: "user", content: "yo!" }], + output: { role: "assistant", content: "Hey." }, + }, + ], }, ], }, ); }); - test("a Stop with no matching turn does not create a turn span", async () => { + test("multiple model calls interleave with tool spans in execution order", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - stop({ turn_id: "nope", last_assistant_message: "orphan" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "do it" }), + // First model call: asks for a tool. + assistantMessage("calling tool"), + functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1" }), + tokenCount({ input_tokens: 10, output_tokens: 5, total_tokens: 15 }), + functionCallOutput({ call_id: "c1", output: "result" }), + // Second model call: final answer. + assistantMessage("done"), + tokenCount({ input_tokens: 20, output_tokens: 3, total_tokens: 23 }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, - children: [], + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + // Ordered siblings: llm(call 1), tool, llm(call 2). + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + // The call produced an assistant message plus a tool call. + output: [ + { role: "assistant", content: "calling tool" }, + { + role: "assistant", + content: null, + tool_calls: [ + { + id: "c1", + type: "function", + function: { name: "exec_command", arguments: "{}" }, + }, + ], + }, + ], + ended: true, + }, + { span_attributes: { name: "exec_command", type: "tool" }, ended: true }, + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "done" }, + ended: true, + }, + ], + }, + ], }, ); }); - test("duplicate SessionStart still yields a single root span", async () => { + test("a model call with no closing token_count is closed when the turn ends", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - sessionStart({ model: "gpt-5.5", source: "startup" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "hi" }), + assistantMessage("partial"), + // No token_count; turn completes anyway. + taskComplete({ turn_id: "t1", last_agent_message: "partial" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, - children: [], + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "partial" }, + ended: true, + }, + ], + }, + ], }, ); }); +}); - test("a config event builds the per-session factory with its config", () => { - // Spy provider records the config it was asked to build a factory for. +describe("CodexEventProcessor: config / master switch", () => { + test("a config event builds the per-session factory with its config", async () => { let seen: ReportingConfig | undefined; const fake: SpanFactory = { startSpan: () => ({ id: "s", end: () => 0, log: () => {} }) as unknown as ReturnType< SpanFactory["startSpan"] >, + rehydrateSpan: () => + ({ id: "s", end: () => 0, log: () => {} }) as unknown as ReturnType< + SpanFactory["startSpan"] + >, flush: async () => {}, }; const provider = (config?: ReportingConfig) => { @@ -177,7 +1394,8 @@ describe("CodexEventProcessor", () => { return fake; }; - const processor = new CodexEventProcessor("sess-1", createTestLogger(), provider); + const reader = new FakeTranscriptReader(); + const processor = new CodexEventProcessor("sess-1", createTestLogger(), provider, reader); const cfg: EnqueueEvent = { queueId: "sess-1", eventSource: "codex-hook", @@ -190,9 +1408,16 @@ describe("CodexEventProcessor", () => { traceToBraintrust: true, }, }; - processor.process(cfg); - // Factory is built lazily on first span (the SessionStart). - processor.process(sessionStart({ session_id: "sess-1", model: "gpt-5.5" })); + await processor.process(cfg); + // Factory is built lazily on first span (session_meta). + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:00Z", + type: "session_meta", + payload: { id: "sess-1", cwd: "/work" }, + }), + ); + await processor.process(sessionStart({ session_id: "sess-1" })); expect(seen).toEqual({ project: "team-project", @@ -206,9 +1431,14 @@ describe("CodexEventProcessor", () => { test("root span metadata carries the configured project", async () => { await assertProducesTrace( - [configEvent({ project: "team-project" }), sessionStart({ model: "gpt-5.5" })], + [ + configEvent({ project: "team-project" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + stop({}), + ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, metadata: { project: "team-project" }, children: [], }, @@ -216,20 +1446,32 @@ describe("CodexEventProcessor", () => { }); test("when tracing is disabled, no spans are produced", async () => { + const reader = new FakeTranscriptReader(); const trace = withCapturedTrace(); try { - const processor = new CodexEventProcessor("s", createTestLogger(), () => trace.spanFactory); + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); // Config event WITHOUT traceToBraintrust (defaults off). - processor.process({ + await processor.process({ queueId: "s", eventSource: "codex-hook", eventSourceVersion: null, eventName: CODEX_CONFIG_EVENT, eventData: { project: "p" }, }); - processor.process(sessionStart({ session_id: "s" })); - processor.process(userPromptSubmit({ session_id: "s", turn_id: "t1", prompt: "hi" })); - processor.process(stop({ session_id: "s", turn_id: "t1", last_assistant_message: "yo" })); + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:00Z", + type: "session_meta", + payload: { id: "s", cwd: "/work" }, + }), + ); + await processor.process(sessionStart({ session_id: "s" })); + await processor.process(stop({ session_id: "s", turn_id: "t1" })); await processor.flush(); const spans = await trace.drain(); @@ -246,14 +1488,586 @@ describe("CodexEventProcessor", () => { project: "team-project", additionalMetadata: { team: "platform", model: "SHOULD_BE_OVERRIDDEN" }, }), - sessionStart({ model: "gpt-5.5" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + stop({}), ], { - span_attributes: { name: "codex session", type: "task" }, - // team comes from additionalMetadata; model is the standard key (wins). + span_attributes: { name: "codex: work", type: "task" }, metadata: { team: "platform", model: "gpt-5.5", project: "team-project" }, children: [], }, ); }); }); + +describe("CodexEventProcessor: transcript catch-up plumbing", () => { + test("each hook reads from the advancing offset (each line read once)", async () => { + const reader = new FakeTranscriptReader(); + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "hi" }), + taskComplete({ turn_id: "t1", last_agent_message: "yo" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [{ span_attributes: { name: "turn: t1", type: "task" }, ended: true }], + }, + { reader }, + ); + + // Reads: SessionStart (readFrom), Stop (waitFor). The injected config event + // has no transcript_path, so it does not read. + expect(reader.readOffsets.length).toBeGreaterThanOrEqual(2); + for (let i = 1; i < reader.readOffsets.length; i++) { + expect(reader.readOffsets[i]).toBeGreaterThanOrEqual(reader.readOffsets[i - 1] as number); + } + expect(reader.readOffsets[0]).toBe(0); + }); + + test("a malformed transcript line is skipped without breaking handling", async () => { + const reader = new FakeTranscriptReader(); + reader.append("this is not valid json"); + await assertProducesTrace( + [sessionStart(), sessionMeta({ cwd: "/work" }), stop({ turn_id: "t1" })], + { span_attributes: { name: "codex: work", type: "task" }, ended: true, children: [] }, + { reader }, + ); + }); + + test("a turn whose task_complete lands after its Stop is closed by the bounded wait", async () => { + // Regression for the live bug: Codex writes task_complete seconds after the + // Stop hook fires. The Stop's initial read doesn't see it (turn stays open), + // so the bounded waitFor must keep reading until it appears, then close the + // turn — with the transcript-derived end time, not a wall-clock value. + class DelayedRevealReader implements TranscriptReader { + private lines: string[] = []; + private withheld: string | null = null; + private readsUntilReveal = 0; + + append(line: string): void { + this.lines.push(line); + } + + /** Append a line that only becomes visible after `afterReads` more reads. */ + withhold(line: string, afterReads: number): void { + this.withheld = line; + this.readsUntilReveal = afterReads; + } + + private visible(): string[] { + return this.lines; + } + + readFrom(_path: string, offset: number): TranscriptReadResult { + // Reveal the withheld line once enough reads have happened. + if (this.withheld !== null) { + if (this.readsUntilReveal <= 0) { + this.lines.push(this.withheld); + this.withheld = null; + } else { + this.readsUntilReveal -= 1; + } + } + const visible = this.visible(); + const buf = visible.length > 0 ? `${visible.join("\n")}\n` : ""; + const from = offset > buf.length ? 0 : offset; + const slice = buf.slice(from); + const out: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + out.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines: out, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + return { lines, offset: next, sentinelFound: lines.some(predicate) }; + } + } + + const reader = new DelayedRevealReader(); + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: "s", cwd: "/work" }, + }), + ); + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "t1" }, + }), + ); + // task_complete is withheld: not visible at the Stop's waitFor, appears later. + reader.withhold( + JSON.stringify({ + timestamp: "2026-01-01T00:00:15Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + 1, + ); + + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + await processor.process(sessionStart({ session_id: "s" })); + // Stop's bounded wait won't see task_complete yet; turn stays open. + await processor.process(stop({ session_id: "s", turn_id: "t1" })); + // flush polls until the withheld task_complete is revealed and closes t1. + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + const turn = tree?.children[0]; + expect(turn?.name).toBe("turn: t1"); + expect(turn?.metrics?.end).toBe(1767225615); // 2026-01-01T00:00:15Z + } finally { + trace.cleanup(); + } + }); +}); + +describe("CodexEventProcessor: subagents", () => { + const MAIN = "/test/main.jsonl"; + const SUB = "/test/sub.jsonl"; + + // A main session that spawns one subagent. The subagent writes to its own + // transcript file; the processor must nest its full hierarchy (subagent root + // -> turn -> llm/tool) under the spawn_agent tool span that launched it, and + // never let the two files' offsets cross-contaminate. + test("a subagent renders as a nested trace under its spawn_agent tool", async () => { + const reader = new MultiFileTranscriptReader(); + const w = (path: string, record: Record) => + reader.append(path, JSON.stringify(transcript(record).record)); + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + + // Main session: root + a turn that calls spawn_agent. + w(MAIN, { type: "session_meta", payload: { id: "s", cwd: "/work/app" } }); + w(MAIN, { type: "event_msg", payload: { type: "task_started", turn_id: "main-1" } }); + w(MAIN, { type: "turn_context", payload: { model: "gpt-5.5" } }); + w(MAIN, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(MAIN, { + type: "response_item", + payload: { + type: "function_call", + name: "spawn_agent", + call_id: "call_spawn", + arguments: "{}", + metadata: { turn_id: "main-1" }, + }, + }); + w(MAIN, { + type: "response_item", + payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" }, + }); + w(MAIN, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN })); + await processor.process( + preToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + }), + ); + // PostToolUse reveals the agent_id; maps it to the spawn tool span. + await processor.process( + postToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + tool_response: JSON.stringify({ agent_id: "agent-1" }), + }), + ); + + // Subagent: its own transcript + lifecycle hooks (carry agent_id). + w(SUB, { type: "session_meta", payload: { id: "agent-1" } }); + w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-1" } }); + w(SUB, { type: "turn_context", payload: { model: "gpt-5.5" } }); + w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(SUB, { + type: "response_item", + payload: { + type: "function_call", + name: "exec_command", + call_id: "call_exec", + arguments: "{}", + metadata: { turn_id: "sub-1" }, + }, + }); + w(SUB, { + type: "response_item", + payload: { type: "function_call_output", call_id: "call_exec", output: "ok" }, + }); + w(SUB, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + w(SUB, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" }, + }); + await processor.process( + subagentStart({ + session_id: "s", + transcript_path: SUB, + agent_id: "agent-1", + agent_type: "default", + }), + ); + await processor.process( + subagentStop({ + session_id: "s", + transcript_path: MAIN, + agent_transcript_path: SUB, + agent_id: "agent-1", + }), + ); + + // Main session completes. + w(MAIN, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "all done" }, + }); + await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" })); + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + expect(tree?.name).toBe("codex: app"); + // The root carries the main session's transcript path. + expect(tree?.metadata?.transcript_path).toBe(MAIN); + const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); + expect(mainTurn).toBeDefined(); + + // The spawn_agent tool span is under the main turn, and the subagent root + // is a SIBLING of it (both under the main turn) — not nested in the tool. + const spawn = mainTurn?.children.find((c) => c.name === "spawn_agent"); + expect(spawn?.type).toBe("tool"); + expect(spawn?.children.some((c) => c.name === "subagent: agent-1")).toBe(false); + const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1"); + expect(subRoot).toBeDefined(); + expect(subRoot?.type).toBe("task"); + // The subagent root carries its own transcript path. + expect(subRoot?.metadata?.transcript_path).toBe(SUB); + + const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); + expect(subTurn).toBeDefined(); + const exec = subTurn?.children.find((c) => c.name === "exec_command"); + expect(exec?.type).toBe("tool"); + + // No subagent turn leaked to the top level (the original bug). + expect(tree?.children.some((c) => c.name === "turn: sub-1")).toBe(false); + } finally { + trace.cleanup(); + } + }); + + // Regression for the live bug: after SubagentStop finalizes a subagent, a later + // flush (which drains every scope) must NOT re-read the subagent transcript and + // re-open its already-closed spans — doing so left the subagent root/turn/llm + // hanging (no end) because the re-opened, un-ended spans overwrote the closed + // ones. A finalized subagent scope must be inert. + test("a finalized subagent's spans stay closed across later flushes", async () => { + const reader = new MultiFileTranscriptReader(); + const w = (path: string, record: Record) => + reader.append(path, JSON.stringify(transcript(record).record)); + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + + // Main session spawns a subagent and stays open (no main Stop yet). + w(MAIN, { type: "session_meta", payload: { id: "s", cwd: "/work/app" } }); + w(MAIN, { type: "event_msg", payload: { type: "task_started", turn_id: "main-1" } }); + w(MAIN, { + type: "response_item", + payload: { + type: "function_call", + name: "spawn_agent", + call_id: "call_spawn", + arguments: "{}", + metadata: { turn_id: "main-1" }, + }, + }); + w(MAIN, { + type: "response_item", + payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" }, + }); + await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN })); + await processor.process( + postToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + tool_response: JSON.stringify({ agent_id: "agent-1" }), + }), + ); + + // Subagent transcript: a turn with a model call, then task_complete, then a + // TRAILING record after task_complete. SubagentStop's bounded wait stops at + // the task_complete sentinel, leaving the trailing record unconsumed; a + // later flush would otherwise re-read it and re-open spans on the finalized + // scope (the bug). With the fix, the finalized scope is inert. + w(SUB, { type: "session_meta", payload: { id: "agent-1" } }); + w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-1" } }); + w(SUB, { type: "turn_context", payload: { model: "gpt-5.5" } }); + w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(SUB, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + w(SUB, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" }, + }); + await processor.process( + subagentStart({ + session_id: "s", + transcript_path: SUB, + agent_id: "agent-1", + agent_type: "default", + }), + ); + await processor.process( + subagentStop({ + session_id: "s", + transcript_path: MAIN, + agent_transcript_path: SUB, + agent_id: "agent-1", + }), + ); + + // After finalize, MORE records land in the subagent transcript (Codex can + // write trailing records, and the rollout file persists). A later flush + // drains every scope and re-reads this file. If the finalized scope weren't + // inert, these would re-open a turn + llm on it, leaving spans hanging. + w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-2" } }); + w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(SUB, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + await processor.flush(); + + // Main session finishes. + w(MAIN, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "done" }, + }); + await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" })); + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + // The subagent root is a sibling of the spawn_agent tool (under main-1). + const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); + const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1"); + const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); + + // The subagent root and turn are closed (not hanging), and the post-finalize + // records did NOT re-open anything: no second turn, no re-opened llm. + expect(subRoot?.metrics?.end).toBeDefined(); + expect(subTurn?.metrics?.end).toBeDefined(); + expect(subRoot?.children.some((c) => c.name === "turn: sub-2")).toBe(false); + expect(subRoot?.children.filter((c) => c.name?.startsWith("turn:")).length).toBe(1); + } finally { + trace.cleanup(); + } + }); + + // LLM-span start timing is derived per-turn, per-scope, so a subagent's calls + // must never influence the main session's call timing (or vice versa) even + // though both run concurrently. Each scope tracks its own turn start / + // last-child-end, keyed by its own transcript file. + test("llm span start timing is isolated per scope across concurrent subagents", async () => { + const reader = new MultiFileTranscriptReader(); + const w = (path: string, record: Record) => + reader.append(path, JSON.stringify(record)); + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + + // Main turn starts at :10 and spawns a subagent; its own model call's output + // lands much later (:30), but the call began at the turn start (:10). + w(MAIN, { + timestamp: "2026-01-01T00:00:09Z", + type: "session_meta", + payload: { id: "s", cwd: "/work/app" }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:10Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "main-1" }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:10Z", + type: "response_item", + payload: { + type: "function_call", + name: "spawn_agent", + call_id: "call_spawn", + arguments: "{}", + metadata: { turn_id: "main-1" }, + }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:11Z", + type: "response_item", + payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" }, + }); + await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN })); + await processor.process( + postToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + tool_response: JSON.stringify({ agent_id: "agent-1" }), + }), + ); + + // Subagent turn starts at :20 (its own clock); its model output lands at + // :25 and closes at :25. Its llm span must start at the SUBAGENT turn start + // (:20), not be affected by the main scope's :10/:11 boundaries. + w(SUB, { + timestamp: "2026-01-01T00:00:20Z", + type: "session_meta", + payload: { id: "agent-1" }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:20Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "sub-1" }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:25Z", + type: "response_item", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "sub" }], + }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:25Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:26Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" }, + }); + await processor.process( + subagentStart({ + session_id: "s", + transcript_path: SUB, + agent_id: "agent-1", + agent_type: "default", + }), + ); + await processor.process( + subagentStop({ + session_id: "s", + transcript_path: MAIN, + agent_transcript_path: SUB, + agent_id: "agent-1", + }), + ); + + // Main session's own model call: output lands at :30, closes at :30. Its + // llm span must start at the MAIN turn start (:10) — the most recent child + // of main-1 is the spawn_agent tool (ended :11), so it starts there. + w(MAIN, { + timestamp: "2026-01-01T00:00:30Z", + type: "response_item", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "main" }], + }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:30Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:31Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "all done" }, + }); + await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" })); + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); + // Main llm: opens on the spawn_agent tool call (the model's first output, + // before any message), so it starts at the main turn's start (:10 = + // 1767225610) and runs until its token_count (:30). The single llm call + // spans the spawn tool and the later message. + const mainLlm = mainTurn?.children.find((c) => c.name === "llm"); + expect(mainLlm?.metrics?.start).toBe(1767225610); + expect(mainLlm?.metrics?.end).toBe(1767225630); + + // Subagent llm: starts at the SUBAGENT turn start (:20 = 1767225620), wholly + // independent of the main scope's boundaries; ends at its token_count (:25). + const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1"); + const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); + const subLlm = subTurn?.children.find((c) => c.name === "llm"); + expect(subLlm?.metrics?.start).toBe(1767225620); + expect(subLlm?.metrics?.end).toBe(1767225625); + } finally { + trace.cleanup(); + } + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index dd95c50..55fbfe2 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -1,10 +1,28 @@ -// Processor for Codex events. A leading config event (same queueId) configures -// how this session reports to Braintrust (project/credentials), so the processor -// builds its own per-session SpanFactory. On SessionStart it opens a Braintrust -// root span. Each turn is a child span: UserPromptSubmit opens it (input = the -// user prompt), and the matching Stop closes it (output = the final assistant -// message). The first Stop also ends the root span. Buffered spans are delivered -// via flush(). +// Processor for a single Codex session. Spans are built from the session +// transcript file (the "rollout"), which is the source of truth for everything +// inside a session: the root, each turn, and each tool call, in true execution +// order. The Codex lifecycle hooks drive when we read the transcript and supply +// a few fields the transcript lacks. +// +// Two ordered inputs: +// - Hooks: `process(event)` is called once per hook. A leading config event +// (same queueId) configures reporting. Each hook triggers a transcript +// "catch-up". A SessionStart hook contributes `source`/`permission_mode` to +// the root span (the transcript does not record these). A Stop hook is the +// turn-terminal signal: before catching up we wait (bounded) for the turn's +// `task_complete` to be written, so the final spans aren't truncated. +// - Transcript: each new record is turned into spans by `processTranscriptEvent`: +// session_meta -> open the root span (start time = its timestamp) +// task_started -> open a turn span (child of root) +// user_message -> set the open turn's input (the prompt) +// function_call / custom_tool_call / tool_search_call +// -> open a tool span (child of its turn), keyed by call_id +// *_output -> close the matching tool span (output) +// task_complete -> set the turn's output and close it +// +// Span start times come from transcript timestamps, so sibling spans (LLM and +// tool calls, later) render in execution order. Buffered spans are delivered via +// flush(). Everything is wrapped so tracing can never break a Codex turn. import { defaultSpanFactoryProvider, @@ -12,15 +30,98 @@ import { type Span, type SpanFactory, type SpanFactoryProvider, + type SpanRef, + spanRef, } from "../../braintrust/logger.ts"; import type { Logger } from "../../log.ts"; import type { EventProcessor } from "../../processor/event-processor.ts"; import type { EnqueueEvent } from "../../server/routes.ts"; +import { PLUGIN_VERSION } from "../../version.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; +import type { SnapshotStore } from "./snapshot-store.ts"; +import { + type CodexSnapshot, + type ConversationItemSnapshot, + isCompatibleSnapshot, + redactReportingConfig, + type ScopeSnapshot, + SNAPSHOT_SCHEMA_VERSION, +} from "./state-snapshot.ts"; +import { + EVT_TASK_COMPLETE, + EVT_TASK_STARTED, + EVT_TOKEN_COUNT, + EVT_USER_MESSAGE, + ITEM_CUSTOM_TOOL_CALL, + ITEM_CUSTOM_TOOL_CALL_OUTPUT, + ITEM_FUNCTION_CALL, + ITEM_FUNCTION_CALL_OUTPUT, + ITEM_MESSAGE, + ITEM_REASONING, + ITEM_TOOL_SEARCH_CALL, + ITEM_TOOL_SEARCH_OUTPUT, + parseTranscriptLine, + RECORD_COMPACTED, + RECORD_EVENT_MSG, + RECORD_RESPONSE_ITEM, + RECORD_SESSION_META, + RECORD_TURN_CONTEXT, + type TranscriptRecord, +} from "./transcript.ts"; +import { defaultTranscriptReader, type TranscriptReader } from "./transcript-reader.ts"; const SESSION_START = "SessionStart"; -const USER_PROMPT_SUBMIT = "UserPromptSubmit"; const STOP = "Stop"; +const SUBAGENT_START = "SubagentStart"; +const SUBAGENT_STOP = "SubagentStop"; +const POST_TOOL_USE = "PostToolUse"; +const PRE_COMPACT = "PreCompact"; +const POST_COMPACT = "PostCompact"; +const SPAWN_AGENT_TOOL = "spawn_agent"; +// Tag applied to a tool span whose call requested escalated permissions, so +// permission-gated actions are easy to find/filter in Braintrust. +const PERMISSION_TAG = "permission-request"; +// Tag applied to a turn span that performed a context compaction, so compactions +// are easy to find/filter in Braintrust. +const COMPACTION_TAG = "compaction"; + +// Bound on how long a Stop hook waits for the turn's task_complete to appear in +// the transcript before giving up and emitting whatever exists. Codex can write +// task_complete several seconds after firing the Stop hook, so this is generous; +// the wait returns as soon as the sentinel is seen, and only the last turn of a +// session ever relies on it (earlier turns are closed by the next hook's +// catch-up). Still bounded so a stuck writer can never stall the Codex turn. +const SENTINEL_TIMEOUT_MS = 10_000; +const SENTINEL_INTERVAL_MS = 25; + +// Bound on how long flush() polls the transcript for still-open turns to close +// (their task_complete can land seconds after the terminal Stop). flush() is off +// the Codex hot path, so this is generous; it returns immediately once no turns +// remain open, and is still bounded so a stuck writer can't hang shutdown. +const FLUSH_TIMEOUT_MS = 15_000; +const FLUSH_INTERVAL_MS = 50; + +const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Whether a hook signals the end of a turn (so we should wait, bounded, for the + * turn's task_complete to be written). Stop ends a normal turn; SubagentStop a + * subagent turn; PostCompact a compaction turn — which, unlike the others, never + * gets a Stop, so this is what lets its span close promptly instead of lingering + * open until idle eviction. + */ +function isTurnTerminal(eventName: string): boolean { + return eventName === STOP || eventName === SUBAGENT_STOP || eventName === POST_COMPACT; +} + +/** Tool-call response_item subtypes that OPEN a tool span. */ +const TOOL_CALL_TYPES = new Set([ITEM_FUNCTION_CALL, ITEM_CUSTOM_TOOL_CALL, ITEM_TOOL_SEARCH_CALL]); +/** Tool-call response_item subtypes that CLOSE a tool span (by call_id). */ +const TOOL_OUTPUT_TYPES = new Set([ + ITEM_FUNCTION_CALL_OUTPUT, + ITEM_CUSTOM_TOOL_CALL_OUTPUT, + ITEM_TOOL_SEARCH_OUTPUT, +]); /** * The final path segment of `cwd`, used to name the root span (e.g. @@ -34,6 +135,285 @@ function projectDirName(cwd: string | undefined): string | undefined { return segments.length > 0 ? segments[segments.length - 1] : undefined; } +/** Parse an ISO timestamp to Unix seconds (Braintrust's start time unit). */ +function isoToUnixSeconds(ts: unknown): number | undefined { + if (typeof ts !== "string") return undefined; + const ms = Date.parse(ts); + return Number.isNaN(ms) ? undefined : ms / 1000; +} + +/** Extra root-span fields the transcript doesn't record; supplied by the hook. */ +interface RootEnrichment { + source?: string; + permissionMode?: string; +} + +/** + * An OpenAI-style chat message. Used to reconstruct each LLM call's input (the + * transcript does not log the literal request) and to shape its output, so + * Braintrust renders these spans as proper LLM calls. Tool calls become an + * assistant message with `tool_calls`; tool results become a `tool` message. + */ +interface ChatMessage { + role: "developer" | "user" | "assistant" | "tool"; + content: string | null; + tool_calls?: Array<{ + id: string; + type: "function"; + function: { name: string; arguments: string }; + }>; + tool_call_id?: string; +} + +/** + * A reasoning ("thinking") item, in the OpenAI Responses shape Braintrust + * renders. `summary` is the readable reasoning text the model exposes (Codex + * records it on `reasoning` transcript items as `summary_text` entries; the full + * chain-of-thought itself is encrypted and not available). Interleaved with chat + * messages in an LLM call's output and in the conversation history. + */ +interface ReasoningItem { + type: "reasoning"; + summary: Array<{ type: "summary_text"; text: string }>; +} + +/** An item in an LLM call's output or the conversation history. */ +type ConversationItem = ChatMessage | ReasoningItem; + +/** State for the LLM span currently in progress (one model call). */ +interface OpenLlm { + span: Span; + turnId: string | undefined; + /** Items produced by this call: reasoning, assistant text, and/or tool calls. */ + output: ConversationItem[]; + /** + * Timestamp (Unix seconds) of this call's last model-output item (message / + * reasoning / tool call). This is when the model finished generating and is + * used as the span's end time — NOT the closing token_count's timestamp. Codex + * writes the token_count after the tool result (at the same instant as the + * tool output), so ending the span there would wrongly stretch it to swallow + * the tool's execution. The token_count only supplies token metrics. + */ + lastOutputTime: number | undefined; + /** + * When true, the span's output was already set at creation (e.g. a compaction + * call, whose output is the replacement history), so closing it must not + * overwrite it with the accumulated `output` messages. + */ + outputPreset?: boolean; +} + +/** + * An open turn span plus the timing we need to place its child LLM spans. The + * transcript records a model call's output only when it lands (just before the + * call's token_count), with no record at request-send time — so an LLM span + * stamped from its first output item would look near-instant. Instead we start + * each LLM span at the end of the most recently-ended child of this turn + * (`lastChildEndTime`), or at the turn's own start (`startTime`) when it's the + * first child. That makes a call's span cover request→response, and consecutive + * children tile the turn. All per-turn (hence per-scope), so concurrent + * subagents never affect each other's timing. + */ +interface OpenTurn { + span: Span; + /** Turn start (Unix seconds), used as the first child's start time. */ + startTime: number | undefined; + /** Max end time (Unix seconds) among this turn's already-ended children. */ + lastChildEndTime: number | undefined; +} + +/** + * All parsing state tied to a single transcript file. The main session has one + * scope (its parent is the root span); each subagent gets its own scope (its + * parent is the subagent root span, a child of the spawn_agent tool that created + * it). Keeping offset + open spans per file is essential: a session and its + * subagents are interleaved on one hook stream but write to separate transcript + * files, so a single shared offset would read the wrong file's bytes. + */ +interface TranscriptScope { + /** Absolute transcript path this scope reads from. */ + readonly path: string; + /** "main" session vs. a spawned subagent. */ + readonly kind: "main" | "subagent"; + /** + * Parent span for this scope's turn spans. For the main scope this is the + * root span; for a subagent it is the subagent root span. Resolved lazily so a + * subagent scope can be registered before its parent span is built. + */ + readonly parentSpan: () => Span | null; + /** Byte offset already consumed from `path`. */ + offset: number; + /** Turn spans currently open in this scope, keyed by turn_id (with the timing + * used to place their child LLM spans). */ + readonly openTurns: Map; + /** turn_ids whose Stop has fired but whose task_complete hasn't been seen. */ + readonly turnsAwaitingCompletion: Set; + /** OpenAI-style chat history (messages + reasoning), used to reconstruct each + * LLM call's input. */ + readonly conversationHistory: ConversationItem[]; + /** The currently-open LLM span in this scope, if a model call is in progress. */ + openLlm: OpenLlm | null; + /** Tool spans currently open in this scope, keyed by call_id. */ + readonly openTools: Map; + /** Model slug, learned from this scope's first turn_context. */ + model: string | undefined; + /** End time (Unix seconds) of the most recently completed turn in this scope. */ + lastTurnEndTime: number | undefined; + /** + * For a subagent scope: its root span (the span the subagent's turns hang + * under) and whether it has been ended. Null/false for the main scope, which + * uses the processor's root span instead. The root span is created lazily from + * the subagent's own session_meta record (so it gets a real start time); + * `pendingSubagent` holds what's needed to create it until then. + */ + subagentRootSpan: Span | null; + subagentEnded: boolean; + pendingSubagent?: { agentId: string; agentType: string | undefined; parent: Span }; +} + +/** Map Codex token usage keys onto Braintrust's standard metric names. */ +function tokenMetrics(usage: Record): Record { + const metrics: Record = {}; + const num = (v: unknown): number | undefined => + typeof v === "number" && Number.isFinite(v) ? v : undefined; + const map: Array<[string, string]> = [ + ["input_tokens", "prompt_tokens"], + ["output_tokens", "completion_tokens"], + ["total_tokens", "tokens"], + ["cached_input_tokens", "prompt_cached_tokens"], + ["reasoning_output_tokens", "completion_reasoning_tokens"], + ]; + for (const [from, to] of map) { + const v = num(usage[from]); + if (v !== undefined) metrics[to] = v; + } + return metrics; +} + +/** + * Shape an LLM call's output for Braintrust. Braintrust renders a single + * assistant message or an array of messages as a chat completion; a single + * message is returned directly, multiple are returned as an array. + */ +function llmOutput(messages: ConversationItem[]): unknown { + if (messages.length === 1) return messages[0]; + return messages; +} + +/** + * Extract the readable reasoning summary from a `reasoning` item's `summary` + * field: a list of `{type: "summary_text", text}` entries. Returns the + * non-empty entries (matching Braintrust's reasoning shape), or undefined when + * there's nothing readable (the full reasoning is encrypted and not exposed). + */ +function reasoningSummary( + summary: unknown, +): Array<{ type: "summary_text"; text: string }> | undefined { + if (!Array.isArray(summary)) return undefined; + const parts: Array<{ type: "summary_text"; text: string }> = []; + for (const entry of summary) { + if (entry && typeof entry === "object") { + const text = (entry as { text?: unknown }).text; + if (typeof text === "string" && text.length > 0) { + parts.push({ type: "summary_text", text }); + } + } + } + return parts.length > 0 ? parts : undefined; +} + +/** Extract the concatenated text from a response_item message's content array. */ +function messageText(content: unknown): string | undefined { + if (!Array.isArray(content)) return undefined; + const parts: string[] = []; + for (const part of content) { + if (part && typeof part === "object" && typeof (part as { text?: unknown }).text === "string") { + parts.push((part as { text: string }).text); + } + } + return parts.length > 0 ? parts.join("") : undefined; +} + +/** + * Shape a compaction's `replacement_history` into a readable "after compaction" + * output. The replacement history is the new context Codex keeps going forward: + * a few recent messages kept verbatim, plus a `compaction` entry holding the + * summary as `encrypted_content` (opaque to us). We surface the readable kept + * messages and replace the encrypted summary with a clear marker, so the span + * doesn't look like the model merely echoed the last user message. + */ +function compactionOutput(replacement: unknown[] | undefined): Record { + if (replacement === undefined) { + return { summary: "[unavailable]", kept_messages: [] }; + } + const kept: unknown[] = []; + let summaryEncrypted = false; + for (const entry of replacement) { + if (entry !== null && typeof entry === "object") { + const e = entry as Record; + if (e.type === "compaction") { + if (typeof e.encrypted_content === "string") summaryEncrypted = true; + continue; + } + } + kept.push(entry); + } + return { + summary: summaryEncrypted ? "[summary unavailable — encrypted by Codex]" : "[no summary]", + kept_messages: kept, + }; +} + +/** A tool call's permission/escalation request, parsed from its arguments. */ +interface PermissionInfo { + /** The requested sandbox permission, e.g. "require_escalated". */ + sandbox_permissions: string; + /** The model's justification shown to the user (may be absent). */ + justification?: string; + /** The command prefix the escalation applies to (may be absent). */ + prefix_rule?: unknown; +} + +/** + * Parse a tool call's permission/escalation request from its arguments. Codex + * records an escalated retry's request inline in the function_call arguments + * (`sandbox_permissions`, `justification`, `prefix_rule`); when present, we + * surface it on the tool span. `args` is the raw arguments (a JSON string for + * function_call, or an object for custom_tool_call). Returns undefined when + * there's no escalation request. + */ +function permissionInfo(args: unknown): PermissionInfo | undefined { + let obj: Record | undefined; + if (typeof args === "string") { + try { + const parsed = JSON.parse(args) as unknown; + if (parsed !== null && typeof parsed === "object") obj = parsed as Record; + } catch { + return undefined; + } + } else if (args !== null && typeof args === "object") { + obj = args as Record; + } + if (obj === undefined) return undefined; + const sandbox = obj.sandbox_permissions; + if (typeof sandbox !== "string" || sandbox.length === 0) return undefined; + const info: PermissionInfo = { sandbox_permissions: sandbox }; + if (typeof obj.justification === "string") info.justification = obj.justification; + if (obj.prefix_rule !== undefined) info.prefix_rule = obj.prefix_rule; + return info; +} + +// ConversationItems (chat messages / reasoning) are already plain JSON, so they +// round-trip through a snapshot unchanged. These two helpers just bridge the +// nominal type boundary between the live union and the snapshot's record array +// (via unknown) without copying. +function toItemSnapshots(items: ConversationItem[]): ConversationItemSnapshot[] { + return items as unknown as ConversationItemSnapshot[]; +} +function fromItemSnapshots(items: ConversationItemSnapshot[]): ConversationItem[] { + return items as unknown as ConversationItem[]; +} + export class CodexEventProcessor implements EventProcessor { private readonly logger: Logger; private readonly queueId: string | null; @@ -43,18 +423,75 @@ export class CodexEventProcessor implements EventProcessor { private reportingConfig: ReportingConfig | undefined; private rootSpan: Span | null = null; private rootEnded = false; - // Turn spans currently open, keyed by turn_id. A turn is opened by - // UserPromptSubmit and closed (ended) by the matching Stop. - private readonly openTurns = new Map(); + // The time (Unix seconds) the root span was ended, if it has been. Persisted so + // that on resume — where rehydrating the root necessarily re-emits its row — we + // can re-assert the end time and keep the root closed, rather than leaving it + // looking open (a rehydrated handle is created with no end). + private rootEndTime: number | undefined; + // Root enrichment from the SessionStart hook (source/permission_mode), which + // the transcript lacks. Applied when the root span is created, or patched onto + // the root if it already exists. + private rootEnrichment: RootEnrichment = {}; + // Reads new transcript content on each hook (the "catch-up"). + private readonly transcriptReader: TranscriptReader; + // One parsing scope per transcript file (the main session plus one per + // subagent), keyed by transcript path. Each owns its own offset and open + // spans, so interleaved hooks for different files never cross-contaminate. + private readonly scopes = new Map(); + // The main session's scope (its turns hang under the root span). Created on + // the first hook carrying the main transcript path. + private mainScope: TranscriptScope | null = null; + // call_id -> the turn span that contains a spawn_agent tool call, recorded when + // that tool span opens in the main transcript. The spawn_agent PostToolUse hook + // carries the resulting agent_id and the call_id (tool_use_id), letting us map + // agent_id to that turn span (below). We keep the TURN (not the tool span) so a + // subagent's root can be parented as a SIBLING of the spawn_agent tool — both + // under the same turn — rather than nested inside the tool. + private readonly spawnTurnSpansByCallId = new Map(); + // agent_id -> the turn span under which that subagent should be parented (the + // turn that ran the spawn_agent tool). Resolved from the spawn_agent PostToolUse + // (agent_id + call_id) via spawnTurnSpansByCallId. + private readonly spawnTurnSpansByAgentId = new Map(); + // turn_id -> compaction trigger ("manual"/"auto"), recorded from the + // PreCompact/PostCompact hooks (the transcript's compacted record lacks the + // trigger). A side map so the hook and the transcript's compacted record can + // arrive in any order: whichever comes first stores its half here / below, and + // the second one reconciles. (On a resumed session the transcript's compacted + // record is read during SessionStart's catch-up, before the hook arrives.) + private readonly compactionTriggerByTurn = new Map(); + // turn_id -> the compaction span built from the transcript, kept so a later + // PreCompact/PostCompact hook can back-fill the trigger onto it. + private readonly compactionSpansByTurn = new Map(); + // span_id -> the name/type the span was created (or renamed) with. The SDK + // doesn't expose these as getters, but a rehydrated span must re-assert them + // (else the SDK infers a wrong name from the call stack on resume), so we + // remember them here and feed them into each SpanRef at serialize time. + private readonly spanAttrs = new Map< + string, + { name: string; type: SpanRef["type"]; startTime: number | undefined } + >(); + // Optional per-session state store. When present, the processor persists a + // snapshot of its resumable state on flush and, on its first event, rehydrates + // any snapshot left by a previous server run for this session (so a session + // that outlived the server — idle shutdown or explicit close+resume — keeps + // building the same trace instead of starting over). Null disables resume. + private readonly snapshotStore: SnapshotStore | null; + // Whether we've already attempted a one-time restore from the store. Restore + // runs lazily on the first non-config event, once the span factory exists. + private restoreAttempted = false; constructor( queueId: string | null, logger: Logger, spanFactoryProvider: SpanFactoryProvider = defaultSpanFactoryProvider, + transcriptReader: TranscriptReader = defaultTranscriptReader, + snapshotStore: SnapshotStore | null = null, ) { this.queueId = queueId; this.logger = logger; this.spanFactoryProvider = spanFactoryProvider; + this.transcriptReader = transcriptReader; + this.snapshotStore = snapshotStore; } /** The session's SpanFactory, built from the config event on first use. */ @@ -65,11 +502,32 @@ export class CodexEventProcessor implements EventProcessor { return this.spanFactoryInstance; } - process(event: EnqueueEvent): void { + /** Whether this session reports to Braintrust at all (master switch). */ + private get tracingEnabled(): boolean { + return this.reportingConfig?.traceToBraintrust === true; + } + + // Dispatcher. The config event short-circuits (it configures reporting and + // must run before any spans). Every other hook (a) records any hook-only data + // it carries (root enrichment, subagent lifecycle), (b) catches up the + // relevant transcript into spans, and on a main-session Stop (c) ends the root + // span. Wrapped so transcript work can never break the turn. + async process(event: EnqueueEvent): Promise { if (event.eventName === CODEX_CONFIG_EVENT) { this.configure(event); return; } + + // First real event for this session: if a previous server run left a + // snapshot (idle shutdown mid-session, or an explicit close+resume), rebuild + // our state from it before processing. Runs once, regardless of which hook + // arrives first — the very next hook after a restart may not be SessionStart, + // and may not be preceded by a config event. Restore brings back the + // session's reporting config, so it MUST run before the master-switch check + // below (otherwise a resumed session with no leading config event would look + // untraced and we'd drop its events). + this.ensureRestored(); + // Master switch: when tracing is disabled, drop everything (no SDK calls). if (!this.tracingEnabled) { this.logger.debug("codex processor: tracing disabled; dropping event", { @@ -78,30 +536,434 @@ export class CodexEventProcessor implements EventProcessor { }); return; } + + const data = (event.eventData ?? {}) as Record; + const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; + if (event.eventName === SESSION_START) { - this.startRootSpan(event); + this.recordRootEnrichment(event); + } + + // Compaction trigger is hook-only (not in the transcript). Record it keyed by + // turn_id, and if the compaction span was already built from the transcript + // (e.g. on a resumed session its compacted record is read during + // SessionStart's catch-up, before this hook arrives), back-fill the trigger + // onto it now. Order-independent: handleCompacted reads the map; this patches + // the span — whichever happens second reconciles. + if (event.eventName === PRE_COMPACT || event.eventName === POST_COMPACT) { + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + const trigger = typeof data.trigger === "string" ? data.trigger : undefined; + if (turnId !== undefined && trigger !== undefined) { + this.compactionTriggerByTurn.set(turnId, trigger); + this.backfillCompactionTrigger(turnId, trigger); + } + } + + // A subagent is starting: open its root span (under the spawn_agent tool that + // created it) and register a scope for its transcript so its hooks build a + // full turn/llm/tool hierarchy inside that span. + if (event.eventName === SUBAGENT_START) { + this.handleSubagentStart(event); + } + + // Resolve the scope this hook belongs to (main vs. a specific subagent) by + // its transcript path, so each file keeps its own offset and open spans. + const scope = this.scopeForEvent(event); + + // A turn-terminal signal: the turn is finishing, so mark it (flush, and the + // catch-up below, wait for this turn's task_complete, which Codex may write + // after the hook). Stop ends a normal turn; SubagentStop ends a subagent + // turn; PostCompact ends a compaction turn (which gets no Stop of its own). + if (isTurnTerminal(event.eventName) && scope !== null) { + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + if (turnId !== undefined) scope.turnsAwaitingCompletion.add(turnId); + } + + if (scope !== null) { + try { + await this.catchUpScope(scope, event); + } catch (err) { + this.logger.error("codex processor: transcript catch-up failed", { + queueId: this.queueId, + eventName: event.eventName, + error: String(err), + }); + } + } + + // After catching up the main transcript (which creates the spawn_agent tool + // span), map the resulting subagent's agent_id to that span so a following + // SubagentStart can nest the subagent under it. The PostToolUse's tool_use_id + // is the spawn_agent function_call's call_id; its tool_response carries the + // agent_id. + if (event.eventName === POST_TOOL_USE && agentId === undefined) { + this.recordSpawnedAgent(event); + } + + // A subagent has finished: close its turns and root span. + if (event.eventName === SUBAGENT_STOP && agentId !== undefined) { + this.handleSubagentStop(event, agentId); + } + + // End the (main) root span on the first main-session Stop. A SubagentStop + // must NOT end the root — the parent session is still running. + if (event.eventName === STOP && agentId === undefined) { + this.endRootSpan(this.mainScope?.lastTurnEndTime); + } + } + + // Read transcript lines appended to this scope's file since its last read and + // turn each into spans. Never throws. + // + // On a turn-terminal hook (Stop/SubagentStop/PostCompact), first consume + // whatever's already on disk. If that closes the turn, we're done. Otherwise + // Codex hasn't written this turn's task_complete yet (it can lag the hook by + // seconds), so wait (bounded) for it — but only then, so a hook whose turn is + // already complete returns immediately. PostCompact matters specifically + // because a compaction turn gets no Stop, so without this its span would stay + // open until idle eviction (and may never be flushed). + private async catchUpScope(scope: TranscriptScope, event: EnqueueEvent): Promise { + const data = (event.eventData ?? {}) as Record; + + // Initial read of everything available now. + const initial = this.transcriptReader.readFrom(scope.path, scope.offset); + scope.offset = initial.offset; + this.consumeLines(scope, initial.lines); + + if (!isTurnTerminal(event.eventName)) return; + + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + // If the turn already closed during the initial read, no wait is needed. + if (turnId === undefined || !scope.openTurns.has(turnId)) return; + + // Wait (bounded) for this turn's task_complete to be written. + const result = await this.transcriptReader.waitFor( + scope.path, + scope.offset, + (line) => this.isTaskCompleteFor(line, turnId), + { timeoutMs: SENTINEL_TIMEOUT_MS, intervalMs: SENTINEL_INTERVAL_MS }, + ); + scope.offset = result.offset; + this.consumeLines(scope, result.lines); + if (!result.sentinelFound) { + this.logger.warn("codex processor: task_complete sentinel not found; partial turn", { + queueId: this.queueId, + eventName: event.eventName, + turnId, + }); + } + } + + // ========================================================================== + // Transcript scopes + // ========================================================================== + + // Resolve (creating if needed) the scope a hook belongs to, by its transcript + // path. A subagent hook (carrying agent_id) uses the subagent's transcript; + // its scope is created by SubagentStart and looked up here. Everything else is + // the main session. Returns null if the hook carries no usable transcript path + // or a subagent scope hasn't been registered yet. + private scopeForEvent(event: EnqueueEvent): TranscriptScope | null { + const data = (event.eventData ?? {}) as Record; + const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; + // SubagentStop reports the subagent file under agent_transcript_path (its + // transcript_path points at the PARENT session), so prefer that. + const path = + agentId !== undefined && typeof data.agent_transcript_path === "string" + ? data.agent_transcript_path + : typeof data.transcript_path === "string" + ? data.transcript_path + : undefined; + if (path === undefined) return null; + + if (agentId === undefined) return this.ensureMainScope(path); + + // Subagent hook: its scope must already exist (registered by SubagentStart). + const existing = this.scopes.get(path); + if (existing !== undefined) return existing; + this.logger.debug("codex processor: subagent hook before its scope exists; skipping", { + queueId: this.queueId, + agentId, + }); + return null; + } + + // The main session's scope, created on first use. Its turns hang under the + // processor's root span. + private ensureMainScope(path: string): TranscriptScope { + if (this.mainScope !== null) return this.mainScope; + const scope = this.newScope(path, "main", () => this.rootSpan); + this.mainScope = scope; + this.scopes.set(path, scope); + return scope; + } + + private newScope( + path: string, + kind: "main" | "subagent", + parentSpan: () => Span | null, + ): TranscriptScope { + return { + path, + kind, + parentSpan, + offset: 0, + openTurns: new Map(), + turnsAwaitingCompletion: new Set(), + conversationHistory: [], + openLlm: null, + openTools: new Map(), + model: undefined, + lastTurnEndTime: undefined, + subagentRootSpan: null, + subagentEnded: false, + }; + } + + // ========================================================================== + // Subagent lifecycle + // ========================================================================== + + // Record the agent_id produced by a spawn_agent tool, mapping it to that + // tool's span (located by call_id), so a later SubagentStart can parent the + // subagent's root span under the spawning tool. No-op for non-spawn tools or + // when the tool span isn't known. + private recordSpawnedAgent(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const toolName = typeof data.tool_name === "string" ? data.tool_name : undefined; + if (toolName !== SPAWN_AGENT_TOOL) return; + const callId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined; + if (callId === undefined) return; + const response = data.tool_response; + let agentId: string | undefined; + if (typeof response === "string") { + try { + const parsed = JSON.parse(response) as Record; + if (typeof parsed.agent_id === "string") agentId = parsed.agent_id; + } catch { + // tool_response wasn't JSON; nothing to map. + } + } else if (response !== null && typeof response === "object") { + const ar = (response as Record).agent_id; + if (typeof ar === "string") agentId = ar; + } + if (agentId === undefined) return; + const turnSpan = this.spawnTurnSpansByCallId.get(callId); + if (turnSpan === undefined) { + this.logger.debug("codex processor: spawn_agent PostToolUse with no known turn span", { + queueId: this.queueId, + callId, + agentId, + }); return; } - if (event.eventName === USER_PROMPT_SUBMIT) { - this.startTurnSpan(event); + this.spawnTurnSpansByAgentId.set(agentId, turnSpan); + } + + // A subagent is starting. Register a scope reading the subagent's own + // transcript so its turns/llm/tools build a full hierarchy. The subagent root + // span is created lazily from the subagent's session_meta (so it gets a real + // start time); we stash the parent here — the turn span that ran the + // spawn_agent tool (matched by agent_id from the spawn_agent PostToolUse), so + // the subagent root is a SIBLING of that spawn_agent tool span. Falls back to + // the main root span if that mapping isn't known. + private handleSubagentStart(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; + const path = typeof data.transcript_path === "string" ? data.transcript_path : undefined; + if (agentId === undefined || path === undefined) { + this.logger.warn("codex processor: SubagentStart missing agent_id/transcript_path", { + queueId: this.queueId, + }); return; } - if (event.eventName === STOP) { - this.endTurnSpan(event); - this.endRootSpan(); + if (this.scopes.has(path)) return; // already registered + + const parent = this.spawnTurnSpansByAgentId.get(agentId) ?? this.rootSpan; + if (parent === null) { + this.logger.warn("codex processor: SubagentStart with no parent span; skipping", { + queueId: this.queueId, + agentId, + }); return; } - // Other events will attach child spans to the root span in later phases. - this.logger.debug("codex processor: no-op", { + const agentType = typeof data.agent_type === "string" ? data.agent_type : undefined; + const scope = this.newScope(path, "subagent", () => scope.subagentRootSpan); + scope.pendingSubagent = { agentId, agentType, parent }; + this.scopes.set(path, scope); + this.logger.info("codex processor: registered subagent scope", { queueId: this.queueId, - eventName: event.eventName, + agentId, }); } + // Create a subagent's root span from its session_meta record (sibling of the + // spawn_agent tool span — both under the spawning turn — with the record's + // timestamp as start time). Called once per subagent scope, when its + // session_meta is first consumed. + private startSubagentRootSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const pending = scope.pendingSubagent; + if (pending === undefined || scope.subagentRootSpan !== null) return; + const startTime = isoToUnixSeconds(record.timestamp); + try { + const subagentName = `subagent: ${pending.agentId}`; + scope.subagentRootSpan = this.trackSpan( + pending.parent.startSpan({ + name: subagentName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { + metadata: { + agent_id: pending.agentId, + agent_type: pending.agentType, + // The subagent's own rollout transcript on disk. + transcript_path: scope.path, + }, + }, + }), + subagentName, + "task", + startTime, + ); + this.logger.info("codex processor: opened subagent root span", { + queueId: this.queueId, + agentId: pending.agentId, + spanId: scope.subagentRootSpan.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open subagent root span", { + queueId: this.queueId, + agentId: pending.agentId, + error: String(err), + }); + } + } + + // A subagent has finished (its task_complete has been consumed by the catch-up + // in process()). Close any spans still open in its scope, then end its root + // span at the subagent's last turn end time. + private handleSubagentStop(event: EnqueueEvent, agentId: string): void { + const data = (event.eventData ?? {}) as Record; + const path = + typeof data.agent_transcript_path === "string" ? data.agent_transcript_path : undefined; + const scope = path !== undefined ? this.scopes.get(path) : undefined; + if (scope === undefined || scope.subagentRootSpan === null) { + this.logger.debug("codex processor: SubagentStop with no scope/root span", { + queueId: this.queueId, + agentId, + }); + return; + } + if (scope.subagentEnded) return; + scope.subagentEnded = true; + const endTime = scope.lastTurnEndTime; + const endArgs = endTime !== undefined ? { endTime } : undefined; + this.endOpenLlmSpan(scope, endTime); + this.closeAllOpenTools(scope, endArgs); + for (const [, turn] of scope.openTurns) { + try { + turn.span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling subagent turn span", { + queueId: this.queueId, + agentId, + error: String(err), + }); + } + } + scope.openTurns.clear(); + try { + scope.subagentRootSpan.end(endArgs); + this.logger.info("codex processor: ended subagent root span", { + queueId: this.queueId, + agentId, + }); + } catch (err) { + this.logger.error("codex processor: failed to end subagent root span", { + queueId: this.queueId, + agentId, + error: String(err), + }); + } + } + + // Parse and process a batch of transcript lines into spans for the given + // scope, advancing nothing (the caller owns the offset). Used by catch-up and + // by the final flush read. + private consumeLines(scope: TranscriptScope, lines: string[]): void { + // A finalized subagent scope is inert: never re-open/mutate its spans (a + // re-read after SubagentStop would otherwise re-open already-closed spans). + if (scope.subagentEnded) return; + for (const line of lines) { + const record = parseTranscriptLine(line); + if (record === null) continue; + this.processTranscriptEvent(scope, record); + } + } + + // Cheap check used as the Stop sentinel predicate: is this line a + // task_complete for `turnId` (or any task_complete when turnId is unknown)? + private isTaskCompleteFor(line: string, turnId: string | undefined): boolean { + const record = parseTranscriptLine(line); + if (record === null) return false; + if (record.type !== RECORD_EVENT_MSG || record.payload?.type !== EVT_TASK_COMPLETE) { + return false; + } + if (turnId === undefined) return true; + return record.payload?.turn_id === turnId; + } + + // Turn one parsed transcript record into span operations for its scope. + // Unknown record types are ignored. Never throws; per-record failures logged. + private processTranscriptEvent(scope: TranscriptScope, record: TranscriptRecord): void { + try { + if (record.type === RECORD_SESSION_META) { + // The main session's session_meta opens the processor root span; a + // subagent's opens its (lazily created) subagent root span. + if (scope.kind === "main") this.startRootSpan(record); + else this.startSubagentRootSpan(scope, record); + return; + } + if (record.type === RECORD_TURN_CONTEXT) { + this.noteModel(scope, record); + return; + } + if (record.type === RECORD_COMPACTED) { + this.handleCompacted(scope, record); + return; + } + if (record.type === RECORD_EVENT_MSG) { + const ptype = record.payload?.type; + if (ptype === EVT_TASK_STARTED) this.startTurnSpan(scope, record); + else if (ptype === EVT_USER_MESSAGE) this.setTurnInput(scope, record); + else if (ptype === EVT_TASK_COMPLETE) this.endTurnSpan(scope, record); + else if (ptype === EVT_TOKEN_COUNT) this.closeLlmSpan(scope, record); + return; + } + if (record.type === RECORD_RESPONSE_ITEM) { + const ptype = record.payload?.type ?? ""; + if (ptype === ITEM_MESSAGE) this.handleMessageItem(scope, record); + else if (ptype === ITEM_REASONING) this.handleReasoningItem(scope, record); + else if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(scope, record); + else if (TOOL_OUTPUT_TYPES.has(ptype)) this.endToolSpan(scope, record); + return; + } + } catch (err) { + this.logger.error("codex processor: failed to process transcript record", { + queueId: this.queueId, + type: record.type, + payloadType: record.payload?.type, + error: String(err), + }); + } + } + + // ========================================================================== + // Hook-driven configuration / enrichment + // ========================================================================== + // Record the session's reporting config from the config event. The - // SpanFactory is built lazily (on first span) from this config. Arriving - // before the root span is created, this lets the session report to its own - // project/account. + // SpanFactory is built lazily (on first span) from this config. private configure(event: EnqueueEvent): void { if (this.spanFactoryInstance !== null) { this.logger.warn("codex processor: config event after factory built; ignoring", { @@ -132,71 +994,263 @@ export class CodexEventProcessor implements EventProcessor { }); } - /** Whether this session reports to Braintrust at all (master switch). */ - private get tracingEnabled(): boolean { - return this.reportingConfig?.traceToBraintrust === true; + // Capture source/permission_mode from a SessionStart hook (the transcript + // lacks these). If the root span already exists, patch them onto it now. + private recordRootEnrichment(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const source = typeof data.source === "string" ? data.source : undefined; + const permissionMode = + typeof data.permission_mode === "string" ? data.permission_mode : undefined; + if (source !== undefined) this.rootEnrichment.source = source; + if (permissionMode !== undefined) this.rootEnrichment.permissionMode = permissionMode; + + if (this.rootSpan !== null) { + try { + this.rootSpan.log({ + metadata: { + source: this.rootEnrichment.source, + permission_mode: this.rootEnrichment.permissionMode, + }, + }); + } catch (err) { + this.logger.error("codex processor: failed to enrich root span", { + queueId: this.queueId, + error: String(err), + }); + } + } } - async flush(): Promise { - if (this.rootSpan === null) return; + // ========================================================================== + // Transcript-driven span building + // ========================================================================== + + private startRootSpan(record: TranscriptRecord): void { + if (this.rootSpan !== null) { + this.logger.warn("codex processor: duplicate session_meta; keeping existing root span", { + queueId: this.queueId, + }); + return; + } + const payload = (record.payload ?? {}) as Record; + const cwd = typeof payload.cwd === "string" ? payload.cwd : undefined; + const sessionId = typeof payload.id === "string" ? payload.id : this.queueId; + const startTime = isoToUnixSeconds(record.timestamp); + + const projectDir = projectDirName(cwd); + const spanName = projectDir ? `codex: ${projectDir}` : "codex session"; + try { - await this.rootSpan.flush(); - this.logger.debug("codex processor: flush ok", { queueId: this.queueId }); + this.rootSpan = this.trackSpan( + this.spanFactory.startSpan({ + name: spanName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input: { model: this.mainScope?.model, cwd, source: this.rootEnrichment.source }, + metadata: { + // User-provided extras first, so the standard keys below win. + ...this.reportingConfig?.additionalMetadata, + session_id: sessionId, + model: this.mainScope?.model, + cwd, + source: this.rootEnrichment.source, + permission_mode: this.rootEnrichment.permissionMode, + cli_version: + typeof payload.cli_version === "string" ? payload.cli_version : undefined, + project: this.reportingConfig?.project, + // The session's rollout transcript on disk (the source for this trace). + transcript_path: this.mainScope?.path, + }, + }, + }), + spanName, + "task", + startTime, + ); + this.logger.info("codex processor: opened root span", { + queueId: this.queueId, + spanId: this.rootSpan.id, + }); } catch (err) { - this.logger.error("codex processor: flush failed", { + this.logger.error("codex processor: failed to open root span", { queueId: this.queueId, error: String(err), }); } } - // Open a child span for a turn on UserPromptSubmit. Keyed by turn_id so the - // matching Stop can close it. The user prompt is the span's input. - private startTurnSpan(event: EnqueueEvent): void { - if (this.rootSpan === null) { - this.logger.warn("codex processor: turn without a root span; ignoring", { + // Learn the scope's model from turn_context (session_meta has none). Backfill + // it onto the scope's parent span input/metadata the first time we see it. + private noteModel(scope: TranscriptScope, record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const model = typeof payload.model === "string" ? payload.model : undefined; + if (model === undefined || scope.model !== undefined) return; + scope.model = model; + const parent = scope.parentSpan(); + if (parent !== null) { + try { + parent.log({ input: { model }, metadata: { model } }); + } catch (err) { + this.logger.error("codex processor: failed to backfill model", { + queueId: this.queueId, + error: String(err), + }); + } + } + } + + // Open a turn span on task_started, child of the scope's parent, by turn_id. + private startTurnSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const parent = scope.parentSpan(); + if (parent === null) { + this.logger.warn("codex processor: task_started without a parent span; ignoring", { queueId: this.queueId, }); return; } - - const data = (event.eventData ?? {}) as Record; - const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - const prompt = typeof data.prompt === "string" ? data.prompt : undefined; - const model = typeof data.model === "string" ? data.model : undefined; - + const payload = (record.payload ?? {}) as Record; + const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined; if (turnId === undefined) { - this.logger.warn("codex processor: UserPromptSubmit without turn_id; skipping turn span", { + this.logger.warn("codex processor: task_started without turn_id; skipping turn span", { queueId: this.queueId, }); return; } - - if (this.openTurns.has(turnId)) { + if (scope.openTurns.has(turnId)) { this.logger.warn("codex processor: duplicate turn_id; keeping existing turn span", { queueId: this.queueId, turnId, }); return; } + const startTime = isoToUnixSeconds(record.timestamp); + const turnName = `turn: ${turnId}`; + try { + const turnSpan = this.trackSpan( + parent.startSpan({ + name: turnName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { metadata: { turn_id: turnId, model: scope.model } }, + }), + turnName, + "task", + startTime, + ); + scope.openTurns.set(turnId, { span: turnSpan, startTime, lastChildEndTime: undefined }); + this.logger.info("codex processor: opened turn span", { + queueId: this.queueId, + turnId, + spanId: turnSpan.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open turn span", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + } + + // Handle a `compacted` record: the current turn is a context compaction (Codex + // runs it as its own turn). Relabel that turn span as a "compaction" span with + // metadata (trigger from the PreCompact/PostCompact hook, the count of messages + // the compaction collapsed to, and the window_id) and a tag, then open an llm + // span representing the compaction model call. The following token_count closes + // that llm span with its token usage, so the trace shows what the compaction + // did: the conversation before it (input) and the kept context after (output). + private handleCompacted(scope: TranscriptScope, record: TranscriptRecord): void { + const turnId = this.latestOpenTurnId(scope); + const turn = this.latestOpenTurn(scope); + if (turnId === undefined || turn === undefined) { + this.logger.debug("codex processor: compacted record with no open turn span", { + queueId: this.queueId, + }); + return; + } + const turnSpan = turn.span; + const payload = (record.payload ?? {}) as Record; + const replacement = Array.isArray(payload.replacement_history) + ? (payload.replacement_history as unknown[]) + : undefined; + const trigger = this.compactionTriggerByTurn.get(turnId); + // Relabel the turn span as a compaction span and annotate it. Keep the span + // so a PreCompact/PostCompact hook arriving later can back-fill the trigger. + this.compactionSpansByTurn.set(turnId, turnSpan); try { - const turnSpan = this.rootSpan.startSpan({ - name: `turn: ${turnId}`, + turnSpan.setAttributes({ name: "compaction", type: "task" }); + // Remember the rename so a resumed snapshot re-asserts "compaction", not + // the original "turn: ..." name. Preserve the span's original start time. + this.spanAttrs.set(turnSpan.spanId, { + name: "compaction", type: "task", - event: { - input: prompt, - metadata: { turn_id: turnId, model }, + startTime: this.spanAttrs.get(turnSpan.spanId)?.startTime, + }); + turnSpan.log({ + metadata: { + compaction: { + trigger, + replaced_message_count: replacement?.length, + window_id: payload.window_id, + }, }, + tags: [COMPACTION_TAG], }); - this.openTurns.set(turnId, turnSpan); - this.logger.info("codex processor: opened turn span", { + } catch (err) { + this.logger.error("codex processor: failed to annotate compaction span", { queueId: this.queueId, turnId, - spanId: turnSpan.id, + error: String(err), }); + } + + // Open an llm span for the compaction model call. To make it self-evident + // (and avoid the "model just echoed the user" look), shape input/output as + // before/after of the context window: + // input = the conversation history before compaction (+ its count) + // output = the kept context after compaction (readable messages) plus a + // clear marker for the summary, which Codex stores encrypted and + // does not expose to us. + // Closed by the turn's token_count (metrics) like any other model call. + // Start the span at the turn's start (compaction is the sole child), not the + // record timestamp (which is when the compaction result lands), so its + // duration reflects the compaction call rather than looking instant. + const startTime = this.llmStartTimeFor(turn) ?? isoToUnixSeconds(record.timestamp); + const before = scope.conversationHistory.map((item) => ({ ...item })); + const output = compactionOutput(replacement); + const compactionLlmName = scope.model ?? "compaction"; + try { + const span = this.trackSpan( + turnSpan.startSpan({ + name: compactionLlmName, + type: "llm", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input: { messages_before_compaction: before.length, history: before }, + output, + metadata: { model: scope.model, turn_id: turnId, compaction: true }, + }, + }), + compactionLlmName, + "llm", + startTime, + ); + // Reuse the open-llm slot so the turn's token_count closes it with usage. + // outputPreset: the replacement history is already logged as the output, so + // the closing token_count must not overwrite it. lastOutputTime is the + // compacted record's time (when the compaction output landed); the closing + // token_count provides metrics only. + scope.openLlm = { + span, + turnId, + output: [], + outputPreset: true, + lastOutputTime: isoToUnixSeconds(record.timestamp), + }; } catch (err) { - this.logger.error("codex processor: failed to open turn span", { + this.logger.error("codex processor: failed to open compaction llm span", { queueId: this.queueId, turnId, error: String(err), @@ -204,33 +1258,83 @@ export class CodexEventProcessor implements EventProcessor { } } - // Close the turn span matching the Stop's turn_id, recording the final - // assistant message as the span's output. - private endTurnSpan(event: EnqueueEvent): void { - const data = (event.eventData ?? {}) as Record; - const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - const output = - typeof data.last_assistant_message === "string" ? data.last_assistant_message : undefined; + // Patch the trigger onto a compaction span when its PreCompact/PostCompact hook + // arrives after the span was already built from the transcript. No-op if the + // span isn't known yet (handleCompacted will read the trigger from the map). + private backfillCompactionTrigger(turnId: string, trigger: string): void { + const span = this.compactionSpansByTurn.get(turnId); + if (span === undefined) return; + try { + span.log({ metadata: { compaction: { trigger } } }); + } catch (err) { + this.logger.error("codex processor: failed to back-fill compaction trigger", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + } - if (turnId === undefined) { - this.logger.debug("codex processor: Stop without turn_id; no turn span to close", { + // Set the open turn's input from a user_message event (the prompt). The + // user_message carries no turn_id, so it applies to the most recently opened + // still-open turn in this scope. + private setTurnInput(scope: TranscriptScope, record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const prompt = typeof payload.message === "string" ? payload.message : undefined; + if (prompt === undefined) return; + // Note: we do NOT add the prompt to conversationHistory here — the same + // prompt also arrives as a response_item user message (handleMessageItem), + // which is the canonical source for history. This event only sets the turn's + // input for display. + const turn = this.latestOpenTurn(scope); + if (turn === undefined) { + this.logger.debug("codex processor: user_message with no open turn span", { queueId: this.queueId, }); return; } + try { + turn.span.log({ input: prompt }); + } catch (err) { + this.logger.error("codex processor: failed to set turn input", { + queueId: this.queueId, + error: String(err), + }); + } + } - const turnSpan = this.openTurns.get(turnId); - if (turnSpan === undefined) { - this.logger.debug("codex processor: Stop with no open turn span", { + // Close the turn span on task_complete, recording the final assistant message + // as output. Also closes any tool spans still open under that turn. + private endTurnSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined; + const output = + typeof payload.last_agent_message === "string" ? payload.last_agent_message : undefined; + if (turnId === undefined) { + this.logger.debug("codex processor: task_complete without turn_id", { + queueId: this.queueId, + }); + return; + } + const turn = scope.openTurns.get(turnId); + if (turn === undefined) { + this.logger.debug("codex processor: task_complete with no open turn span", { queueId: this.queueId, turnId, }); return; } - + const turnSpan = turn.span; + const endTime = isoToUnixSeconds(record.timestamp); + // Backstop: close any spans still open without their own closing record + // (a trailing model call with no token_count, or a tool with no output) at + // the turn's end time. + this.endOpenLlmSpan(scope, endTime); + this.endOpenToolSpansForTurn(scope, turnId, endTime); try { turnSpan.log({ output }); - turnSpan.end(); + turnSpan.end(endTime !== undefined ? { endTime } : undefined); + if (endTime !== undefined) scope.lastTurnEndTime = endTime; this.logger.info("codex processor: ended turn span", { queueId: this.queueId, turnId }); } catch (err) { this.logger.error("codex processor: failed to end turn span", { @@ -239,17 +1343,475 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } finally { - this.openTurns.delete(turnId); + scope.openTurns.delete(turnId); + scope.turnsAwaitingCompletion.delete(turnId); + } + } + + // ========================================================================== + // LLM spans (transcript-driven) + // + // One LLM call = a run of reasoning/message (+ tool) items terminated by a + // token_count. The span opens lazily on the first model-output item (assistant + // message or reasoning) and closes at the next token_count, which carries the + // call's token usage. Tool calls the model emits are sibling tool spans under + // the turn (handled elsewhere); the assistant text/reasoning is the LLM span's + // output, and its input is the conversation reconstructed up to the open. + // ========================================================================== + + // Ensure an LLM span is open for the scope's current model call, opening one + // (child of the active turn) on first use with input = the conversation so far. + private ensureLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void { + if (scope.openLlm !== null) return; + const turn = this.latestOpenTurn(scope); + if (turn === undefined) { + this.logger.debug("codex processor: model output with no open turn; skipping llm span", { + queueId: this.queueId, + }); + return; + } + const turnId = this.latestOpenTurnId(scope); + // Start the span where the model's work for this call actually began — the + // end of the turn's most recently-ended child, or the turn's start when this + // is the first child — NOT the record timestamp (which is when the model's + // output landed, just before its token_count, yielding a near-instant span). + const startTime = this.llmStartTimeFor(turn) ?? isoToUnixSeconds(record.timestamp); + // Snapshot the conversation as this call's input (the transcript doesn't log + // the real request, so this is a best-effort reconstruction). + const input = scope.conversationHistory.map((item) => ({ ...item })); + const llmName = scope.model ?? "llm"; + try { + const span = this.trackSpan( + turn.span.startSpan({ + name: llmName, + type: "llm", + ...(startTime !== undefined ? { startTime } : {}), + event: { input, metadata: { model: scope.model, turn_id: turnId } }, + }), + llmName, + "llm", + startTime, + ); + scope.openLlm = { span, turnId, output: [], lastOutputTime: startTime }; + this.logger.info("codex processor: opened llm span", { + queueId: this.queueId, + turnId, + spanId: span.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open llm span", { + queueId: this.queueId, + error: String(err), + }); + } + } + + // Record that the open LLM call produced an output item at `record`'s time, + // advancing its lastOutputTime. The LLM span ends at this time (when the model + // last generated), not at the closing token_count (which Codex writes after + // the tool result). No-op if no LLM span is open. + private noteLlmOutputTime(scope: TranscriptScope, record: TranscriptRecord): void { + if (scope.openLlm === null) return; + const t = isoToUnixSeconds(record.timestamp); + if (t === undefined) return; + if (scope.openLlm.lastOutputTime === undefined || t > scope.openLlm.lastOutputTime) { + scope.openLlm.lastOutputTime = t; + } + } + + // Handle a response_item message: an assistant message is this call's output + // (opens/feeds the LLM span); user/developer messages are input context. All + // are appended to conversation history as chat messages for reconstructing + // later calls' inputs. + private handleMessageItem(scope: TranscriptScope, record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const rawRole = typeof payload.role === "string" ? payload.role : "user"; + const role: ChatMessage["role"] = + rawRole === "assistant" || rawRole === "developer" || rawRole === "tool" ? rawRole : "user"; + const text = messageText(payload.content); + if (text === undefined) return; + + const msg: ChatMessage = { role, content: text }; + if (role === "assistant") { + this.ensureLlmSpan(scope, record); + scope.openLlm?.output.push(msg); + this.noteLlmOutputTime(scope, record); + } + scope.conversationHistory.push(msg); + } + + // Handle a reasoning item: it belongs to the current model call. Open the LLM + // span and, when the model exposed a readable reasoning summary, surface it as + // a `reasoning` item in the call's output and in the conversation history (so + // subsequent calls' input shows the prior thinking too). The full reasoning is + // encrypted, so items with no readable summary only open the span. + private handleReasoningItem(scope: TranscriptScope, record: TranscriptRecord): void { + this.ensureLlmSpan(scope, record); + // The reasoning item is a model output at this time, even when its summary is + // encrypted (empty) and adds nothing readable — so advance the span's end. + this.noteLlmOutputTime(scope, record); + const payload = (record.payload ?? {}) as Record; + const summary = reasoningSummary(payload.summary); + if (summary === undefined) return; + const item: ReasoningItem = { type: "reasoning", summary }; + scope.openLlm?.output.push(item); + scope.conversationHistory.push(item); + } + + // Close the current LLM call on a token_count (the segment boundary): end the + // LLM span with token-usage metrics and its accumulated output. (Tool spans + // are opened/closed by their own records, so nothing to flush here.) + private closeLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void { + if (scope.openLlm === null) return; + const payload = (record.payload ?? {}) as Record; + const info = (payload.info ?? {}) as Record; + const usage = (info.last_token_usage ?? {}) as Record; + const metrics = tokenMetrics(usage); + const { span, turnId, output, outputPreset, lastOutputTime } = scope.openLlm; + // End the span when the model last generated (its last output item), NOT at + // this token_count: Codex writes the token_count after the tool result (at + // the same instant as the tool output), so using it would stretch the span to + // swallow the tool's execution — making the llm span overlap the tool it + // called, which is causally impossible. The token_count only supplies metrics. + // Fall back to the token_count time if we somehow have no output time. + const endTime = lastOutputTime ?? isoToUnixSeconds(record.timestamp); + try { + span.log(outputPreset ? { metrics } : { output: llmOutput(output), metrics }); + span.end(endTime !== undefined ? { endTime } : undefined); + // This LLM call is a child of its turn; advance the turn's boundary so the + // next child (LLM or tool) starts where this one ended (its last output). + this.noteChildEnded(scope, turnId, endTime); + this.logger.info("codex processor: ended llm span", { queueId: this.queueId }); + } catch (err) { + this.logger.error("codex processor: failed to end llm span", { + queueId: this.queueId, + error: String(err), + }); + } finally { + scope.openLlm = null; } } - // End the root span on the first Stop event. Subsequent Stops are ignored - // (the SDK keeps the first end time anyway). - private endRootSpan(): void { + // End any open LLM span without a closing token_count (e.g. on turn/root end). + // Prefer the model's last-output time (when generation actually ended) over the + // caller's fallback endTime (the turn/root end), so a trailing call isn't + // stretched to the turn boundary. + private endOpenLlmSpan(scope: TranscriptScope, endTime: number | undefined): void { + if (scope.openLlm === null) return; + const { span, turnId, output, lastOutputTime } = scope.openLlm; + const effectiveEnd = lastOutputTime ?? endTime; + try { + if (output.length > 0) span.log({ output: llmOutput(output) }); + span.end(effectiveEnd !== undefined ? { endTime: effectiveEnd } : undefined); + this.noteChildEnded(scope, turnId, effectiveEnd); + } catch (err) { + this.logger.error("codex processor: failed to end dangling llm span", { + queueId: this.queueId, + error: String(err), + }); + } finally { + scope.openLlm = null; + } + } + + // Open a tool span (child of its turn) for a tool-call response_item, keyed by + // call_id so the matching *_output can close it. Each tool span is opened and + // closed by its own records (no cross-segment buffering), which keeps it robust + // against the transcript being read in pieces. Tool spans are ordered by their + // transcript start time, so they render between the LLM calls around them. + private startToolSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; + // Most tool calls carry `name`; some (e.g. tool_search_call) don't, so fall + // back to the record subtype with the "_call" suffix stripped. + const ptype = typeof payload.type === "string" ? payload.type : undefined; + const name = + typeof payload.name === "string" + ? payload.name + : ptype !== undefined + ? ptype.replace(/_call$/, "") + : undefined; + const meta = (payload.metadata ?? {}) as Record; + const turnId = typeof meta.turn_id === "string" ? meta.turn_id : this.latestOpenTurnId(scope); + // function_call uses `arguments` (a JSON string); custom_tool_call uses `input`. + const input = payload.arguments ?? payload.input; + + // Record the tool call as an assistant message carrying a tool_call in the + // conversation history (input context for later calls). Done regardless of + // whether we end up creating a span, so reconstructed history stays faithful. + const argsString = typeof input === "string" ? input : JSON.stringify(input ?? {}); + const toolCallMsg: ChatMessage = { + role: "assistant", + content: null, + tool_calls: [ + { + id: callId ?? "", + type: "function", + function: { name: name ?? "tool", arguments: argsString }, + }, + ], + }; + scope.conversationHistory.push(toolCallMsg); + + if (callId === undefined) { + this.logger.warn("codex processor: tool call without call_id; skipping tool span", { + queueId: this.queueId, + turnId, + }); + return; + } + if (turnId === undefined) { + this.logger.warn("codex processor: tool call without turn_id; skipping tool span", { + queueId: this.queueId, + callId, + }); + return; + } + const turn = scope.openTurns.get(turnId); + if (turn === undefined) { + this.logger.warn("codex processor: tool call with no open turn span; skipping", { + queueId: this.queueId, + turnId, + callId, + }); + return; + } + const turnSpan = turn.span; + + // A tool call IS a model output, so it belongs to the current LLM call. Open + // the LLM span if one isn't already open (a call whose only output is a tool + // call — no preceding assistant message or readable reasoning — would + // otherwise leave the model's work, often many seconds, with no llm span, + // appearing as an unaccounted gap at the front of the turn). Record the tool + // call as this LLM call's output too. Done only once we know the tool call is + // valid (real open turn), so a tool call for an unknown turn opens nothing. + this.ensureLlmSpan(scope, record); + scope.openLlm?.output.push(toolCallMsg); + // The model emitted this tool call now — that's when its generation ended. + this.noteLlmOutputTime(scope, record); + if (scope.openTools.has(callId)) { + this.logger.warn("codex processor: duplicate call_id; keeping existing tool span", { + queueId: this.queueId, + callId, + }); + return; + } + // If this call requested escalated permissions (Codex records the request + // inline in the arguments), surface it on the span as metadata and a tag. + const permission = permissionInfo(input); + const startTime = isoToUnixSeconds(record.timestamp); + const toolName = name ?? "tool"; + try { + const toolSpan = this.trackSpan( + turnSpan.startSpan({ + name: toolName, + type: "tool", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input, + metadata: { tool_name: name, call_id: callId, turn_id: turnId, permission }, + ...(permission !== undefined ? { tags: [PERMISSION_TAG] } : {}), + }, + }), + toolName, + "tool", + startTime, + ); + scope.openTools.set(callId, { span: toolSpan, turnId }); + // Remember the TURN span that ran a spawn_agent tool (main scope only) so a + // subagent's root span can be parented as a sibling of the tool (both under + // this turn). The agent_id isn't in the transcript; it arrives on the + // spawn_agent PostToolUse hook, which we map to this turn by call_id. + if (scope.kind === "main" && name === SPAWN_AGENT_TOOL) { + this.spawnTurnSpansByCallId.set(callId, turnSpan); + } + this.logger.info("codex processor: opened tool span", { + queueId: this.queueId, + turnId, + callId, + spanId: toolSpan.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open tool span", { + queueId: this.queueId, + turnId, + callId, + error: String(err), + }); + } + } + + // Close the tool span matching a *_output record's call_id, recording the + // output, and add the result to conversation history as a `tool` message. + private endToolSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; + const output = payload.output ?? payload.result; + scope.conversationHistory.push({ + role: "tool", + content: typeof output === "string" ? output : JSON.stringify(output ?? null), + tool_call_id: callId ?? "", + }); + if (callId === undefined) { + this.logger.debug("codex processor: tool output without call_id", { + queueId: this.queueId, + }); + return; + } + const entry = scope.openTools.get(callId); + if (entry === undefined) { + this.logger.debug("codex processor: tool output with no open tool span", { + queueId: this.queueId, + callId, + }); + return; + } + const endTime = isoToUnixSeconds(record.timestamp); + try { + entry.span.log({ output }); + entry.span.end(endTime !== undefined ? { endTime } : undefined); + // A tool is a child of its turn; advance the turn's boundary so a following + // LLM call (the model resuming after the tool result) starts here. + this.noteChildEnded(scope, entry.turnId, endTime); + this.logger.info("codex processor: ended tool span", { queueId: this.queueId, callId }); + } catch (err) { + this.logger.error("codex processor: failed to end tool span", { + queueId: this.queueId, + callId, + error: String(err), + }); + } finally { + scope.openTools.delete(callId); + } + } + + // End any still-open tool spans owned by the given turn (e.g. a call whose + // output never arrived) when the turn ends, at the turn's end time. + private endOpenToolSpansForTurn( + scope: TranscriptScope, + turnId: string, + endTime: number | undefined, + ): void { + for (const [callId, entry] of scope.openTools) { + if (entry.turnId !== turnId) continue; + try { + entry.span.end(endTime !== undefined ? { endTime } : undefined); + } catch (err) { + this.logger.error("codex processor: failed to end dangling tool span", { + queueId: this.queueId, + turnId, + callId, + error: String(err), + }); + } finally { + scope.openTools.delete(callId); + } + } + } + + // Close every still-open tool span in a scope (used when ending a subagent or + // the root). Clears the scope's openTools. + private closeAllOpenTools( + scope: TranscriptScope, + endArgs: { endTime: number } | undefined, + ): void { + for (const [callId, entry] of scope.openTools) { + try { + entry.span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling tool span", { + queueId: this.queueId, + callId, + error: String(err), + }); + } + } + scope.openTools.clear(); + } + + // The most recently opened turn that is still open in this scope (used to + // attach a user_message or an LLM span, which carry no turn_id). + private latestOpenTurn(scope: TranscriptScope): OpenTurn | undefined { + let last: OpenTurn | undefined; + for (const turn of scope.openTurns.values()) last = turn; + return last; + } + + // The start time (Unix seconds) to use for an LLM span opening now under the + // given turn: the end of the turn's most recently-ended child if any, else the + // turn's own start. This makes the LLM span cover request->response (the + // transcript has no record at request-send time) and lets consecutive children + // tile the turn. Per-turn state, so concurrent subagents never interfere. + private llmStartTimeFor(turn: OpenTurn): number | undefined { + return turn.lastChildEndTime ?? turn.startTime; + } + + // Record that a child of the given turn ended at `endTime`, advancing the + // turn's lastChildEndTime so the next LLM span starts where this child ended. + // No-op when endTime is unknown or the turn isn't open. + private noteChildEnded( + scope: TranscriptScope, + turnId: string | undefined, + endTime: number | undefined, + ): void { + if (turnId === undefined || endTime === undefined) return; + const turn = scope.openTurns.get(turnId); + if (turn === undefined) return; + if (turn.lastChildEndTime === undefined || endTime > turn.lastChildEndTime) { + turn.lastChildEndTime = endTime; + } + } + + // The turn_id of the most recently opened still-open turn in this scope. + private latestOpenTurnId(scope: TranscriptScope): string | undefined { + let last: string | undefined; + for (const turnId of scope.openTurns.keys()) last = turnId; + return last; + } + + // Number of turns (across all scopes) whose Stop has fired but which are still + // open (awaiting their task_complete). flush() polls only while this is > 0. + private countPendingTurns(): number { + let n = 0; + for (const scope of this.scopes.values()) { + for (const turnId of scope.turnsAwaitingCompletion) { + if (scope.openTurns.has(turnId)) n += 1; + } + } + return n; + } + + // End the root span on the first main-session Stop. Subsequent Stops are + // ignored (the SDK records a span's end time once). `endTime` is the completing + // turn's end time so the root isn't stamped with a late wall-clock value; falls + // back to the SDK's now() if unknown. Any still-open spans in the main scope + // are closed at the same time so nothing dangles. + private endRootSpan(endTime: number | undefined): void { if (this.rootSpan === null || this.rootEnded) return; this.rootEnded = true; + this.rootEndTime = endTime; + const endArgs = endTime !== undefined ? { endTime } : undefined; + const scope = this.mainScope; + // Backstop: close any open LLM span, any still-open tool spans, then any + // still-open turn spans in the main scope, so nothing dangles. + if (scope !== null) { + this.endOpenLlmSpan(scope, endTime); + this.closeAllOpenTools(scope, endArgs); + for (const [turnId, turn] of scope.openTurns) { + try { + turn.span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling turn span on root end", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + } + scope.openTurns.clear(); + } try { - this.rootSpan.end(); + this.rootSpan.end(endArgs); this.logger.info("codex processor: ended root span", { queueId: this.queueId }); } catch (err) { this.logger.error("codex processor: failed to end root span", { @@ -257,58 +1819,358 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } + // NOTE: we deliberately do NOT delete the snapshot here. Ending the root span + // means "this turn's Stop fired", not "the session is over" — Codex has no + // session-end hook, and later turns attach as children of this same root. The + // snapshot must persist so a restart between turns can resume the session; + // stale ones are reclaimed by the store's age-based GC. } - private startRootSpan(event: EnqueueEvent): void { - if (this.rootSpan !== null) { - this.logger.warn("codex processor: duplicate SessionStart; keeping existing root span", { + async flush(): Promise { + if (this.rootSpan === null) return; + // Final catch-up: a turn's task_complete can land slightly after its Stop + // hook fires, so the Stop's bounded wait may miss it and leave the turn open. + // flush() happens after the terminal Stop's /flush (and on idle/eviction). + // Poll every scope's transcript until every open turn has closed (or a + // bounded timeout elapses), processing new records as they appear. This + // processes the transcript (it does not unilaterally end spans), staying + // consistent with "the transcript is the truth". + await this.drainOpenTurns(); + try { + await this.rootSpan.flush(); + this.logger.debug("codex processor: flush ok", { queueId: this.queueId }); + } catch (err) { + this.logger.error("codex processor: flush failed", { queueId: this.queueId, + error: String(err), }); - return; } + // Persist state so a future server run can resume this session's trace. Done + // after the catch-up + flush above so the snapshot reflects the latest + // offsets and closed spans. + this.persistState(); + } - const data = (event.eventData ?? {}) as Record; - const model = typeof data.model === "string" ? data.model : undefined; - const cwd = typeof data.cwd === "string" ? data.cwd : undefined; - const source = typeof data.source === "string" ? data.source : undefined; - const permissionMode = - typeof data.permission_mode === "string" ? data.permission_mode : undefined; - - // Name the root span after the directory Codex was launched from, e.g. - // "codex: myapp" for /whatever/myapp. Falls back to "codex session" when cwd - // is unknown. - const projectDir = projectDirName(cwd); - const spanName = projectDir ? `codex: ${projectDir}` : "codex session"; + // ========================================================================== + // State persistence (resume support) + // ========================================================================== + // Persist this session's snapshot so a future server run can resume the trace. + // Best-effort: the store never throws. No-op without a store/session id, or + // before any root span exists (nothing worth resuming yet). + // + // We persist even once the root span has ended. Codex's "Stop" is per-TURN, + // not per-session (there is no session-end hook), and we end the root on the + // first Stop — but the session lives on and more turns can follow, each a + // child of that same (ended) root. So the snapshot must survive past root-end, + // or a server restart between turns would lose the thread and start a brand-new + // trace. Stale snapshots from sessions that truly never resume are reclaimed by + // the store's age-based GC, not by deleting here. + private persistState(): void { + if (this.snapshotStore === null || this.queueId === null) return; + if (this.rootSpan === null) return; // nothing worth resuming yet try { - this.rootSpan = this.spanFactory.startSpan({ - name: spanName, - type: "task", - event: { - input: { model, cwd, source }, - metadata: { - // User-provided extras first, so the standard keys below win on - // conflict. - ...this.reportingConfig?.additionalMetadata, - session_id: this.queueId, - model, - cwd, - source, - permission_mode: permissionMode, - event_source_version: event.eventSourceVersion, - project: this.reportingConfig?.project, - }, - }, + this.snapshotStore.write(this.queueId, this.serialize()); + } catch (err) { + this.logger.error("codex processor: failed to serialize state", { + queueId: this.queueId, + error: String(err), }); - this.logger.info("codex processor: opened root span", { + } + } + + // One-time restore from a snapshot left by a previous server run. Runs lazily + // on the first event (after reporting is configured, so the span factory and + // tracing-enabled check are settled). Guarded so it happens at most once. + private ensureRestored(): void { + if (this.restoreAttempted) return; + this.restoreAttempted = true; + if (this.snapshotStore === null || this.queueId === null) return; + // If we've already built a root span this run, there's nothing to restore + // (we're the original processor, not a post-restart one). + if (this.rootSpan !== null) return; + const snapshot = this.snapshotStore.read(this.queueId); + if (snapshot === null) return; + if (!isCompatibleSnapshot(snapshot, PLUGIN_VERSION)) { + this.logger.info("codex processor: snapshot version mismatch; starting fresh", { queueId: this.queueId, - spanId: this.rootSpan.id, + snapshotVersion: snapshot.pluginVersion, + snapshotSchema: snapshot.schemaVersion, + }); + this.snapshotStore.delete(this.queueId); + return; + } + try { + this.restore(snapshot); + this.logger.info("codex processor: restored session state from snapshot", { + queueId: this.queueId, + scopes: snapshot.scopes.length, }); } catch (err) { - this.logger.error("codex processor: failed to open root span", { + this.logger.error("codex processor: failed to restore snapshot; starting fresh", { queueId: this.queueId, error: String(err), }); } } + + // Remember the name/type a span was created (or renamed) with, so it can be + // re-asserted on rehydration. Returns the span unchanged for call-site + // convenience. + private trackSpan( + span: Span, + name: string, + type: SpanRef["type"], + startTime: number | undefined, + ): Span { + this.spanAttrs.set(span.spanId, { name, type, startTime }); + return span; + } + + // Capture a span's identity plus its remembered name/type/start for the + // snapshot, so a rehydrated handle keeps them (and isn't re-stamped to the + // resume time). + private refFor(span: Span): SpanRef { + const attrs = this.spanAttrs.get(span.spanId); + return spanRef(span, attrs?.name, attrs?.type, attrs?.startTime); + } + + // Build a serializable snapshot of all resumable state. Span handles are + // captured as identities (spanId/rootSpanId/parents) plus their name/type; the + // transcript on disk remains the source of truth for content, so nothing here + // re-derives spans. + private serialize(): CodexSnapshot { + const scopes: ScopeSnapshot[] = []; + for (const scope of this.scopes.values()) { + scopes.push({ + path: scope.path, + kind: scope.kind, + offset: scope.offset, + openTurns: Array.from(scope.openTurns, ([turnId, turn]) => ({ + turnId, + span: this.refFor(turn.span), + startTime: turn.startTime, + lastChildEndTime: turn.lastChildEndTime, + })), + turnsAwaitingCompletion: Array.from(scope.turnsAwaitingCompletion), + conversationHistory: toItemSnapshots(scope.conversationHistory), + openLlm: + scope.openLlm === null + ? null + : { + span: this.refFor(scope.openLlm.span), + turnId: scope.openLlm.turnId, + output: toItemSnapshots(scope.openLlm.output), + lastOutputTime: scope.openLlm.lastOutputTime, + outputPreset: scope.openLlm.outputPreset, + }, + openTools: Array.from(scope.openTools, ([callId, entry]) => ({ + callId, + span: this.refFor(entry.span), + turnId: entry.turnId, + })), + model: scope.model, + lastTurnEndTime: scope.lastTurnEndTime, + subagentRootSpan: + scope.subagentRootSpan === null ? null : this.refFor(scope.subagentRootSpan), + subagentEnded: scope.subagentEnded, + ...(scope.pendingSubagent !== undefined + ? { + pendingSubagent: { + agentId: scope.pendingSubagent.agentId, + agentType: scope.pendingSubagent.agentType, + parent: this.refFor(scope.pendingSubagent.parent), + }, + } + : {}), + }); + } + return { + pluginVersion: PLUGIN_VERSION, + schemaVersion: SNAPSHOT_SCHEMA_VERSION, + sessionId: this.queueId, + savedAt: Date.now(), + reportingConfig: redactReportingConfig(this.reportingConfig), + rootSpan: this.rootSpan === null ? null : this.refFor(this.rootSpan), + rootEnded: this.rootEnded, + rootEndTime: this.rootEndTime, + rootEnrichment: { ...this.rootEnrichment }, + mainScopePath: this.mainScope?.path ?? null, + scopes, + // We deliberately do NOT persist the span-bearing side-maps + // (spawnTurnSpansBy*, compactionSpansByTurn). They retain references to + // spans that have since CLOSED, and rehydrating a closed span re-opens it + // (the SDK stamps a fresh start and no end), producing duplicate, never- + // closing spans on resume. The only things they enable across a restart are + // rare in-flight reconciliations (nesting a subagent spawned right before + // the restart; back-filling a compaction trigger whose hook lands after the + // restart) — acceptable to lose. The trigger map is plain strings (no span + // refs), so it's harmless to keep. + compactionTriggerByTurn: Array.from(this.compactionTriggerByTurn, ([key, value]) => ({ + key, + value, + })), + }; + } + + // Rebuild live state from a snapshot. Span handles are recreated bound to their + // original ids (so further log()/end() merge into the same rows). Each scope's + // lazy parent-span resolver is reconstructed the same way newScope/ + // handleSubagentStart do, so subsequent transcript records attach correctly. + private restore(snapshot: CodexSnapshot): void { + // Reporting config first: it determines the span factory (project/creds). + // The snapshot is the source of truth — we adopt it as-is rather than + // merging in this run's config, so a resume whose env/settings drifted from + // the original session can't clobber the snapshot's values. The one field + // the snapshot can't carry is the `apiKey` secret (intentionally never + // persisted), so we re-attach the live one resolved from this run's config + // event / env. This must happen before we touch `this.spanFactory`, which is + // built lazily from `this.reportingConfig`. + if (snapshot.reportingConfig !== undefined) { + this.reportingConfig = { + ...snapshot.reportingConfig, + apiKey: this.reportingConfig?.apiKey, + }; + } + + const factory = this.spanFactory; + // Rehydrate a handle and re-remember its name/type/start, so if this resumed + // processor is itself snapshotted again, the attributes survive the next hop. + const rehydrate = (ref: SpanRef): Span => { + const span = factory.rehydrateSpan(ref); + if (ref.name !== undefined && ref.type !== undefined) { + this.spanAttrs.set(span.spanId, { + name: ref.name, + type: ref.type, + startTime: ref.startTime, + }); + } + return span; + }; + + this.rootSpan = snapshot.rootSpan === null ? null : rehydrate(snapshot.rootSpan); + this.rootEnded = snapshot.rootEnded; + this.rootEndTime = snapshot.rootEndTime; + this.rootEnrichment = { ...snapshot.rootEnrichment }; + // Rehydrating the root necessarily re-emits its row with no end time. If the + // root was already ended before the restart, re-assert that end now so it + // stays closed (a later turn can still attach as a child of an ended root). + if (this.rootEnded && this.rootSpan !== null) { + try { + this.rootSpan.end( + this.rootEndTime !== undefined ? { endTime: this.rootEndTime } : undefined, + ); + } catch (err) { + this.logger.error("codex processor: failed to re-end root span on restore", { + queueId: this.queueId, + error: String(err), + }); + } + } + + for (const s of snapshot.scopes) { + const isMain = s.path === snapshot.mainScopePath; + // The lazy parent resolver mirrors live construction: a main scope's turns + // hang under the root span; a subagent scope's under its own (restored) + // subagent root span. + const scope: TranscriptScope = { + path: s.path, + kind: s.kind, + parentSpan: isMain ? () => this.rootSpan : () => scope.subagentRootSpan, + offset: s.offset, + openTurns: new Map( + s.openTurns.map((t) => [ + t.turnId, + { + span: rehydrate(t.span), + startTime: t.startTime, + lastChildEndTime: t.lastChildEndTime, + }, + ]), + ), + turnsAwaitingCompletion: new Set(s.turnsAwaitingCompletion), + conversationHistory: fromItemSnapshots(s.conversationHistory), + openLlm: + s.openLlm === null + ? null + : { + span: rehydrate(s.openLlm.span), + turnId: s.openLlm.turnId, + output: fromItemSnapshots(s.openLlm.output), + lastOutputTime: s.openLlm.lastOutputTime, + outputPreset: s.openLlm.outputPreset, + }, + openTools: new Map( + s.openTools.map((t) => [t.callId, { span: rehydrate(t.span), turnId: t.turnId }]), + ), + model: s.model, + lastTurnEndTime: s.lastTurnEndTime, + subagentRootSpan: s.subagentRootSpan === null ? null : rehydrate(s.subagentRootSpan), + subagentEnded: s.subagentEnded, + ...(s.pendingSubagent !== undefined + ? { + pendingSubagent: { + agentId: s.pendingSubagent.agentId, + agentType: s.pendingSubagent.agentType, + parent: rehydrate(s.pendingSubagent.parent), + }, + } + : {}), + }; + this.scopes.set(s.path, scope); + if (isMain) this.mainScope = scope; + } + + // Span-bearing side-maps (spawnTurnSpansBy*, compactionSpansByTurn) are not + // persisted — see serialize() — so there is nothing to rehydrate for them, + // which is what keeps a resume from re-opening already-closed spans. Only the + // plain-string trigger map is restored. + for (const e of snapshot.compactionTriggerByTurn) { + this.compactionTriggerByTurn.set(e.key, e.value); + } + } + + // Read and process new transcript records for every scope, retrying on a + // bounded interval, until no turn that has seen its Stop is still open (its + // task_complete can land seconds after the Stop hook) or the timeout elapses. + // Always does at least one read so a task_complete already on disk closes its + // turn immediately. Crucially, this does NOT wait for turns that are merely + // mid-progress (no Stop yet) — so an idle flush during an active turn returns + // promptly. On timeout, leaves whatever is still open for the backstops. + private async drainOpenTurns(): Promise { + if (this.scopes.size === 0) return; + const deadline = Date.now() + FLUSH_TIMEOUT_MS; + for (;;) { + for (const scope of this.scopes.values()) { + // A finalized subagent scope is inert: re-reading it would re-open spans + // that were already closed (and the SDK would overwrite the closed spans + // with un-ended ones). Skip it. + if (scope.subagentEnded) continue; + try { + const { lines, offset } = this.transcriptReader.readFrom(scope.path, scope.offset); + scope.offset = offset; + this.consumeLines(scope, lines); + } catch (err) { + this.logger.error("codex processor: final catch-up read failed", { + queueId: this.queueId, + path: scope.path, + error: String(err), + }); + } + } + // Only turns whose Stop has fired but whose task_complete hasn't arrived + // keep us waiting. (endTurnSpan removes them from both sets as they close.) + const pending = this.countPendingTurns(); + if (pending === 0 || Date.now() >= deadline) { + if (pending > 0) { + this.logger.warn("codex processor: turns awaiting completion at flush timeout", { + queueId: this.queueId, + pending, + }); + } + return; + } + await sleep(FLUSH_INTERVAL_MS); + } + } } diff --git a/plugins/trace-codex/src/agents/codex/register.ts b/plugins/trace-codex/src/agents/codex/register.ts index a905601..f4b194d 100644 --- a/plugins/trace-codex/src/agents/codex/register.ts +++ b/plugins/trace-codex/src/agents/codex/register.ts @@ -3,10 +3,12 @@ // another agent means adding a sibling module that exports the same shape. import type { EventBuilder } from "../../client/client.ts"; +import type { Logger } from "../../log.ts"; import type { EventProcessorFactory } from "../../processor/event-processor.ts"; import { buildEnqueueEvents, CODEX_EVENT_SOURCE } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; import { applySettingsToEnv, loadSettingsFile, settingsFilePath } from "./settings.ts"; +import { SnapshotStore } from "./snapshot-store.ts"; export interface Agent { /** Event source string this agent's events carry (matches the registry key). */ @@ -17,9 +19,11 @@ export interface Agent { buildEvents: EventBuilder; /** * Event names that terminate a turn/session. After enqueuing one of these, - * the hook asks the server to flush synchronously so the final spans are - * delivered before the process tree is torn down (e.g. in CI). Codex only - * exposes a per-turn "Stop" hook today; there is no session-end event. + * the hook asks the server to flush. By default the flush is fire-and-forget + * so the turn isn't stalled; setting BRAINTRUST_FLUSH_ON_TURN_END makes the + * hook block until the server confirms the final spans are delivered before + * the process tree is torn down (e.g. in CI). Codex only exposes a per-turn + * "Stop" hook today; there is no session-end event. */ terminalEvents: string[]; /** @@ -31,10 +35,28 @@ export interface Agent { loadSettings: () => string[]; } +// One shared snapshot store for all Codex sessions, created on first use. It +// persists each session's resumable state under PLUGIN_DATA/state so a restarted +// server can rehydrate in-progress traces. Created lazily (reusing the first +// session's logger) so a one-time GC of stale orphan snapshots runs the first +// time we actually trace, and so this module stays side-effect-free on import. +let sharedSnapshotStore: SnapshotStore | null = null; +function snapshotStore(logger: Logger): SnapshotStore { + if (sharedSnapshotStore === null) { + sharedSnapshotStore = new SnapshotStore(logger); + // Reclaim stale snapshots. Snapshots intentionally outlive their root span + // (later turns resume onto the same root), so they are never self-deleted on + // root-end; the only cleanup is this age-based sweep of ones that never + // resumed. + sharedSnapshotStore.gcOlderThan(); + } + return sharedSnapshotStore; +} + export const codexAgent: Agent = { eventSource: CODEX_EVENT_SOURCE, createProcessor: (queueId, logger, spanFactoryProvider) => - new CodexEventProcessor(queueId, logger, spanFactoryProvider), + new CodexEventProcessor(queueId, logger, spanFactoryProvider, undefined, snapshotStore(logger)), buildEvents: buildEnqueueEvents, terminalEvents: ["Stop"], loadSettings: () => applySettingsToEnv(loadSettingsFile(settingsFilePath())), diff --git a/plugins/trace-codex/src/agents/codex/resume.test.ts b/plugins/trace-codex/src/agents/codex/resume.test.ts new file mode 100644 index 0000000..488bee4 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/resume.test.ts @@ -0,0 +1,671 @@ +// Resume / state-persistence tests. +// +// When the event server shuts down mid-session (idle timeout) or the user closes +// and later resumes, a fresh CodexEventProcessor is created for the same session +// id. These tests verify that, given a persisted snapshot, the new processor +// rehydrates the original spans and continues the SAME trace — producing the +// same result as if one processor had handled the whole session — instead of +// duplicating or orphaning spans. + +import { describe, expect, test } from "bun:test"; +import type { SpanFactory } from "../../braintrust/logger.ts"; +import { + type CapturedSpan, + createTestLogger, + diffSpan, + type ExpectedSpan, + spansToTree, + withCapturedTrace, +} from "../../test-helpers.ts"; +import { CodexEventProcessor } from "./event-processor.ts"; +import { SnapshotStore } from "./snapshot-store.ts"; +import type { CodexSnapshot } from "./state-snapshot.ts"; +import { + compacted, + configEvent, + FakeTranscriptReader, + functionCall, + functionCallOutput, + preToolUse, + sessionMeta, + sessionStart, + stop, + type TraceEntry, + taskComplete, + taskStarted, + tokenCount, + transcript, + turnContext, + userMessage, +} from "./test-helpers.ts"; + +/** An in-memory snapshot store, so tests don't touch disk. Mirrors the disk + * store's surface (the parts the processor uses). */ +class MemorySnapshotStore extends SnapshotStore { + private readonly mem = new Map(); + + constructor() { + super(createTestLogger(), { dir: "/dev/null/unused" }); + } + + override read(sessionId: string): CodexSnapshot | null { + const snap = this.mem.get(sessionId); + // Deep-clone so callers can't mutate stored state (matches disk JSON). + return snap ? (JSON.parse(JSON.stringify(snap)) as CodexSnapshot) : null; + } + + override write(sessionId: string, snapshot: CodexSnapshot): void { + this.mem.set(sessionId, JSON.parse(JSON.stringify(snapshot)) as CodexSnapshot); + } + + override delete(sessionId: string): void { + this.mem.delete(sessionId); + } + + has(sessionId: string): boolean { + return this.mem.has(sessionId); + } +} + +const isTranscriptEntry = (e: TraceEntry): e is ReturnType => + (e as { kind?: string }).kind === "transcript"; + +/** + * Run a list of entries (hooks + transcript writes) through a processor, then + * flush. Shares the reader, factory, and store across calls so a second + * processor can resume the first's session. Returns nothing; spans accumulate in + * the shared captured trace. + */ +async function runThrough( + entries: TraceEntry[], + deps: { + queueId: string; + reader: FakeTranscriptReader; + factory: SpanFactory; + store: SnapshotStore; + }, +): Promise { + const processor = new CodexEventProcessor( + deps.queueId, + createTestLogger(), + () => deps.factory, + deps.reader, + deps.store, + ); + for (const entry of entries) { + if (isTranscriptEntry(entry)) { + deps.reader.append(JSON.stringify(entry.record)); + } else { + await processor.process(entry); + } + } + await processor.flush(); +} + +/** Assert a captured trace matches an expected tree. */ +function expectTrace(spans: CapturedSpan[], expected: ExpectedSpan): void { + const tree = spansToTree(spans); + const diffs = diffSpan(tree, expected, "root"); + if (diffs.length > 0) { + throw new Error(`trace does not match expected:\n${diffs.join("\n")}`); + } +} + +describe("CodexEventProcessor: resume from snapshot", () => { + // The realistic mid-session restart: the server idle-shuts-down DURING a turn + // (before its Stop), then the next hook arrives on a new processor. The + // resumed processor must rehydrate the open root + turn and finish them, + // yielding one trace with the turn closed exactly once (no duplicates). + test("an idle restart mid-turn rehydrates open spans and finishes them", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-midturn"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: open the session and a turn, but NO Stop yet. The SessionStart + // and a PreToolUse hook drive transcript catch-up (in production every + // hook does). An idle flush then persists the open state. The config + // carries an apiKey to prove it's never written to the snapshot. + await runThrough( + [ + configEvent({ session_id: queueId, apiKey: "sk-secret-do-not-persist" }), + sessionMeta({ id: queueId, cwd: "/tmp/proj" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "hello" }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + // Mid-session: the root and turn are open, so a snapshot exists. + expect(store.has(queueId)).toBe(true); + const snap = store.read(queueId); + expect(snap?.rootEnded).toBe(false); + expect(snap?.scopes[0]?.openTurns.length).toBe(1); + // The persisted snapshot must never contain the apiKey (secret). + expect(JSON.stringify(snap)).not.toContain("apiKey"); + + // Run 2: a NEW processor (server restarted). The first event isn't a + // SessionStart — it's the Stop closing the turn. The resumed processor + // must restore the open root+turn and finish them. + await runThrough( + [ + // No config event this run: the resumed processor uses the snapshot's + // reporting config. (In production a resume DOES re-send config, but + // restore must work without it too.) + taskComplete({ turn_id: "turn-1", last_agent_message: "all done" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + // The resumed Stop ended the root, but the snapshot is kept (more turns + // could still follow — Codex's Stop is per-turn, not per-session). It's + // updated to reflect the now-ended root. + expect(store.has(queueId)).toBe(true); + expect(store.read(queueId)?.rootEnded).toBe(true); + + const spans = await trace.drain(); + // Exactly one root, one turn — no duplicates from the restart. + expectTrace(spans, { + span_attributes: { name: "codex: proj", type: "task" }, + metadata: { session_id: queueId }, + ended: true, + children: [ + { + span_attributes: { name: "turn: turn-1", type: "task" }, + input: "hello", + output: "all done", + ended: true, + children: [], + }, + ], + }); + } finally { + trace.cleanup(); + } + }); + + // The /shutdown-between-turns case: turn 1 completes (its Stop fires, which + // ends the root), the server is shut down, then the user sends another message. + // The root ending on the first Stop must NOT discard the snapshot — turn 2 has + // to resume under the SAME root, not start a brand-new trace. + test("a turn sent after a between-turns shutdown resumes under the same root", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-multiturn"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a full first turn, including its Stop (which ends the root span). + // Then the server shuts down (we stop using processor A after its flush). + await runThrough( + [ + configEvent({ session_id: queueId }), + sessionMeta({ id: queueId, cwd: "/tmp/proj" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "first" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "did first" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + // The root ended on turn 1's Stop — but the snapshot must SURVIVE, since + // more turns can still come (Codex's Stop is per-turn, not per-session). + expect(store.has(queueId)).toBe(true); + expect(store.read(queueId)?.rootEnded).toBe(true); + + // Run 2: a NEW processor (server restarted after /shutdown). A second turn + // arrives. It must attach under the SAME restored root, not a new one. + await runThrough( + [ + taskStarted({ turn_id: "turn-2" }), + userMessage({ message: "second" }), + taskComplete({ turn_id: "turn-2", last_agent_message: "did second" }), + stop({ turn_id: "turn-2" }), + ], + deps, + ); + + const spans = await trace.drain(); + // One root, two turns under it — a single continuous trace. + expectTrace(spans, { + span_attributes: { name: "codex: proj", type: "task" }, + metadata: { session_id: queueId }, + children: [ + { + span_attributes: { name: "turn: turn-1", type: "task" }, + output: "did first", + ended: true, + children: [], + }, + { + span_attributes: { name: "turn: turn-2", type: "task" }, + input: "second", + output: "did second", + ended: true, + children: [], + }, + ], + }); + } finally { + trace.cleanup(); + } + }); + + // A deeper hierarchy (turn -> tool) open across the restart must rehydrate with + // the correct span names (not SDK-inferred ones) and close cleanly. + test("an open tool span survives a restart and closes with its original name", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-tool"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a turn opens and calls a tool (function_call), but the tool's + // output hasn't arrived yet when the server idle-shuts-down. + await runThrough( + [ + configEvent({ session_id: queueId }), + sessionMeta({ id: queueId, cwd: "/tmp/proj" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "run a tool" }), + functionCall({ turn_id: "turn-1", call_id: "call-1", name: "shell" }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + const snap = store.read(queueId); + expect(snap?.scopes[0]?.openTools.length).toBe(1); + expect(snap?.scopes[0]?.openTools[0]?.span.name).toBe("shell"); + + // Run 2: a fresh processor. The tool output and turn completion arrive, then + // the Stop. The rehydrated tool span must close under the rehydrated turn. + await runThrough( + [ + functionCallOutput({ call_id: "call-1", output: "ok" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "tool done" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const spans = await trace.drain(); + expectTrace(spans, { + span_attributes: { name: "codex: proj", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: turn-1", type: "task" }, + output: "tool done", + ended: true, + children: [ + // The model call that emitted the tool call opens an llm span. + { span_attributes: { name: "gpt-5.5", type: "llm" }, ended: true }, + { + span_attributes: { name: "shell", type: "tool" }, + output: "ok", + ended: true, + children: [], + }, + ], + }, + ], + }); + } finally { + trace.cleanup(); + } + }); + + // The per-turn timing used to place LLM spans (turn start + last-child-end) + // must survive a restart, so an LLM call that opens AFTER the restart still + // starts at the right place (here: the turn's start, since it's the first + // child) rather than at its post-restart output record. + test("turn timing survives a restart so a resumed llm span starts correctly", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-timing"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: the turn opens at :11 but the model hasn't produced output yet when + // the server shuts down (no llm span exists yet — only the open turn). + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "turn_context", + payload: { model: "gpt-5.5" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + // The open turn's start time was persisted. + const snap = store.read(queueId); + expect(snap?.scopes[0]?.openTurns[0]?.turnId).toBe("turn-1"); + expect(snap?.scopes[0]?.openTurns[0]?.startTime).toBe(1767225611); + + // Run 2: a fresh processor. The model output finally lands (:18) and the + // call closes (:18). The resumed llm span must start at the turn's start + // (:11, restored), not at its output record (:18). + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:18Z", + type: "response_item", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "hi" }], + }, + }), + transcript({ + timestamp: "2026-01-01T00:00:18Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } }, + }), + transcript({ + timestamp: "2026-01-01T00:00:19Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "hi" }, + }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const spans = await trace.drain(); + const tree = spansToTree(spans); + const turn = tree?.children.find((c) => c.name === "turn: turn-1"); + const llm = turn?.children.find((c) => c.name === "gpt-5.5" || c.name === "llm"); + // Started at the restored turn start (:11), ended at token_count (:18). + expect(llm?.metrics?.start).toBe(1767225611); + expect(llm?.metrics?.end).toBe(1767225618); + } finally { + trace.cleanup(); + } + }); + + test("a version-incompatible snapshot is discarded and the session starts fresh", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-badver"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + + // Plant a snapshot with an incompatible plugin version. + store.write(queueId, { + pluginVersion: "0.0.0-incompatible", + schemaVersion: 1, + sessionId: queueId, + savedAt: Date.now(), + reportingConfig: { traceToBraintrust: true }, + rootSpan: { spanId: "stale", rootSpanId: "stale", spanParents: [] }, + rootEnded: false, + rootEndTime: undefined, + rootEnrichment: {}, + mainScopePath: "/old.jsonl", + scopes: [], + compactionTriggerByTurn: [], + }); + + const deps = { queueId, reader, factory: trace.spanFactory, store }; + await runThrough( + [ + configEvent({ session_id: queueId }), + sessionMeta({ id: queueId, cwd: "/tmp/fresh" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "hi" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "ok" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const spans = await trace.drain(); + // A fresh trace was built (the stale snapshot's "stale" span id is absent), + // and the incompatible snapshot was removed. + expect(spans.some((s) => s.span_id === "stale")).toBe(false); + expectTrace(spans, { + span_attributes: { name: "codex: fresh", type: "task" }, + children: [{ span_attributes: { name: "turn: turn-1" }, output: "ok", children: [] }], + }); + } finally { + trace.cleanup(); + } + }); + + // Regression for the duplicate/never-closing compaction span: a compaction that + // FINISHED before the restart must not be re-opened on resume. (Previously the + // compaction span lived forever in a side-map and was rehydrated — re-emitting + // it with a fresh start and no end.) + test("a completed compaction is not re-opened on resume", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-compact"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a turn, then a compaction turn that completes, all before shutdown. + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "turn_context", + payload: { model: "gpt-5.5" }, + }), + // A normal turn. + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + userMessage({ message: "hi" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "hello" }), + stop({ turn_id: "turn-1" }), + // A compaction turn that fully completes. + transcript({ + timestamp: "2026-01-01T00:00:20Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-2" }, + }), + compacted({ + replacement_history: [{ type: "message", role: "user", content: [] }], + window_id: 1, + }), + tokenCount({ total_tokens: 5 }), + transcript({ + timestamp: "2026-01-01T00:00:22Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-2", last_agent_message: "" }, + }), + // A non-terminal hook drives the catch-up that consumes the above. + preToolUse({ session_id: queueId }), + ], + deps, + ); + + // The compaction span finished in run 1. + let spans = await trace.drain(); + const compaction1 = spans.find((s) => s.span_attributes?.name === "compaction"); + expect(compaction1?.metrics?.end).toBeDefined(); + const compactionId = compaction1?.span_id; + + // Run 2: a new processor resumes and runs another turn. + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:30Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-3" }, + }), + userMessage({ message: "again" }), + taskComplete({ turn_id: "turn-3", last_agent_message: "ok" }), + stop({ turn_id: "turn-3" }), + ], + deps, + ); + + spans = await trace.drain(); + // The compaction span must NOT have been re-emitted (reopened) on resume: + // run 2 produces no row for its span_id. + expect(spans.some((s) => s.span_id === compactionId)).toBe(false); + } finally { + trace.cleanup(); + } + }); + + // The root span, ended before a restart, must keep its original start AND end + // after resume — rehydrating it re-emits its row, so we re-assert the end. + test("a root ended before a restart keeps its original start and end", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-rootend"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a full turn; its Stop ends the root at the turn's end (:13). + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + userMessage({ message: "hi" }), + transcript({ + timestamp: "2026-01-01T00:00:13Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "hello" }, + }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const snap = store.read(queueId); + expect(snap?.rootEnded).toBe(true); + expect(snap?.rootEndTime).toBe(1767225613); + + // Run 2: a new processor resumes (a second turn arrives). + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:20Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-2" }, + }), + userMessage({ message: "again" }), + taskComplete({ turn_id: "turn-2", last_agent_message: "ok" }), + stop({ turn_id: "turn-2" }), + ], + deps, + ); + + const spans = await trace.drain(); + const tree = spansToTree(spans); + // The root kept its original start (:10) and end (:13) — not corrupted to + // the resume time, not left open. + expect(tree?.metrics?.start).toBe(1767225610); + expect(tree?.metrics?.end).toBe(1767225613); + // The second turn still attached under the same root. + expect(tree?.children.some((c) => c.name === "turn: turn-2")).toBe(true); + } finally { + trace.cleanup(); + } + }); + + // An OPEN span spanning the restart must keep its original start (the SDK would + // otherwise stamp the rehydration time). Here the turn opens at :11 in run 1 + // and is closed in run 2. + test("an open turn span keeps its original start across a restart", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-openstart"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + userMessage({ message: "hi" }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:30Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "done" }, + }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const tree = spansToTree(await trace.drain()); + const turn = tree?.children.find((c) => c.name === "turn: turn-1"); + // Start preserved from run 1 (:11), end from run 2 (:30) — not the resume time. + expect(turn?.metrics?.start).toBe(1767225611); + expect(turn?.metrics?.end).toBe(1767225630); + } finally { + trace.cleanup(); + } + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/settings.test.ts b/plugins/trace-codex/src/agents/codex/settings.test.ts index 1ace67b..068816c 100644 --- a/plugins/trace-codex/src/agents/codex/settings.test.ts +++ b/plugins/trace-codex/src/agents/codex/settings.test.ts @@ -70,6 +70,16 @@ describe("loadSettingsFile", () => { expect(loadSettingsFile(path)).toEqual({}); }); + test("parses flushOnTurnEnd boolean; ignores non-boolean", () => { + expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: true })))).toEqual({ + flushOnTurnEnd: true, + }); + expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: false })))).toEqual({ + flushOnTurnEnd: false, + }); + expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: "yes" })))).toEqual({}); + }); + test("missing file returns {}", () => { expect(loadSettingsFile(join(dir, "absent.json"))).toEqual({}); }); @@ -125,6 +135,13 @@ describe("applySettingsToEnv", () => { expect(env.TRACE_TO_BRAINTRUST).toBe("true"); expect(env.BRAINTRUST_ADDITIONAL_METADATA).toBe('{"team":"platform"}'); }); + + test("maps flushOnTurnEnd to BRAINTRUST_FLUSH_ON_TURN_END", () => { + const env: NodeJS.ProcessEnv = {}; + const applied = applySettingsToEnv({ flushOnTurnEnd: true }, env); + expect(env.BRAINTRUST_FLUSH_ON_TURN_END).toBe("true"); + expect(applied).toEqual(["flushOnTurnEnd"]); + }); }); describe("settingsFilePath", () => { diff --git a/plugins/trace-codex/src/agents/codex/settings.ts b/plugins/trace-codex/src/agents/codex/settings.ts index f1cc7f0..3373539 100644 --- a/plugins/trace-codex/src/agents/codex/settings.ts +++ b/plugins/trace-codex/src/agents/codex/settings.ts @@ -25,6 +25,13 @@ export interface Settings { project?: string; /** Master switch: when false or unset, no traces are reported to Braintrust. */ traceToBraintrust?: boolean; + /** + * When true, the hook blocks at each turn's end (the Stop event) until the + * server confirms all events are processed and spans are flushed. Useful in + * programmatic/CI runs to guarantee traces are delivered before Codex exits. + * Defaults to false (fire-and-forget flush so the turn isn't stalled). + */ + flushOnTurnEnd?: boolean; /** Extra metadata merged into the root span (standard keys win on conflict). */ additionalMetadata?: Record; /** If set, record every event to this NDJSON file (for replay). */ @@ -44,6 +51,7 @@ export const SETTINGS_TO_ENV: Record = { appUrl: "BRAINTRUST_APP_URL", project: "BRAINTRUST_PROJECT", traceToBraintrust: "TRACE_TO_BRAINTRUST", + flushOnTurnEnd: "BRAINTRUST_FLUSH_ON_TURN_END", additionalMetadata: "BRAINTRUST_ADDITIONAL_METADATA", recordFile: "BRAINTRUST_EVENT_SERVER_RECORD_FILE", port: "BRAINTRUST_EVENT_SERVER_PORT", @@ -55,6 +63,8 @@ const SETTING_KEYS = Object.keys(SETTINGS_TO_ENV) as Array; const NUMBER_KEYS = new Set(["port", "idleTimeoutMs", "idleCheckIntervalMs"]); +const BOOLEAN_KEYS = new Set(["traceToBraintrust", "flushOnTurnEnd"]); + /** * The plugin's writable data directory, where config.json lives. Resolved * independently of the server Config layer so settings detection stays its own @@ -104,8 +114,8 @@ export function loadSettingsFile(path: string): Settings { if (typeof value === "number" && Number.isFinite(value)) { (settings as Record)[key] = value; } - } else if (key === "traceToBraintrust") { - if (typeof value === "boolean") settings.traceToBraintrust = value; + } else if (BOOLEAN_KEYS.has(key)) { + if (typeof value === "boolean") (settings as Record)[key] = value; } else if (key === "additionalMetadata") { if (typeof value === "object" && !Array.isArray(value)) { settings.additionalMetadata = value as Record; diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts new file mode 100644 index 0000000..2a85a85 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts @@ -0,0 +1,103 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdtempSync, rmSync, utimesSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createTestLogger } from "../../test-helpers.ts"; +import { SnapshotStore } from "./snapshot-store.ts"; +import type { CodexSnapshot } from "./state-snapshot.ts"; + +function makeSnapshot(sessionId: string): CodexSnapshot { + return { + pluginVersion: "9.9.9", + schemaVersion: 1, + sessionId, + savedAt: Date.now(), + reportingConfig: { project: "p", traceToBraintrust: true }, + rootSpan: { spanId: "r", rootSpanId: "r", spanParents: [] }, + rootEnded: false, + rootEndTime: undefined, + rootEnrichment: { source: "startup" }, + mainScopePath: "/t.jsonl", + scopes: [], + compactionTriggerByTurn: [], + }; +} + +describe("SnapshotStore", () => { + let dir: string; + let store: SnapshotStore; + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), "snap-store-")); + store = new SnapshotStore(createTestLogger(), { dir }); + }); + + afterEach(() => { + rmSync(dir, { recursive: true, force: true }); + }); + + test("read returns null when no snapshot exists", () => { + expect(store.read("missing")).toBeNull(); + }); + + test("write then read round-trips a snapshot", () => { + const snap = makeSnapshot("sess-1"); + store.write("sess-1", snap); + const loaded = store.read("sess-1"); + expect(loaded).toEqual(snap); + }); + + test("write is isolated per session id", () => { + store.write("a", makeSnapshot("a")); + store.write("b", makeSnapshot("b")); + expect(store.read("a")?.sessionId).toBe("a"); + expect(store.read("b")?.sessionId).toBe("b"); + }); + + test("delete removes a snapshot", () => { + store.write("sess-1", makeSnapshot("sess-1")); + store.delete("sess-1"); + expect(store.read("sess-1")).toBeNull(); + }); + + test("delete of a missing snapshot does not throw", () => { + expect(() => store.delete("nope")).not.toThrow(); + }); + + test("read of a malformed snapshot file returns null (never throws)", () => { + store.write("bad", makeSnapshot("bad")); + // Corrupt the file on disk. + writeFileSync(join(dir, "state", "bad.json"), "{ not json"); + expect(store.read("bad")).toBeNull(); + }); + + test("a session id with path separators can't escape the state dir", () => { + const evil = "../../escape"; + store.write(evil, makeSnapshot(evil)); + // It round-trips by the same (sanitized) key... + expect(store.read(evil)?.sessionId).toBe(evil); + // ...and never created anything outside the state dir. + expect(() => rmSync(join(dir, "state"), { recursive: true })).not.toThrow(); + }); + + test("gcOlderThan removes snapshots older than the ttl but keeps fresh ones", () => { + store.write("old", makeSnapshot("old")); + store.write("fresh", makeSnapshot("fresh")); + // Backdate "old" two hours. + const oldPath = join(dir, "state", "old.json"); + const twoHoursAgo = new Date(Date.now() - 2 * 60 * 60 * 1000); + utimesSync(oldPath, twoHoursAgo, twoHoursAgo); + + store.gcOlderThan(60 * 60 * 1000); // 1 hour ttl + + expect(store.read("old")).toBeNull(); + expect(store.read("fresh")?.sessionId).toBe("fresh"); + }); + + test("gcOlderThan on a non-existent state dir does not throw", () => { + const empty = new SnapshotStore(createTestLogger(), { + dir: join(dir, "does-not-exist"), + }); + expect(() => empty.gcOlderThan()).not.toThrow(); + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.ts new file mode 100644 index 0000000..625e23d --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/snapshot-store.ts @@ -0,0 +1,161 @@ +// Per-session snapshot store: persists each Codex session's resumable processor +// state to one JSON file under the plugin's writable data directory, so a +// restarted server can rehydrate the session's in-progress trace. +// +// This is Codex-owned (not part of the generic event server): *where* a plugin +// may persist state is the coding agent's call. Codex hands plugins a writable +// PLUGIN_DATA directory; that's our base. Each agent that wants resume support +// brings its own store with whatever location rules its host imposes. +// +// Like the event recorder and the span logger, this never throws: a persistence +// failure must not be able to break a Codex turn. Every operation is wrapped and +// failures are logged and swallowed. Writes are atomic (temp file + rename) so a +// crash mid-write can't leave a half-written snapshot that fails to parse. + +import { + mkdirSync, + readdirSync, + readFileSync, + renameSync, + rmSync, + statSync, + writeFileSync, +} from "node:fs"; +import { join } from "node:path"; +import type { Logger } from "../../log.ts"; +import type { CodexSnapshot } from "./state-snapshot.ts"; + +/** Subdirectory of the data dir that holds per-session snapshots. */ +const STATE_SUBDIR = "state"; + +/** Default age after which an orphaned snapshot is garbage-collected (7 days). + * Orphans happen when a server crashes before a session's root span ends (which + * is when the snapshot is normally deleted). */ +export const DEFAULT_SNAPSHOT_TTL_MS = 7 * 24 * 60 * 60 * 1000; + +/** + * Resolve the plugin's writable data directory. Precedence matches the rest of + * the plugin (server Config + settings): explicit override, then Codex's + * PLUGIN_DATA, then a temp fallback so the binary still runs standalone. + */ +function dataDir(env: NodeJS.ProcessEnv): string { + return ( + env.BRAINTRUST_EVENT_SERVER_LOG_DIR || + env.PLUGIN_DATA || + `${env.TMPDIR || "/tmp"}/braintrust-event-server` + ); +} + +/** + * Map a session id to its snapshot filename. Session ids are UUIDs, but sanitize + * defensively so a stray id can never escape the state directory. + */ +function snapshotFileName(sessionId: string): string { + const safe = sessionId.replace(/[^A-Za-z0-9._-]/g, "_"); + return `${safe}.json`; +} + +export interface SnapshotStoreOptions { + /** Override the data dir base. Defaults to the resolved PLUGIN_DATA. */ + dir?: string; + env?: NodeJS.ProcessEnv; +} + +/** + * Reads and writes per-session {@link CodexSnapshot}s under + * `/state/.json`. All methods are best-effort and never + * throw. + */ +export class SnapshotStore { + private readonly dir: string; + private readonly logger: Logger; + + constructor(logger: Logger, options: SnapshotStoreOptions = {}) { + this.logger = logger; + const base = options.dir ?? dataDir(options.env ?? process.env); + this.dir = join(base, STATE_SUBDIR); + } + + private pathFor(sessionId: string): string { + return join(this.dir, snapshotFileName(sessionId)); + } + + /** Load a session's snapshot, or null if absent/unreadable/malformed. */ + read(sessionId: string): CodexSnapshot | null { + let raw: string; + try { + raw = readFileSync(this.pathFor(sessionId), "utf8"); + } catch { + return null; // absent is the common case + } + try { + return JSON.parse(raw) as CodexSnapshot; + } catch (err) { + this.logger.warn("snapshot store: malformed snapshot; ignoring", { + sessionId, + error: String(err), + }); + return null; + } + } + + /** Persist a session's snapshot atomically. Never throws. */ + write(sessionId: string, snapshot: CodexSnapshot): void { + const finalPath = this.pathFor(sessionId); + const tmpPath = `${finalPath}.${process.pid}.tmp`; + try { + mkdirSync(this.dir, { recursive: true }); + writeFileSync(tmpPath, JSON.stringify(snapshot)); + renameSync(tmpPath, finalPath); + } catch (err) { + this.logger.error("snapshot store: write failed", { + sessionId, + error: String(err), + }); + // Best-effort cleanup of the temp file so it doesn't linger. + try { + rmSync(tmpPath, { force: true }); + } catch { + // ignore + } + } + } + + /** Delete a session's snapshot (e.g. once its root span has ended). */ + delete(sessionId: string): void { + try { + rmSync(this.pathFor(sessionId), { force: true }); + } catch (err) { + this.logger.error("snapshot store: delete failed", { + sessionId, + error: String(err), + }); + } + } + + /** + * Remove snapshots older than `ttlMs` (by file mtime). Called at server + * startup to clean up orphans left by a previous crash. Never throws. + */ + gcOlderThan(ttlMs: number = DEFAULT_SNAPSHOT_TTL_MS): void { + let entries: string[]; + try { + entries = readdirSync(this.dir); + } catch { + return; // no state dir yet: nothing to GC + } + const cutoff = Date.now() - ttlMs; + for (const name of entries) { + if (!name.endsWith(".json")) continue; + const full = join(this.dir, name); + try { + if (statSync(full).mtimeMs < cutoff) { + rmSync(full, { force: true }); + this.logger.debug("snapshot store: gc removed stale snapshot", { file: name }); + } + } catch (err) { + this.logger.debug("snapshot store: gc skip", { file: name, error: String(err) }); + } + } + } +} diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts new file mode 100644 index 0000000..5498993 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts @@ -0,0 +1,62 @@ +import { describe, expect, test } from "bun:test"; +import { + type CodexSnapshot, + isCompatibleSnapshot, + redactReportingConfig, + SNAPSHOT_SCHEMA_VERSION, +} from "./state-snapshot.ts"; + +function snapshot(overrides: Partial = {}): CodexSnapshot { + return { + pluginVersion: "1.2.3", + schemaVersion: SNAPSHOT_SCHEMA_VERSION, + sessionId: "s", + savedAt: 0, + reportingConfig: undefined, + rootSpan: null, + rootEnded: false, + rootEndTime: undefined, + rootEnrichment: {}, + mainScopePath: null, + scopes: [], + compactionTriggerByTurn: [], + ...overrides, + }; +} + +describe("isCompatibleSnapshot", () => { + test("accepts a matching plugin + schema version", () => { + expect(isCompatibleSnapshot(snapshot(), "1.2.3")).toBe(true); + }); + + test("rejects a mismatched plugin version", () => { + expect(isCompatibleSnapshot(snapshot({ pluginVersion: "9.9.9" }), "1.2.3")).toBe(false); + }); + + test("rejects a mismatched schema version", () => { + expect( + isCompatibleSnapshot(snapshot({ schemaVersion: SNAPSHOT_SCHEMA_VERSION + 1 }), "1.2.3"), + ).toBe(false); + }); +}); + +describe("redactReportingConfig", () => { + test("strips the apiKey but keeps everything else", () => { + const redacted = redactReportingConfig({ + project: "p", + apiKey: "sk-secret", + apiUrl: "https://api", + traceToBraintrust: true, + }); + expect(redacted).toEqual({ + project: "p", + apiUrl: "https://api", + traceToBraintrust: true, + }); + expect("apiKey" in (redacted ?? {})).toBe(false); + }); + + test("returns undefined for an undefined config", () => { + expect(redactReportingConfig(undefined)).toBeUndefined(); + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.ts new file mode 100644 index 0000000..0564ee4 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.ts @@ -0,0 +1,152 @@ +// Serializable snapshot of a CodexEventProcessor's resumable state. +// +// The processor builds a session's trace incrementally, holding all its +// bookkeeping in memory: where it is in each transcript file (byte offsets), +// which spans are still open (and their identities), the reconstructed +// conversation history used to shape LLM-call inputs, and a few side-maps that +// reconcile hook-only data with the transcript. The rollout transcript on disk +// is the durable source of truth for span *content*; this snapshot captures only +// the bookkeeping needed to pick up exactly where a previous run left off. +// +// The server is short-lived: it shuts down after an idle window (default 5 min) +// or when the user closes the session, and a later hook (a "resume", or just the +// next turn) spins up a fresh processor for the same session id. Without a +// snapshot that processor would re-read the transcript from zero with brand-new +// span ids, producing duplicate/orphaned spans. With one, it rehydrates the +// original spans (via {@link SpanFactory.rehydrateSpan}) and continues, so every +// further log()/end() merges into the same trace. +// +// This module defines the snapshot *shape* and pure mapping helpers only; the +// processor owns turning its live state (which holds Span handles) into a +// snapshot and back, since that needs the span factory. Span handles are stored +// here as {@link SpanRef} identities (span_id/root_span_id/parents), captured +// from the synchronous getters on a live span. + +import type { ReportingConfig, SpanRef } from "../../braintrust/logger.ts"; + +/** + * Bumped whenever the snapshot shape changes incompatibly. A snapshot whose + * schemaVersion doesn't match is discarded on load (the session simply starts + * fresh, losing only resume continuity), so we never try to rehydrate state we + * can't interpret. + */ +export const SNAPSHOT_SCHEMA_VERSION = 4; + +/** A conversation item (chat message or reasoning), stored verbatim. These are + * plain JSON already, so they round-trip without transformation. */ +export type ConversationItemSnapshot = Record; + +/** An open LLM span and the output items accumulated for it so far. */ +export interface OpenLlmSnapshot { + span: SpanRef; + turnId: string | undefined; + output: ConversationItemSnapshot[]; + /** Time (Unix seconds) of the call's last model-output item; the span's end. */ + lastOutputTime: number | undefined; + outputPreset?: boolean; +} + +/** An open tool span and the turn it belongs to (keyed by call_id in the map). */ +export interface OpenToolSnapshot { + callId: string; + span: SpanRef; + turnId: string; +} + +/** An open turn span (keyed by turn_id in the map), plus the timing used to + * place its child LLM spans. */ +export interface OpenTurnSnapshot { + turnId: string; + span: SpanRef; + /** Turn start (Unix seconds), used as the first child's start time. */ + startTime: number | undefined; + /** Max end time (Unix seconds) among the turn's already-ended children. */ + lastChildEndTime: number | undefined; +} + +/** Pending-subagent info captured before its root span exists. The parent turn + * span is referenced by identity so it can be rehydrated independently. */ +export interface PendingSubagentSnapshot { + agentId: string; + agentType: string | undefined; + parent: SpanRef; +} + +/** One transcript file's parsing state. Mirrors TranscriptScope, with Span + * handles replaced by SpanRef identities. */ +export interface ScopeSnapshot { + path: string; + kind: "main" | "subagent"; + offset: number; + openTurns: OpenTurnSnapshot[]; + turnsAwaitingCompletion: string[]; + conversationHistory: ConversationItemSnapshot[]; + openLlm: OpenLlmSnapshot | null; + openTools: OpenToolSnapshot[]; + model: string | undefined; + lastTurnEndTime: number | undefined; + subagentRootSpan: SpanRef | null; + subagentEnded: boolean; + pendingSubagent?: PendingSubagentSnapshot; +} + +/** A side-map entry pairing a key with a string value. */ +export interface StringMapEntry { + key: string; + value: string; +} + +/** The full resumable state of a CodexEventProcessor. */ +export interface CodexSnapshot { + /** Plugin version that wrote this snapshot; discarded on mismatch. */ + pluginVersion: string; + /** Snapshot shape version; discarded on mismatch. */ + schemaVersion: number; + /** Session id (queueId) this snapshot belongs to. */ + sessionId: string | null; + /** Wall-clock ms when written, for age-based GC of orphans. */ + savedAt: number; + + /** + * Reporting config, minus secrets. `apiKey` is intentionally never persisted + * (re-resolved from the env / config event on resume), mirroring how the event + * recorder redacts it. + */ + reportingConfig: Omit | undefined; + + rootSpan: SpanRef | null; + rootEnded: boolean; + /** When the root was ended (Unix seconds), if it has been; re-asserted on + * resume so the rehydrated root row stays closed. */ + rootEndTime: number | undefined; + rootEnrichment: { source?: string; permissionMode?: string }; + + /** Path of the main scope within `scopes`, if it exists. */ + mainScopePath: string | null; + scopes: ScopeSnapshot[]; + + /** + * Compaction triggers ("manual"/"auto") keyed by turn_id. Plain strings (no + * span references), so safe to carry across a restart. The span-bearing + * side-maps (spawn-agent turn spans, compaction spans) are deliberately NOT + * persisted: they hold references to spans that may have closed, and + * rehydrating a closed span re-opens it. + */ + compactionTriggerByTurn: StringMapEntry[]; +} + +/** A parsed snapshot is usable only if it matches our plugin + schema version. */ +export function isCompatibleSnapshot(snapshot: CodexSnapshot, pluginVersion: string): boolean { + return ( + snapshot.schemaVersion === SNAPSHOT_SCHEMA_VERSION && snapshot.pluginVersion === pluginVersion + ); +} + +/** Strip the apiKey from a reporting config before persisting it. */ +export function redactReportingConfig( + config: ReportingConfig | undefined, +): Omit | undefined { + if (config === undefined) return undefined; + const { apiKey: _apiKey, ...rest } = config; + return rest; +} diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index eaa0e57..e85d729 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -1,7 +1,13 @@ -// Codex-specific test helpers: event builders that mimic Codex hook payloads, -// and assertProducesTrace, which runs events through the CodexEventProcessor and -// checks the resulting Braintrust trace. Generic trace/span helpers are imported -// from the shared test-helpers. +// Codex-specific test helpers. Spans are built from transcript records; hooks +// drive lifecycle (config, root enrichment, the Stop sentinel). A test expresses +// both sources in one ordered list. transcript(...) (and the typed builders that +// wrap it) represent lines being appended to the transcript file; hook builders +// (sessionStart, stop, configEvent) represent hook events arriving. +// +// Semantics: a transcript entry buffers a line. The NEXT hook triggers a +// catch-up that reads all buffered-but-unread lines, then handles the hook — +// modeling how the real processor catches up on each hook. A Stop hook waits for +// the turn's task_complete to be present before catching up. import type { EnqueueEvent } from "../../server/routes.ts"; import { @@ -13,32 +19,68 @@ import { } from "../../test-helpers.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; +import type { + TranscriptReader, + TranscriptReadResult, + TranscriptWaitResult, + WaitForOptions, +} from "./transcript-reader.ts"; + +/** Synthetic transcript path attached to hook events so catch-up reads run. */ +export const TEST_TRANSCRIPT_PATH = "/test/transcript.jsonl"; + +/** Monotonic default timestamp so transcript records sort in creation order. */ +let tsCounter = 0; +function nextTimestamp(): string { + tsCounter += 1; + return new Date(Date.UTC(2026, 0, 1, 0, 0, tsCounter)).toISOString(); +} // ============================================================================ -// Event builders +// Hook event builders // ============================================================================ -function codexEvent(eventName: string, eventData: Record): EnqueueEvent { +function codexHook(eventName: string, eventData: Record): EnqueueEvent { const queueId = typeof eventData.session_id === "string" ? eventData.session_id : "session-1"; return { queueId, eventSource: "codex-hook", eventSourceVersion: null, eventName, - eventData: { hook_event_name: eventName, ...eventData }, + eventData: { hook_event_name: eventName, transcript_path: TEST_TRANSCRIPT_PATH, ...eventData }, }; } export function sessionStart(data: Record = {}): EnqueueEvent { - return codexEvent("SessionStart", { session_id: "session-1", ...data }); + return codexHook("SessionStart", { session_id: "session-1", ...data }); } -export function userPromptSubmit(data: Record = {}): EnqueueEvent { - return codexEvent("UserPromptSubmit", { session_id: "session-1", ...data }); +export function stop(data: Record = {}): EnqueueEvent { + return codexHook("Stop", { session_id: "session-1", ...data }); } -export function stop(data: Record = {}): EnqueueEvent { - return codexEvent("Stop", { session_id: "session-1", ...data }); +export function subagentStart(data: Record = {}): EnqueueEvent { + return codexHook("SubagentStart", { session_id: "session-1", ...data }); +} + +export function subagentStop(data: Record = {}): EnqueueEvent { + return codexHook("SubagentStop", { session_id: "session-1", ...data }); +} + +export function preToolUse(data: Record = {}): EnqueueEvent { + return codexHook("PreToolUse", { session_id: "session-1", ...data }); +} + +export function preCompact(data: Record = {}): EnqueueEvent { + return codexHook("PreCompact", { session_id: "session-1", ...data }); +} + +export function postCompact(data: Record = {}): EnqueueEvent { + return codexHook("PostCompact", { session_id: "session-1", ...data }); +} + +export function postToolUse(data: Record = {}): EnqueueEvent { + return codexHook("PostToolUse", { session_id: "session-1", ...data }); } /** A config event enabling tracing; pass extra reporting config via `data`. */ @@ -53,25 +95,307 @@ export function configEvent(data: Record = {}): EnqueueEvent { } // ============================================================================ -// assertProducesTrace: run events through a CodexEventProcessor and assert the -// resulting trace. One session => one trace. +// Transcript entry builders +// ============================================================================ + +/** Marks a transcript line being written to the (fake) transcript file. */ +export interface TranscriptEntry { + kind: "transcript"; + record: Record; +} + +/** + * Build a raw transcript entry from a full record envelope + * `{ timestamp, type, payload }`. `timestamp` defaults to a monotonic value. + */ +export function transcript(record: Record = {}): TranscriptEntry { + return { + kind: "transcript", + record: { timestamp: nextTimestamp(), ...record }, + }; +} + +/** session_meta record: opens the root span (cwd, session id, cli version). */ +export function sessionMeta(payload: Record = {}): TranscriptEntry { + return transcript({ type: "session_meta", payload: { id: "session-1", ...payload } }); +} + +/** turn_context record: carries the model for a turn. */ +export function turnContext(payload: Record = {}): TranscriptEntry { + return transcript({ type: "turn_context", payload }); +} + +/** task_started event: opens a turn span (by turn_id). */ +export function taskStarted(payload: Record = {}): TranscriptEntry { + return transcript({ type: "event_msg", payload: { type: "task_started", ...payload } }); +} + +/** user_message event: the prompt; sets the open turn's input. */ +export function userMessage(payload: Record = {}): TranscriptEntry { + return transcript({ type: "event_msg", payload: { type: "user_message", ...payload } }); +} + +/** task_complete event: closes a turn span (output = last_agent_message). */ +export function taskComplete(payload: Record = {}): TranscriptEntry { + return transcript({ type: "event_msg", payload: { type: "task_complete", ...payload } }); +} + +/** function_call response_item: opens a tool span (by call_id, under turn). */ +export function functionCall(payload: Record = {}): TranscriptEntry { + const { turn_id, metadata, ...rest } = payload as { + turn_id?: string; + metadata?: Record; + } & Record; + return transcript({ + type: "response_item", + payload: { + type: "function_call", + ...(turn_id !== undefined ? { metadata: { turn_id, ...metadata } } : { metadata }), + ...rest, + }, + }); +} + +/** function_call_output response_item: closes a tool span (by call_id). */ +export function functionCallOutput(payload: Record = {}): TranscriptEntry { + return transcript({ + type: "response_item", + payload: { type: "function_call_output", ...payload }, + }); +} + +/** custom_tool_call response_item (e.g. apply_patch): opens a tool span. */ +export function customToolCall(payload: Record = {}): TranscriptEntry { + const { turn_id, metadata, ...rest } = payload as { + turn_id?: string; + metadata?: Record; + } & Record; + return transcript({ + type: "response_item", + payload: { + type: "custom_tool_call", + ...(turn_id !== undefined ? { metadata: { turn_id, ...metadata } } : { metadata }), + ...rest, + }, + }); +} + +/** custom_tool_call_output response_item: closes a tool span (by call_id). */ +export function customToolCallOutput(payload: Record = {}): TranscriptEntry { + return transcript({ + type: "response_item", + payload: { type: "custom_tool_call_output", ...payload }, + }); +} + +/** A message response_item with the given role and text. */ +export function message(role: string, text: string): TranscriptEntry { + const itemType = role === "assistant" ? "output_text" : "input_text"; + return transcript({ + type: "response_item", + payload: { type: "message", role, content: [{ type: itemType, text }] }, + }); +} + +/** assistant message response_item (model output; opens/feeds an llm span). */ +export function assistantMessage(text: string): TranscriptEntry { + return message("assistant", text); +} + +/** user message response_item (input context). */ +export function userMessageItem(text: string): TranscriptEntry { + return message("user", text); +} + +/** + * reasoning response_item. Pass summary strings to surface readable reasoning + * ("thinking"); they're wrapped in Codex's real `{type:"summary_text", text}` + * shape. With no strings, only `encrypted_content` is present (the common case), + * so nothing readable is surfaced. + */ +export function reasoning(summary: string[] = []): TranscriptEntry { + return transcript({ + type: "response_item", + payload: { + type: "reasoning", + summary: summary.map((text) => ({ type: "summary_text", text })), + encrypted_content: "opaque", + }, + }); +} + +/** + * token_count event: closes the current llm span. `usage` maps the Codex token + * keys (input_tokens, output_tokens, total_tokens, ...) onto last_token_usage. + */ +export function tokenCount(usage: Record = {}): TranscriptEntry { + return transcript({ + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: usage } }, + }); +} + +/** + * compacted record: marks the current turn as a context compaction. + * `replacement_history` is the summarized history that replaces prior context. + */ +export function compacted(payload: Record = {}): TranscriptEntry { + return transcript({ type: "compacted", payload }); +} + +/** One entry in a mixed trace list: a hook event or a transcript write. */ +export type TraceEntry = EnqueueEvent | TranscriptEntry; + +function isTranscriptEntry(entry: TraceEntry): entry is TranscriptEntry { + return (entry as TranscriptEntry).kind === "transcript"; +} + +// ============================================================================ +// FakeTranscriptReader: an in-memory transcript fed by transcript(...) entries. +// ============================================================================ + +/** + * Models the transcript file as a growing byte buffer. `append` writes a + * complete line; `readFrom` returns complete lines from the given offset. + * `waitFor` resolves immediately from the current buffer (the harness has + * already written everything available before the hook runs), so tests stay + * synchronous and don't actually sleep. + */ +export class FakeTranscriptReader implements TranscriptReader { + private buffer = ""; + /** Offsets passed to readFrom/waitFor, in call order (for assertions). */ + readonly readOffsets: number[] = []; + + append(line: string): void { + this.buffer += `${line}\n`; + } + + readFrom(_path: string, offset: number): TranscriptReadResult { + this.readOffsets.push(offset); + const from = offset > this.buffer.length ? 0 : offset; + const slice = this.buffer.slice(from); + const lines: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + lines.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + const sentinelFound = lines.some(predicate); + return { lines, offset: next, sentinelFound }; + } +} + +/** + * A transcript reader backed by multiple in-memory files, keyed by path. Used by + * subagent tests, where the main session and each subagent write to separate + * transcript files but are read by one processor. `append(path, line)` writes to + * a specific file; reads/waits operate per path with independent offsets. + */ +export class MultiFileTranscriptReader implements TranscriptReader { + private readonly buffers = new Map(); + /** Per-path withheld trailing line that only appears after N more reads. */ + private readonly withheldByPath = new Map(); + + append(path: string, line: string): void { + this.buffers.set(path, `${this.buffers.get(path) ?? ""}${line}\n`); + } + + /** + * Append a line to `path` that only becomes visible after `afterReads` more + * reads of that path. Models a record Codex writes slightly later, so it isn't + * seen by the catch-up that's running now but is by a subsequent one. + */ + withhold(path: string, line: string, afterReads: number): void { + this.withheldByPath.set(path, { line, afterReads }); + } + + readFrom(path: string, offset: number): TranscriptReadResult { + const pending = this.withheldByPath.get(path); + if (pending !== undefined) { + if (pending.afterReads <= 0) { + this.append(path, pending.line); + this.withheldByPath.delete(path); + } else { + pending.afterReads -= 1; + } + } + const buffer = this.buffers.get(path) ?? ""; + const from = offset > buffer.length ? 0 : offset; + const slice = buffer.slice(from); + const lines: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + lines.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + const sentinelFound = lines.some(predicate); + return { lines, offset: next, sentinelFound }; + } +} + +// ============================================================================ +// assertProducesTrace: run a mixed list of hook events and transcript writes +// through a CodexEventProcessor and assert the resulting trace. // ============================================================================ export async function assertProducesTrace( - events: EnqueueEvent[], + entries: TraceEntry[], expected: ExpectedSpan, - opts: { queueId?: string } = {}, + opts: { queueId?: string; reader?: FakeTranscriptReader } = {}, ): Promise { - const queueId = opts.queueId ?? events[0]?.queueId ?? "session-1"; + const hookEvents = entries.filter((e): e is EnqueueEvent => !isTranscriptEntry(e)); + const queueId = opts.queueId ?? hookEvents[0]?.queueId ?? "session-1"; + // Tracing is off by default; prepend a tracing-enabled config event unless the // caller already provided one, so trace assertions exercise the span path. - const hasConfig = events.some((e) => e.eventName === CODEX_CONFIG_EVENT); - const toProcess = hasConfig ? events : [configEvent({ session_id: queueId }), ...events]; + const hasConfig = hookEvents.some((e) => e.eventName === CODEX_CONFIG_EVENT); + const toRun: TraceEntry[] = hasConfig + ? entries + : [configEvent({ session_id: queueId }), ...entries]; + + const reader = opts.reader ?? new FakeTranscriptReader(); const trace = withCapturedTrace(); try { - const processor = new CodexEventProcessor(queueId, createTestLogger(), () => trace.spanFactory); - for (const event of toProcess) { - await processor.process(event); + const processor = new CodexEventProcessor( + queueId, + createTestLogger(), + () => trace.spanFactory, + reader, + ); + for (const entry of toRun) { + if (isTranscriptEntry(entry)) { + reader.append(JSON.stringify(entry.record)); + } else { + await processor.process(entry); + } } await processor.flush(); diff --git a/plugins/trace-codex/src/agents/codex/transcript-reader.ts b/plugins/trace-codex/src/agents/codex/transcript-reader.ts new file mode 100644 index 0000000..ef2ad70 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/transcript-reader.ts @@ -0,0 +1,155 @@ +// Reads new content from a Codex transcript file starting at a byte offset. +// +// The processor "catches up" on the transcript whenever a hook arrives: it asks +// the reader for everything appended since the last read, parses the new lines, +// and advances its offset. This interface is the seam that keeps the processor +// free of direct filesystem coupling — production reads the real file, while +// unit tests inject a fake fed by `transcript(...)` test entries. +// +// On a terminal hook (Stop) the processor must not race the transcript writer: +// the turn's closing record may not be flushed to disk yet. `waitFor` polls the +// file, bounded by a timeout, until a sentinel line appears (or it gives up). +// This is the one deliberate blocking point, and it always returns rather than +// hanging the turn. + +import { readFileSync, statSync } from "node:fs"; +import type { Logger } from "../../log.ts"; + +/** Result of a read: the complete new lines and the advanced byte offset. */ +export interface TranscriptReadResult { + /** Complete lines (newline-terminated in the file) read since `offset`. */ + lines: string[]; + /** + * The new offset to read from next time. A partial trailing line (no + * newline yet) is NOT consumed and not advanced past, so it is re-read once + * it is fully written. + */ + offset: number; +} + +/** Options bounding a `waitFor` poll. */ +export interface WaitForOptions { + /** Give up after this many milliseconds total. */ + timeoutMs: number; + /** Delay between polls. */ + intervalMs: number; +} + +/** Result of waiting: the lines read so far and whether the sentinel was seen. */ +export interface TranscriptWaitResult extends TranscriptReadResult { + /** True if `predicate` matched one of the read lines before the timeout. */ + sentinelFound: boolean; +} + +export interface TranscriptReader { + /** + * Read complete lines from `path` starting at byte `offset`. Returns the new + * lines plus the offset just past the last newline consumed. Must never + * throw: on a missing/unreadable file it returns no lines and the same + * offset, so the next hook simply tries again. + */ + readFrom(path: string, offset: number): TranscriptReadResult; + + /** + * Read from `offset`, then keep polling until `predicate` matches one of the + * cumulative lines read or the timeout elapses. Returns all lines read across + * polls, the advanced offset, and whether the sentinel was found. Never + * throws and always returns within the timeout, so the caller can fail soft + * (emit whatever exists) rather than stall the turn. + */ + waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + opts: WaitForOptions, + ): Promise; +} + +/** + * Split a buffer into complete lines (those ending in "\n") and report how many + * bytes were consumed. A trailing fragment without a newline is left for the + * next read. + */ +function splitCompleteLines(buf: Buffer): { lines: string[]; consumed: number } { + const lines: string[] = []; + let lineStart = 0; + let consumed = 0; + for (let i = 0; i < buf.length; i++) { + if (buf[i] === 0x0a /* \n */) { + lines.push(buf.toString("utf8", lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines, consumed }; +} + +const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Production reader: reads the file, slices from `offset`, and returns complete + * lines plus the advanced offset. Reads the whole file and slices for + * simplicity; transcripts are modest and this runs off the hook hot path. + */ +export function createFileTranscriptReader(diagLogger?: Logger): TranscriptReader { + function readFrom(path: string, offset: number): TranscriptReadResult { + try { + const size = statSync(path).size; + // File shrank or truncated (e.g. replaced): reset to start. + const from = offset > size ? 0 : offset; + if (from >= size) { + return { lines: [], offset: from }; + } + const buf = readFileSync(path); + const slice = buf.subarray(from); + const { lines, consumed } = splitCompleteLines(slice); + return { lines, offset: from + consumed }; + } catch (err) { + diagLogger?.debug("transcript reader: read failed; skipping", { + path, + offset, + error: String(err), + }); + return { lines: [], offset }; + } + } + + async function waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + opts: WaitForOptions, + ): Promise { + const deadline = Date.now() + opts.timeoutMs; + const allLines: string[] = []; + let cursor = offset; + let sentinelFound = false; + + // Poll until the sentinel appears or we run out of time. Always does at + // least one read so a sentinel already on disk is found immediately. + for (;;) { + const { lines, offset: next } = readFrom(path, cursor); + cursor = next; + for (const line of lines) { + allLines.push(line); + if (predicate(line)) sentinelFound = true; + } + if (sentinelFound || Date.now() >= deadline) break; + await sleep(opts.intervalMs); + } + + if (!sentinelFound) { + diagLogger?.debug("transcript reader: sentinel not found before timeout", { + path, + offset, + timeoutMs: opts.timeoutMs, + }); + } + return { lines: allLines, offset: cursor, sentinelFound }; + } + + return { readFrom, waitFor }; +} + +/** Default production reader (no diagnostics). */ +export const defaultTranscriptReader: TranscriptReader = createFileTranscriptReader(); diff --git a/plugins/trace-codex/src/agents/codex/transcript.test.ts b/plugins/trace-codex/src/agents/codex/transcript.test.ts new file mode 100644 index 0000000..c49404b --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/transcript.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, test } from "bun:test"; +import { parseTranscriptLine } from "./transcript.ts"; + +describe("parseTranscriptLine", () => { + test("parses a well-formed record envelope", () => { + const line = JSON.stringify({ + timestamp: "2026-06-22T04:50:25.725Z", + type: "response_item", + payload: { type: "message", role: "assistant", content: "hi" }, + }); + const record = parseTranscriptLine(line); + expect(record).not.toBeNull(); + expect(record?.type).toBe("response_item"); + expect(record?.payload?.type).toBe("message"); + expect(record?.timestamp).toBe("2026-06-22T04:50:25.725Z"); + }); + + test("preserves unknown record types (forward compatible)", () => { + const line = JSON.stringify({ type: "some_future_type", payload: { type: "whatever" } }); + const record = parseTranscriptLine(line); + expect(record?.type).toBe("some_future_type"); + expect(record?.payload?.type).toBe("whatever"); + }); + + test("preserves extra top-level fields", () => { + const line = JSON.stringify({ type: "compacted", payload: {}, window_id: 42 }); + const record = parseTranscriptLine(line); + expect((record as Record)?.window_id).toBe(42); + }); + + test("returns null for a blank line", () => { + expect(parseTranscriptLine("")).toBeNull(); + expect(parseTranscriptLine(" ")).toBeNull(); + }); + + test("returns null (no throw) for malformed JSON", () => { + expect(parseTranscriptLine('{"type":"response_item"')).toBeNull(); + expect(parseTranscriptLine("not json at all")).toBeNull(); + }); + + test("returns null for JSON that isn't an object", () => { + expect(parseTranscriptLine("[1,2,3]")).toBeNull(); + expect(parseTranscriptLine("42")).toBeNull(); + expect(parseTranscriptLine('"a string"')).toBeNull(); + expect(parseTranscriptLine("null")).toBeNull(); + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/transcript.ts b/plugins/trace-codex/src/agents/codex/transcript.ts new file mode 100644 index 0000000..e1d1a76 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/transcript.ts @@ -0,0 +1,81 @@ +// Pure parser/types for Codex transcript ("rollout") records. +// +// Codex writes an append-only JSONL transcript per session at the path every +// hook payload reports as `transcript_path`. Each line is one record with a +// stable envelope: `{ timestamp, type, payload }`. The transcript is the only +// source for LLM calls, token usage, and the true execution order of LLM and +// tool calls within a turn. +// +// This module only parses lines into typed records; it builds no spans and has +// no Braintrust dependency. The format is explicitly NOT a stable interface +// (Codex may change it across versions), so parsing is permissive and +// fail-soft: unknown record types are preserved as-is, and a malformed line +// yields null rather than throwing. + +/** + * The common envelope every transcript line shares. `payload` is intentionally + * loose: we narrow it per `type`/`payload.type` only where we need to, and we + * tolerate shapes we don't recognize. + */ +export interface TranscriptRecord { + /** ISO-8601 timestamp Codex wrote the record (its clock). */ + timestamp?: string; + /** Top-level record kind, e.g. "response_item", "event_msg", "turn_context". */ + type?: string; + /** Record-specific body. Narrow via `payload.type` for response_item/event_msg. */ + payload?: TranscriptPayload; + /** Anything else Codex includes; preserved for forward compatibility. */ + [key: string]: unknown; +} + +/** Loose payload shape. `type` discriminates response_item / event_msg subtypes. */ +export interface TranscriptPayload { + type?: string; + [key: string]: unknown; +} + +// Known top-level record types. Kept as constants (not a closed union) so +// unknown types still parse and flow through as plain records. +export const RECORD_SESSION_META = "session_meta"; +export const RECORD_TURN_CONTEXT = "turn_context"; +export const RECORD_RESPONSE_ITEM = "response_item"; +export const RECORD_EVENT_MSG = "event_msg"; +export const RECORD_COMPACTED = "compacted"; + +// Known response_item payload subtypes. +export const ITEM_MESSAGE = "message"; +export const ITEM_REASONING = "reasoning"; +export const ITEM_FUNCTION_CALL = "function_call"; +export const ITEM_FUNCTION_CALL_OUTPUT = "function_call_output"; +export const ITEM_CUSTOM_TOOL_CALL = "custom_tool_call"; +export const ITEM_CUSTOM_TOOL_CALL_OUTPUT = "custom_tool_call_output"; +export const ITEM_TOOL_SEARCH_CALL = "tool_search_call"; +export const ITEM_TOOL_SEARCH_OUTPUT = "tool_search_output"; + +// Known event_msg payload subtypes. +export const EVT_TOKEN_COUNT = "token_count"; +export const EVT_AGENT_MESSAGE = "agent_message"; +export const EVT_USER_MESSAGE = "user_message"; +export const EVT_TASK_STARTED = "task_started"; +export const EVT_TASK_COMPLETE = "task_complete"; +export const EVT_CONTEXT_COMPACTED = "context_compacted"; + +/** + * Parse a single transcript line into a TranscriptRecord. Returns null for + * blank lines and for anything that isn't a JSON object, so callers can skip it + * without breaking the session. Never throws. + */ +export function parseTranscriptLine(line: string): TranscriptRecord | null { + const trimmed = line.trim(); + if (trimmed.length === 0) return null; + try { + const parsed = JSON.parse(trimmed) as unknown; + if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) { + return null; + } + return parsed as TranscriptRecord; + } catch { + // Malformed JSON (e.g. a partially-written line): skip it. + return null; + } +} diff --git a/plugins/trace-codex/src/braintrust/logger.ts b/plugins/trace-codex/src/braintrust/logger.ts index a31710f..4b95023 100644 --- a/plugins/trace-codex/src/braintrust/logger.ts +++ b/plugins/trace-codex/src/braintrust/logger.ts @@ -13,9 +13,64 @@ import type { Logger } from "../log.ts"; export type { Span, StartSpanArgs }; +/** + * The minimal, serializable identity of a span: enough to recreate a handle + * bound to the same row after a restart, so further log()/end() calls merge into + * the original span server-side (rows are keyed by span_id). Captured from a live + * span's `spanId`/`rootSpanId`/`spanParents` getters (all synchronous). + */ +export interface SpanRef { + spanId: string; + rootSpanId: string; + /** Parent span ids (empty for a root span). */ + spanParents: string[]; + /** + * The span's name/type at capture time. Re-asserted on rehydration so the + * merged row keeps the original attributes — without them the SDK would infer + * a (wrong) name from the call stack when the rehydrated handle is created. + */ + name?: string; + type?: StartSpanArgs["type"]; + /** + * The span's original start time (Unix seconds). Re-asserted on rehydration so + * the merged row keeps its original start — without it the SDK stamps the start + * to the moment of rehydration (the resume time), corrupting the span's timing. + */ + startTime?: number; +} + +/** + * Capture a live span's identity for later rehydration. `name`/`type`/`startTime` + * are the attributes the span was created with (the SDK doesn't expose them as + * getters), passed by the caller so they can be re-asserted on resume. + */ +export function spanRef( + span: Span, + name?: string, + type?: StartSpanArgs["type"], + startTime?: number, +): SpanRef { + return { + spanId: span.spanId, + rootSpanId: span.rootSpanId, + spanParents: span.spanParents, + ...(name !== undefined ? { name } : {}), + ...(type !== undefined ? { type } : {}), + ...(startTime !== undefined ? { startTime } : {}), + }; +} + /** Creates root spans and can flush buffered events. */ export interface SpanFactory { startSpan(args: StartSpanArgs): Span; + /** + * Recreate a handle bound to an existing span's identity (see {@link SpanRef}). + * The returned span carries the original span_id/root_span_id/parents, so + * subsequent log()/end() calls merge into that row rather than creating a new + * one. Used to resume a session's trace after the server restarted. The + * returned handle is "naked": it logs nothing until the caller does. + */ + rehydrateSpan(ref: SpanRef): Span; flush(): Promise; } @@ -79,6 +134,14 @@ export function createSpanFactory(config?: ReportingConfig, diagLogger?: Logger) }); return { startSpan: (args) => logger.startSpan(args), + rehydrateSpan: (ref) => + logger.startSpan({ + spanId: ref.spanId, + parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId }, + ...(ref.name !== undefined ? { name: ref.name } : {}), + ...(ref.type !== undefined ? { type: ref.type } : {}), + ...(ref.startTime !== undefined ? { startTime: ref.startTime } : {}), + }), flush: async () => { try { await logger.flush(); diff --git a/plugins/trace-codex/src/client/client.ts b/plugins/trace-codex/src/client/client.ts index f4b1735..264306b 100644 --- a/plugins/trace-codex/src/client/client.ts +++ b/plugins/trace-codex/src/client/client.ts @@ -34,11 +34,19 @@ async function readStdin(): Promise { export interface HookClientOptions { /** * Event names that terminate a turn/session. After enqueuing one of these, - * the client asks the server to flush synchronously (POST /flush) so the - * final spans are delivered before this process — and any background server — - * is torn down. Defaults to none (fire-and-forget). + * the client asks the server to flush (POST /flush) so buffered spans are + * delivered. Defaults to none (fire-and-forget). */ terminalEvents?: readonly string[]; + /** + * Whether to block on a terminal event until the server confirms all events + * are processed and spans are delivered. Defaults to false. When false the + * client does nothing on a terminal event — the long-lived server flushes on + * its own when the queue goes idle. Set true (via BRAINTRUST_FLUSH_ON_TURN_END) + * to guarantee delivery before Codex exits, e.g. in programmatic/CI runs where + * the process tree is torn down before the idle flush can fire. + */ + flushOnTurnEnd?: boolean; } export async function runHookClient( @@ -79,11 +87,14 @@ export async function runHookClient( } } - // On a terminal event, block until the server confirms the queue has drained - // and buffered spans are flushed. Without this, a CI job (or any short-lived - // host) can end right after the agent's last turn — before the background - // server's idle timeout fires — and lose the final spans. - if (sawTerminal) { + // On a terminal event, only flush when flushOnTurnEnd is set: block until the + // server confirms the queue has drained and buffered spans are delivered — + // important for a CI job (or any short-lived host) that ends right after the + // agent's last turn, before the background server's idle timeout fires, which + // would otherwise lose the final spans. Otherwise do nothing: the long-lived + // server flushes on its own when the queue goes idle, so the turn isn't + // stalled by a flush request. + if (sawTerminal && (options.flushOnTurnEnd ?? false)) { const flushed = await postFlush(config, logger); logger.debug("flush requested on terminal event", { flushed }); } diff --git a/plugins/trace-codex/src/index.ts b/plugins/trace-codex/src/index.ts index d5be092..81853fe 100644 --- a/plugins/trace-codex/src/index.ts +++ b/plugins/trace-codex/src/index.ts @@ -22,6 +22,13 @@ function agentFactories(): Map { return new Map(AGENTS.map((a) => [a.eventSource, a.createProcessor])); } +/** Parse a boolean env var: true only for "true"/"1" (case-insensitive). */ +function parseBoolEnv(value: string | undefined): boolean { + if (!value) return false; + const v = value.trim().toLowerCase(); + return v === "true" || v === "1"; +} + async function main(): Promise { const mode = process.argv[2] ?? "hook"; @@ -64,6 +71,8 @@ async function main(): Promise { try { await runHookClient(config, logger, agent.buildEvents, { terminalEvents: agent.terminalEvents, + // Settings (above) have mapped BRAINTRUST_FLUSH_ON_TURN_END into env. + flushOnTurnEnd: parseBoolEnv(process.env.BRAINTRUST_FLUSH_ON_TURN_END), }); } catch (err) { logger.error("unexpected hook client error", { error: String(err) }); diff --git a/plugins/trace-codex/src/server/enqueue-client.ts b/plugins/trace-codex/src/server/enqueue-client.ts index a947c0e..e00f680 100644 --- a/plugins/trace-codex/src/server/enqueue-client.ts +++ b/plugins/trace-codex/src/server/enqueue-client.ts @@ -31,13 +31,14 @@ export async function postEnqueue( } /** - * Ask the server to process everything enqueued so far and flush buffered spans - * to the backend, then wait for it to confirm. Returns true on a 2xx response. - * Never throws. + * Ask the server to flush, blocking until it confirms. Returns true on a 2xx + * response. Never throws. * - * The hook calls this after a terminal event so the final spans are delivered - * before the process tree is torn down (e.g. when a CI job ends right after the - * agent's last turn, before the background server's idle timeout fires). + * The server waits until everything enqueued so far has been processed and + * buffered spans have reached the backend before responding. The hook uses this + * after a terminal event (only when block-on-stop is configured) so the final + * spans are delivered before the process tree is torn down (e.g. a CI job ending + * right after the agent's last turn, before the idle timeout fires). */ export async function postFlush( config: Pick, @@ -46,8 +47,8 @@ export async function postFlush( try { const res = await fetch(`${serverBaseUrl(config)}/flush`, { method: "POST", - // The server bounds its own wait (FLUSH_TIMEOUT_MS); allow a little more - // here so we receive its response rather than aborting the request. + // The server bounds its wait by FLUSH_TIMEOUT_MS; allow a little more here + // so we receive its response rather than aborting first. signal: AbortSignal.timeout(12_000), }); if (!res.ok) { diff --git a/plugins/trace-codex/src/server/routes.ts b/plugins/trace-codex/src/server/routes.ts index 3ae78bf..74ce636 100644 --- a/plugins/trace-codex/src/server/routes.ts +++ b/plugins/trace-codex/src/server/routes.ts @@ -83,11 +83,13 @@ export async function handleRequest(req: Request, deps: RouteDeps): Promise true), diff --git a/plugins/trace-codex/src/server/server.test.ts b/plugins/trace-codex/src/server/server.test.ts index e26689c..f85f690 100644 --- a/plugins/trace-codex/src/server/server.test.ts +++ b/plugins/trace-codex/src/server/server.test.ts @@ -127,4 +127,53 @@ describe("startServer", () => { await server.stop(); } }); + + test("/flush waits for the queue to drain and processors to flush", async () => { + // /flush must return only after every enqueued event has been processed AND + // processors have been flushed (the idle flush runs before drained() + // resolves). Records ordering to prove flush happened before the response. + const events: string[] = []; + const factory: EventProcessorFactory = () => ({ + process: async () => { + await sleep(20); + events.push("process"); + }, + flush: () => { + events.push("flush"); + }, + }); + + const server = startServer( + testConfig({ idleTimeoutMs: 60_000, idleCheckIntervalMs: 60_000 }), + new Map([["test", factory]]), + createTestLogger(), + ); + + try { + const enqueue = await fetch(`http://127.0.0.1:${server.port}/enqueue`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + queueId: "s1", + eventSource: "test", + eventSourceVersion: null, + eventName: "Stop", + eventData: {}, + }), + }); + expect(enqueue.status).toBe(200); + + const flushRes = await fetch(`http://127.0.0.1:${server.port}/flush`, { + method: "POST", + }); + expect(flushRes.status).toBe(200); + expect(await flushRes.json()).toEqual({ ok: true }); + + // By the time /flush returns, the event was processed and the idle flush + // ran. + expect(events).toEqual(["process", "flush"]); + } finally { + await server.stop(); + } + }); }); diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts index 639d4a4..2f99eeb 100644 --- a/plugins/trace-codex/src/test-helpers.ts +++ b/plugins/trace-codex/src/test-helpers.ts @@ -1,7 +1,7 @@ // Shared test helpers. import { _exportsForTestingOnly, initLogger } from "braintrust"; -import type { Span, SpanFactory, StartSpanArgs } from "./braintrust/logger.ts"; +import type { Span, SpanFactory, SpanRef, StartSpanArgs } from "./braintrust/logger.ts"; import type { Logger } from "./log.ts"; /** A no-op logger for tests; never touches the filesystem. */ @@ -53,6 +53,25 @@ export function createFakeSpanFactory(): FakeSpanFactory { }, } as unknown as Span; }, + rehydrateSpan(ref: SpanRef): Span { + const fake: FakeSpan = { + id: ref.spanId, + startArgs: {}, + flushCount: 0, + endCount: 0, + }; + spans.push(fake); + return { + id: fake.id, + flush: async () => { + fake.flushCount += 1; + }, + end: () => { + fake.endCount += 1; + return 0; + }, + } as unknown as Span; + }, flush: async () => { factory.factoryFlushCount += 1; }, @@ -85,7 +104,8 @@ export interface CapturedSpan { input?: unknown; output?: unknown; metadata?: Record; - metrics?: { start?: number; end?: number }; + tags?: string[]; + metrics?: { start?: number; end?: number } & Record; } export interface CapturedTrace { @@ -109,6 +129,14 @@ export function withCapturedTrace(): CapturedTrace { return { spanFactory: { startSpan: (args) => logger.startSpan(args), + rehydrateSpan: (ref) => + logger.startSpan({ + spanId: ref.spanId, + parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId }, + ...(ref.name !== undefined ? { name: ref.name } : {}), + ...(ref.type !== undefined ? { type: ref.type } : {}), + ...(ref.startTime !== undefined ? { startTime: ref.startTime } : {}), + }), flush: () => logger.flush(), }, drain: async () => { @@ -131,12 +159,51 @@ export interface SpanTree { input?: unknown; output?: unknown; metadata?: Record; - metrics?: { start?: number; end?: number }; + tags?: string[]; + metrics?: { start?: number; end?: number } & Record; children: SpanTree[]; } +/** + * Merge captured span rows that share a span_id into one logical span, the way + * Braintrust merges rows server-side. Multiple rows for one span_id occur when a + * span is logged across several calls (e.g. start, then output, then end) and + * especially when a span is rehydrated after a server restart: the resumed + * handle re-emits rows under the original id. Later non-empty fields win; + * metadata and span_attributes are shallow-merged so partial updates accumulate. + */ +export function mergeCapturedSpans(spans: CapturedSpan[]): CapturedSpan[] { + const byId = new Map(); + const order: string[] = []; + for (const span of spans) { + const existing = byId.get(span.span_id); + if (existing === undefined) { + byId.set(span.span_id, { ...span }); + order.push(span.span_id); + continue; + } + // Merge: later rows override, but don't clobber an existing value with + // undefined (a partial row that omits a field shouldn't erase it). + const merged: CapturedSpan = { ...existing }; + for (const [key, value] of Object.entries(span) as [keyof CapturedSpan, unknown][]) { + if (value === undefined) continue; + if (key === "metadata" || key === "span_attributes" || key === "metrics") { + merged[key] = { + ...(existing[key] as Record | undefined), + ...(value as Record), + } as never; + } else { + merged[key] = value as never; + } + } + byId.set(span.span_id, merged); + } + return order.map((id) => byId.get(id) as CapturedSpan); +} + /** Build a single-rooted tree from flat captured spans (via span_parents). */ -export function spansToTree(spans: CapturedSpan[]): SpanTree | null { +export function spansToTree(rawSpans: CapturedSpan[]): SpanTree | null { + const spans = mergeCapturedSpans(rawSpans); if (spans.length === 0) return null; const root = spans.find( @@ -171,6 +238,7 @@ export function spansToTree(spans: CapturedSpan[]): SpanTree | null { input: span.input, output: span.output, metadata: span.metadata, + tags: span.tags, metrics: span.metrics, children, }; @@ -188,8 +256,12 @@ export interface ExpectedSpan { input?: unknown; output?: unknown; metadata?: Record; + /** If set, assert each listed tag is present on the span. */ + tags?: string[]; /** If set, assert whether the span has an end time (true) or not (false). */ ended?: boolean; + /** If set, assert exact start/end metric values (Unix seconds). */ + metrics?: { start?: number; end?: number } & Record; /** Exact list of children (length and order are checked). */ children?: ExpectedSpan[]; } @@ -198,6 +270,23 @@ function nameMatches(actual: string | undefined, expected: string | RegExp): boo return expected instanceof RegExp ? expected.test(actual ?? "") : actual === expected; } +/** JSON.stringify with object keys sorted recursively, for order-insensitive + * deep equality of plain data (objects/arrays/primitives). */ +function canonicalJson(value: unknown): string { + const sort = (v: unknown): unknown => { + if (Array.isArray(v)) return v.map(sort); + if (v !== null && typeof v === "object") { + const out: Record = {}; + for (const key of Object.keys(v as Record).sort()) { + out[key] = sort((v as Record)[key]); + } + return out; + } + return v; + }; + return JSON.stringify(sort(value)); +} + export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: string): string[] { const diffs: string[] = []; if (actual === null) { @@ -233,17 +322,37 @@ export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: } if (expected.metadata !== undefined) { for (const [key, value] of Object.entries(expected.metadata)) { - const a = JSON.stringify(actual.metadata?.[key]); - const e = JSON.stringify(value); + // Compare with sorted keys so nested-object key order doesn't matter (the + // SDK may reorder keys, e.g. when metadata is merged across log() calls). + const a = canonicalJson(actual.metadata?.[key]); + const e = canonicalJson(value); if (a !== e) diffs.push(`${path}.metadata.${key}: expected ${e}, got ${a}`); } } + if (expected.tags !== undefined) { + for (const tag of expected.tags) { + if (!actual.tags?.includes(tag)) { + diffs.push( + `${path}.tags: expected to include "${tag}", got ${JSON.stringify(actual.tags)}`, + ); + } + } + } if (expected.ended !== undefined) { const isEnded = actual.metrics?.end !== undefined; if (isEnded !== expected.ended) { diffs.push(`${path}.ended: expected ${expected.ended}, got ${isEnded}`); } } + if (expected.metrics !== undefined) { + for (const [key, value] of Object.entries(expected.metrics)) { + if (value === undefined) continue; + const actualValue = actual.metrics?.[key]; + if (actualValue !== value) { + diffs.push(`${path}.metrics.${key}: expected ${value}, got ${String(actualValue)}`); + } + } + } if (expected.children !== undefined) { if (actual.children.length !== expected.children.length) { diffs.push(