From 6942000ec2b695a5444bade0735615580ed129ce Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Thu, 18 Jun 2026 12:52:44 -0600 Subject: [PATCH 01/17] smoke test job --- .github/workflows/ci.yaml | 2 +- .github/workflows/release.yaml | 14 +- .github/workflows/smoke.yaml | 81 +++++++++ plugins/trace-codex/Makefile | 14 +- plugins/trace-codex/README.md | 14 ++ plugins/trace-codex/bin/codex-hook.sh | 11 +- plugins/trace-codex/scripts/mock-collector.ts | 116 ++++++++++++ plugins/trace-codex/scripts/smoke-test.sh | 170 ++++++++++++++++++ 8 files changed, 418 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/smoke.yaml create mode 100644 plugins/trace-codex/scripts/mock-collector.ts create mode 100755 plugins/trace-codex/scripts/smoke-test.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 57dc956..da62abb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,7 +7,7 @@ on: jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 44aee2c..fec606b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -17,7 +17,9 @@ env: jobs: release: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + outputs: + version: ${{ steps.vars.outputs.version }} steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -95,3 +97,13 @@ jobs: "$PLUGIN_DIR"/bin/codex-hook-darwin-x64 \ "$PLUGIN_DIR"/bin/codex-hook-linux-x64 \ "$PLUGIN_DIR"/bin/codex-hook-linux-arm64 + + # After a successful publish, run the end-to-end smoke test (macOS + Linux) + # against the just-released version. + smoke: + needs: release + uses: ./.github/workflows/smoke.yaml + with: + version: ${{ needs.release.outputs.version }} + secrets: + openai_api_key: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml new file mode 100644 index 0000000..ea66015 --- /dev/null +++ b/.github/workflows/smoke.yaml @@ -0,0 +1,81 @@ +name: Smoke trace-codex + +# End-to-end smoke test for the trace-codex plugin against a published release. +# Installs Codex + the plugin from the marketplace, runs a real `codex exec` +# session with tracing pointed at a local mock Braintrust collector, and asserts +# at least one trace row was reported. +# +# Two entry points: +# - workflow_call: invoked by release.yaml after a successful release, with the +# version it just published. +# - workflow_dispatch: run manually from the Actions tab against any published +# release version (e.g. to retest an older release). + +on: + workflow_call: + inputs: + version: + description: "Published release version to test, e.g. 0.0.1 (with or without leading v)." + required: true + type: string + secrets: + openai_api_key: + description: "OpenAI API key used by codex exec." + required: true + workflow_dispatch: + inputs: + version: + description: "Published release version to test, e.g. 0.0.1 (with or without leading v)." + required: true + type: string + +permissions: + contents: read + +env: + PLUGIN_DIR: plugins/trace-codex + +jobs: + smoke: + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-24.04 + label: linux-x64 + - runner: macos-15 + label: darwin-arm64 + - runner: macos-15-intel + label: darwin-x64 + runs-on: ${{ matrix.runner }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Install Bun + uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2 + with: + bun-version: latest + + - name: Install plugin dependencies + working-directory: ${{ env.PLUGIN_DIR }} + run: bun install --frozen-lockfile + + - name: Install Codex CLI + run: npm install -g @openai/codex + + - name: Install trace-codex plugin from the marketplace + env: + # The release repo is private; the launcher downloads the binary using + # this token. `codex plugin` also uses gh credentials for the clone. + GH_TOKEN: ${{ github.token }} + run: | + codex plugin marketplace add braintrustdata/braintrust-codex-plugin + codex plugin add trace-codex@braintrust-codex-plugins + + - name: Run smoke test (${{ matrix.label }}) + working-directory: ${{ env.PLUGIN_DIR }} + env: + OPENAI_API_KEY: ${{ secrets.openai_api_key || secrets.OPENAI_API_KEY }} + GH_TOKEN: ${{ github.token }} + SMOKE_RELEASE_VERSION: ${{ inputs.version }} + run: sh scripts/smoke-test.sh diff --git a/plugins/trace-codex/Makefile b/plugins/trace-codex/Makefile index 0f20746..862ac4c 100644 --- a/plugins/trace-codex/Makefile +++ b/plugins/trace-codex/Makefile @@ -9,6 +9,10 @@ # make build Compile the host binary (and per-platform binaries). # make integration Build, boot the server via the hook binary, health-check, # assert the version, then shut it down. +# make smoke End-to-end: run a real `codex exec` session with the +# installed plugin, traced to a local mock collector, and +# assert >=1 trace row. Requires codex, bun, and +# OPENAI_API_KEY; optionally SMOKE_VERSION to pin a release. # make clean Remove build artifacts. SHELL := /bin/bash @@ -24,7 +28,7 @@ EXPECTED_VERSION := $(shell sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^" BIN := bin/codex-hook BASE_URL := http://127.0.0.1:$(TEST_PORT) -.PHONY: test lint typecheck unit build integration clean +.PHONY: test lint typecheck unit build integration smoke clean test: lint typecheck unit integration @@ -81,6 +85,14 @@ integration: build echo " -> server stopped cleanly"; \ echo "PASS: integration test" +# End-to-end smoke test. Assumes the plugin is installed in Codex (run +# `../../install.sh trace-codex` first for a local dev install). Set +# SMOKE_VERSION to pin the release whose binary the launcher downloads; leave it +# unset to use the installed plugin manifest version / local dev binary. +smoke: + @echo "==> Smoke test" + SMOKE_RELEASE_VERSION="$(SMOKE_VERSION)" sh scripts/smoke-test.sh + clean: @echo "==> Cleaning" rm -rf bin diff --git a/plugins/trace-codex/README.md b/plugins/trace-codex/README.md index 6371b6e..ddd21d9 100644 --- a/plugins/trace-codex/README.md +++ b/plugins/trace-codex/README.md @@ -45,3 +45,17 @@ Advanced plugin settings for debugging or developing the plugin | _(none)_ | `BRAINTRUST_EVENT_SERVER_LOG_DIR` | `$PLUGIN_DATA` | Directory for logs, pidfile, and `config.json`. | The config file is read by the hook client only at the moment it boots the background server (the running server keeps the config it started with). To pick up config changes, stop the server (or wait for it to idle out) so the next event re-boots it. + +## Smoke test + +`make smoke` runs a full end-to-end check: it starts a local mock Braintrust collector, runs a real `codex exec "say hi"` session with tracing pointed at the mock, and asserts at least one trace row was reported. This exercises the whole path (Codex hook -> background server -> Braintrust SDK flush) without touching a real Braintrust account. + +Requirements: `codex` and `bun` on `PATH`, `OPENAI_API_KEY` set, and the plugin installed in Codex (run `../../install.sh trace-codex` for a local dev install). + +```bash +OPENAI_API_KEY=sk-... make smoke +# Pin the release whose codex-hook binary the launcher downloads: +OPENAI_API_KEY=sk-... make smoke SMOKE_VERSION=0.0.1 +``` + +CI runs the same smoke test (macOS arm64/x64 and Linux) after every release via `.github/workflows/smoke.yaml`. That workflow can also be dispatched manually from the Actions tab against any published release version. diff --git a/plugins/trace-codex/bin/codex-hook.sh b/plugins/trace-codex/bin/codex-hook.sh index eaed11e..102c3d9 100755 --- a/plugins/trace-codex/bin/codex-hook.sh +++ b/plugins/trace-codex/bin/codex-hook.sh @@ -51,8 +51,17 @@ suffix="$os_name-$arch_name" # Read the plugin version (single source of truth) from the manifest. Only on # this slow path, so no dependency (jq) and no parsing on the hot path. +# +# TRACE_CODEX_RELEASE_VERSION overrides the manifest version. This is intended +# for testing (e.g. the smoke test pointing at an arbitrary published release); +# normal installs leave it unset and use the manifest. manifest="$ROOT/.codex-plugin/plugin.json" -version=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$manifest" 2>/dev/null | head -1) +version="${TRACE_CODEX_RELEASE_VERSION:-}" +if [ -z "$version" ]; then + version=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$manifest" 2>/dev/null | head -1) +fi +# Allow an optional leading 'v' on the override (v0.0.1 or 0.0.1). +version="${version#v}" if [ -z "$version" ]; then log "could not read plugin version from $manifest; tracing disabled this session" exit 0 diff --git a/plugins/trace-codex/scripts/mock-collector.ts b/plugins/trace-codex/scripts/mock-collector.ts new file mode 100644 index 0000000..51c29c8 --- /dev/null +++ b/plugins/trace-codex/scripts/mock-collector.ts @@ -0,0 +1,116 @@ +// Mock Braintrust collector for the smoke test. +// +// The smoke test runs a real Codex session with the trace plugin enabled, but +// points the Braintrust SDK at this local server instead of the real backend. +// It implements just enough of the Braintrust HTTP API for a span flush to +// succeed, and counts the trace rows it receives so the smoke test can assert +// that tracing actually reported something. +// +// The Braintrust SDK (v3.x) makes these calls during initLogger + flush: +// POST {BRAINTRUST_APP_URL}/api/apikey/login -> org_info handshake +// POST {BRAINTRUST_APP_URL}/api/project/register -> resolves the project id +// GET {BRAINTRUST_API_URL}/version -> payload-limit probe (tolerated) +// POST {BRAINTRUST_API_URL}/logs3 -> the actual span data +// Pointing both BRAINTRUST_APP_URL and BRAINTRUST_API_URL at this one origin +// captures the whole flow. +// +// On shutdown (SIGTERM/SIGINT) it writes a JSON summary to MOCK_COLLECTOR_OUT: +// { "logs3Requests": , "totalRows": } +// so the smoke test can read the result after the Codex session ends. + +import { writeFileSync } from "node:fs"; + +const PORT = Number(process.env.MOCK_COLLECTOR_PORT || "53999"); +const OUT = process.env.MOCK_COLLECTOR_OUT || ""; + +let logs3Requests = 0; +let totalRows = 0; + +function log(message: string): void { + process.stderr.write(`mock-collector: ${message}\n`); +} + +function json(body: unknown, status = 200): Response { + return new Response(JSON.stringify(body), { + status, + headers: { "content-type": "application/json" }, + }); +} + +/** Count the trace rows in a /logs3 body: { rows: [...], api_version: 2 }. */ +function countRows(body: unknown): number { + if ( + typeof body === "object" && + body !== null && + Array.isArray((body as { rows?: unknown[] }).rows) + ) { + return (body as { rows: unknown[] }).rows.length; + } + return 0; +} + +const server = Bun.serve({ + port: PORT, + hostname: "127.0.0.1", + async fetch(req) { + const { pathname } = new URL(req.url); + + // Login handshake. api_url/proxy_url are echoed back, but the SDK uses + // BRAINTRUST_API_URL (set by the smoke test) for the data plane regardless. + if (req.method === "POST" && pathname === "/api/apikey/login") { + const self = `http://127.0.0.1:${PORT}`; + return json({ + org_info: [{ id: "smoke-org", name: "smoke", api_url: self, proxy_url: self }], + }); + } + + // Project registration -> returns a project id the rows reference. + if (req.method === "POST" && pathname === "/api/project/register") { + return json({ project: { id: "00000000-0000-0000-0000-000000000000", name: "smoke" } }); + } + + // Payload-limit probe. Tolerated by the SDK; return no limit. + if (req.method === "GET" && pathname === "/version") { + return json({ logs3_payload_max_bytes: null }); + } + + // The actual span data. Count the rows and acknowledge. + if (req.method === "POST" && (pathname === "/logs3" || pathname === "/logs3/overflow")) { + let rows = 0; + try { + rows = countRows(await req.json()); + } catch (err) { + log(`failed to parse ${pathname} body: ${String(err)}`); + } + logs3Requests += 1; + totalRows += rows; + log(`received ${pathname}: ${rows} row(s) (total ${totalRows})`); + return json({}); + } + + // Anything else: 200 with an empty object so the SDK never hard-fails. + log(`unhandled ${req.method} ${pathname}`); + return json({}); + }, +}); + +log(`listening on http://127.0.0.1:${server.port}`); + +function writeSummaryAndExit(): void { + const summary = { logs3Requests, totalRows }; + if (OUT) { + try { + writeFileSync(OUT, JSON.stringify(summary)); + log(`wrote summary to ${OUT}: ${JSON.stringify(summary)}`); + } catch (err) { + log(`failed to write summary to ${OUT}: ${String(err)}`); + } + } else { + log(`summary: ${JSON.stringify(summary)}`); + } + server.stop(true); + process.exit(0); +} + +process.on("SIGTERM", writeSummaryAndExit); +process.on("SIGINT", writeSummaryAndExit); diff --git a/plugins/trace-codex/scripts/smoke-test.sh b/plugins/trace-codex/scripts/smoke-test.sh new file mode 100755 index 0000000..8b6ed54 --- /dev/null +++ b/plugins/trace-codex/scripts/smoke-test.sh @@ -0,0 +1,170 @@ +#!/bin/sh +# End-to-end smoke test: run a real Codex session with the trace plugin enabled, +# pointed at a local mock Braintrust collector, and assert at least one trace row +# was reported. +# +# Usage: +# scripts/smoke-test.sh [VERSION] +# +# VERSION (optional; or the SMOKE_RELEASE_VERSION env var) pins the release whose +# codex-hook binary the plugin launcher downloads, e.g. "0.0.1" or "v0.0.1". When +# unset, the launcher uses the installed plugin manifest version (the normal +# behavior). CI passes the version it just released; locally you can point this +# at any published release. +# +# Prerequisites (the caller is responsible for these): +# - `codex` is on PATH and authenticated for `codex exec` (we pass +# CODEX_API_KEY from OPENAI_API_KEY below). +# - The trace-codex plugin is installed and enabled in Codex (the CI workflow +# installs it from the published release; locally use ./install.sh). +# - Bun is on PATH (used to run the mock collector). +# - OPENAI_API_KEY is set. +# - For a private release repo, GH_TOKEN or GITHUB_TOKEN (or an authenticated +# `gh`) so the launcher can download the binary. +# +# Exit 0 on success (>=1 trace row received), non-zero otherwise. + +set -eu + +# Resolve the plugin dir from this script's location so it works from anywhere. +SCRIPT_DIR=$(CDPATH= cd "$(dirname "$0")" && pwd) +PLUGIN_DIR=$(dirname "$SCRIPT_DIR") + +# Target release version: positional arg wins, then env, else empty (= manifest). +RELEASE_VERSION="${1:-${SMOKE_RELEASE_VERSION:-}}" + +PORT="${MOCK_COLLECTOR_PORT:-53999}" +BASE_URL="http://127.0.0.1:$PORT" + +log() { printf 'smoke: %s\n' "$1" >&2; } +fail() { printf 'smoke: FAIL: %s\n' "$1" >&2; exit 1; } + +# Redact the API key from any text we print (defense-in-depth for the public +# repo: GitHub already masks the secret, but we never want to echo subprocess +# output that might contain it, transformed or otherwise). Reads stdin, writes +# scrubbed text to stderr. A no-op if the key is empty. +scrub() { + if [ -n "${OPENAI_API_KEY:-}" ]; then + sed "s/${OPENAI_API_KEY}/[REDACTED]/g" 2>/dev/null || cat + else + cat + fi +} + +# --- Preconditions --------------------------------------------------------- + +[ -n "${OPENAI_API_KEY:-}" ] || fail "OPENAI_API_KEY is not set" +command -v codex >/dev/null 2>&1 || fail "codex is not on PATH" +command -v bun >/dev/null 2>&1 || fail "bun is not on PATH" + +# Belt-and-suspenders: explicitly mask the key in GitHub Actions logs. This +# covers any reformatted appearance beyond GitHub's automatic secret masking. +# Harmless (and silent) outside Actions. +if [ -n "${GITHUB_ACTIONS:-}" ]; then + printf '::add-mask::%s\n' "$OPENAI_API_KEY" +fi + +WORK=$(mktemp -d) +SUMMARY="$WORK/summary.json" +COLLECTOR_LOG="$WORK/collector.log" +CODEX_LOG="$WORK/codex.log" + +COLLECTOR_PID="" +cleanup() { + # Ask the collector to flush its summary and stop; fall back to KILL. + if [ -n "$COLLECTOR_PID" ] && kill -0 "$COLLECTOR_PID" 2>/dev/null; then + kill -TERM "$COLLECTOR_PID" 2>/dev/null || true + # Give it a moment to write the summary. + for _ in 1 2 3 4 5 6 7 8 9 10; do + kill -0 "$COLLECTOR_PID" 2>/dev/null || break + sleep 0.2 + done + kill -KILL "$COLLECTOR_PID" 2>/dev/null || true + fi +} +trap cleanup EXIT INT TERM + +# --- Start the mock collector ---------------------------------------------- + +log "starting mock collector on $BASE_URL" +MOCK_COLLECTOR_PORT="$PORT" MOCK_COLLECTOR_OUT="$SUMMARY" \ + bun run "$PLUGIN_DIR/scripts/mock-collector.ts" >"$COLLECTOR_LOG" 2>&1 & +COLLECTOR_PID=$! + +# Wait for it to accept connections. +ready=0 +for _ in $(seq 1 50); do + if curl -fsS -o /dev/null --max-time 1 "$BASE_URL/version" 2>/dev/null; then + ready=1 + break + fi + kill -0 "$COLLECTOR_PID" 2>/dev/null || break + sleep 0.2 +done +if [ "$ready" != "1" ]; then + scrub <"$COLLECTOR_LOG" >&2 || true + fail "mock collector never became ready on $BASE_URL" +fi +log "collector ready" + +# --- Run a real Codex session with tracing pointed at the collector -------- + +# These BRAINTRUST_* vars are read by the hook client (inherited from this +# process via Codex) and bundled into the session's reporting config. Pointing +# both URLs at the collector captures the login handshake and the data POST. +export TRACE_TO_BRAINTRUST=true +export BRAINTRUST_API_KEY=smoke-key +export BRAINTRUST_API_URL="$BASE_URL" +export BRAINTRUST_APP_URL="$BASE_URL" +export BRAINTRUST_PROJECT=trace-codex-smoke +# Keep the background event server short-lived so it doesn't linger after CI. +export BRAINTRUST_EVENT_SERVER_IDLE_TIMEOUT_MS=15000 +export BRAINTRUST_EVENT_SERVER_IDLE_CHECK_INTERVAL_MS=1000 + +# Pin the release the launcher downloads, when requested. The launcher strips an +# optional leading 'v', so either form is fine here. +if [ -n "$RELEASE_VERSION" ]; then + export TRACE_CODEX_RELEASE_VERSION="$RELEASE_VERSION" + log "pinning plugin binary to release $RELEASE_VERSION" +else + log "using installed plugin manifest version (no pin)" +fi + +log "running codex exec (say hi)" +# CODEX_API_KEY is supported only by `codex exec` and is set inline for just +# this invocation. --dangerously-bypass-hook-trust runs the plugin's hooks +# without the interactive /hooks trust prompt (one-off automation escape hatch). +set +e +CODEX_API_KEY="$OPENAI_API_KEY" codex exec \ + --skip-git-repo-check \ + --dangerously-bypass-hook-trust \ + --sandbox read-only \ + "say hi" >"$CODEX_LOG" 2>&1 +codex_status=$? +set -e +log "codex exec exited with status $codex_status" +if [ "$codex_status" -ne 0 ]; then + log "--- codex output (key redacted) ---" + scrub <"$CODEX_LOG" >&2 || true +fi + +# The hook client blocks on a synchronous flush at session end, so by the time +# codex exec returns the trace should already be delivered. Stop the collector +# (via cleanup) to make it write its summary, then assert. +cleanup +COLLECTOR_PID="" # already stopped; avoid double-kill in the EXIT trap + +log "--- collector log (key redacted) ---" +scrub <"$COLLECTOR_LOG" >&2 || true + +[ -f "$SUMMARY" ] || fail "collector wrote no summary file" +total_rows=$(sed -n 's/.*"totalRows"[[:space:]]*:[[:space:]]*\([0-9]*\).*/\1/p' "$SUMMARY") +[ -n "$total_rows" ] || fail "could not parse totalRows from summary: $(cat "$SUMMARY")" +log "summary: $(cat "$SUMMARY")" + +if [ "$total_rows" -ge 1 ]; then + log "PASS: received $total_rows trace row(s)" + exit 0 +fi + +fail "expected >=1 trace row, got $total_rows" From 081959775910ed03fc523ed7312240f57f877602 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Sun, 21 Jun 2026 22:29:57 -0600 Subject: [PATCH 02/17] agents.md --- AGENTS.md | 3 +- plugins/trace-codex/AGENTS.md | 144 ++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 plugins/trace-codex/AGENTS.md diff --git a/AGENTS.md b/AGENTS.md index d2016b0..2e4bf58 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,8 +17,9 @@ Key files for the MCP/skills plugin: - `plugins/braintrust-codex-plugin/.mcp.json` — MCP server definition - `plugins/braintrust-codex-plugin/skills/braintrust/` — agent skills exposed through the plugin -Key files for the tracing plugin: +Key files for the tracing plugin (see [`plugins/trace-codex/AGENTS.md`](plugins/trace-codex/AGENTS.md) for its architecture): +- `plugins/trace-codex/AGENTS.md` — architecture and contributor guide for this plugin - `plugins/trace-codex/.codex-plugin/plugin.json` — plugin manifest - `plugins/trace-codex/hooks/hooks.json` — lifecycle hook config - `plugins/trace-codex/src/` — the hook client + event server (compiled to `bin/codex-hook`) diff --git a/plugins/trace-codex/AGENTS.md b/plugins/trace-codex/AGENTS.md new file mode 100644 index 0000000..e1d3a84 --- /dev/null +++ b/plugins/trace-codex/AGENTS.md @@ -0,0 +1,144 @@ +# AGENTS.md — trace-codex + +Guidelines for AI agents working in the `trace-codex` plugin. This is the +opt-in plugin that traces Codex sessions to Braintrust. It is **independent** +from the MCP/skills plugin (`braintrust-codex-plugin`) — do not merge behavior +between them. See the [repo AGENTS.md](../../AGENTS.md) for the monorepo +overview. + +## What this plugin is + +A long-lived, local **event server** plus a short-lived **hook client**, both +compiled into a single `codex-hook` binary. Codex fires lifecycle hooks; each +hook invocation runs the client, which forwards the event to the server, which +turns the stream of events into Braintrust spans (session → turn → tool). + +## Architecture + +Request/data flow: + +``` +Codex hook → bin/codex-hook.sh → codex-hook (client mode) + → POST /enqueue → in-memory FIFO queue + → ProcessorRegistry (one EventProcessor per session) + → Braintrust spans +``` + +Key pieces: + +- `src/index.ts` — entry point. One binary, three modes: `serve` (the + background server), `hook` (default; read one event from stdin, enqueue it), + and `replay` (re-POST a recorded NDJSON session). +- `src/client/` — the hook client. Ensures a server is up (booting a detached + one if needed), POSTs events, and on a terminal event (`Stop`) calls `/flush` + so final spans are delivered before the process tree is torn down. +- `src/server/` — the HTTP server, the FIFO `EventQueue`, routes + (`/enqueue`, `/flush`, `/health`, `/shutdown`), and the idle watchdog. +- `src/processor/` — the generic `ProcessorRegistry` (LRU map of per-session + processors) and the `EventProcessor` interface. +- `src/agents/codex/` — Codex-specific: translate hook payloads into + `EnqueueEvent`s (`event-builder.ts`), build spans (`event-processor.ts`), and + read user settings (`settings.ts`). +- `src/braintrust/` — thin wrapper over the Braintrust SDK (span factory). + +The server is **single-version**: a client whose version doesn't match a running +server bails rather than talking to it. It shuts itself down after an idle +window (default 5 min) or on `/shutdown`. + +The design has three deliberate properties worth understanding before changing +anything: + +### 1. Agent-agnostic "event server" + +The core (`src/server/`, `src/processor/`, `src/braintrust/`) knows nothing +about Codex. It speaks a generic `EnqueueEvent` shape and routes events to a +per-session processor. All Codex-specific knowledge lives in `src/agents/codex/` +and is registered into the generic core at startup. Adding another agent (Claude +Code, opencode, etc.) means adding a sibling `src/agents//` module that +exports the same `Agent` shape — the server, queue, and registry should not need +to change. + +Keep this boundary clean: nothing under `src/server/`, `src/processor/`, or +`src/braintrust/` should reference Codex by name. The generic event server may +eventually be extracted into its own package, so treat any Codex-specific leak +into the generic layers as a bug, not a shortcut. + +### 2. Bun-compiled, cross-platform binaries + +The hook command in `hooks.json` is a fixed, platform-agnostic string that +invokes `bin/codex-hook.sh` (or `.cmd` on Windows), **not** the binary directly. +That launcher resolves and runs the real, platform-specific `codex-hook` binary. + +- `scripts/build.ts` compiles `src/index.ts` with `bun build --compile` for each + target (`darwin-arm64/x64`, `linux-x64/arm64`; Windows slots in but isn't + built yet). `BUILD_HOST_ONLY=1` builds just the host target (used by the + repo-root `install.sh` for local dev installs). +- The compiled binaries are **large and not committed**. The launcher downloads + the matching binary from the plugin's GitHub release on first use and caches + it at `$PLUGIN_ROOT/bin/codex-hook`. Codex wipes `$PLUGIN_ROOT` on every + install/upgrade, so the cache self-invalidates and the next hook re-downloads + the right version. +- Plugin version is the single source of truth in + `.codex-plugin/plugin.json`; the launcher reads it to construct the release + tag (`trace-codex-v`). + +**Hard rule:** the hook must never fail the Codex turn. The launcher and client +log to stderr and exit 0 on any error; the server swallows errors so a turn is +never blocked by tracing. + +### 3. Hooks are non-blocking by default + +Tracing must not degrade the experience of using the coding agent. Every hook +fires synchronously in Codex's path, so blocking work on a hook directly adds +latency to the user's session. The design therefore makes the common path +fire-and-forget: the client POSTs to `/enqueue` and returns immediately, while +the background server does the slow work (Braintrust SDK calls, flushes) off the +critical path. + +We **deliberately block in a few places**, and only where correctness requires +it: + +- On a **terminal event** (`Stop`), the client calls `/flush` and waits for the + server to confirm buffered spans reached Braintrust. Without this, a + short-lived host (e.g. a CI job that ends right after the last turn) would tear + the process tree down before the final spans are delivered. This is the one + routine blocking point, and it's bounded by a timeout (`/flush` gives up rather + than hanging the turn). + +When adding behavior, prefer enqueue-and-return. Reach for a blocking +request/await only when data would otherwise be lost, keep it off the +per-hook hot path where possible, and always bound it so a slow or stuck backend +can't stall the agent. + +## Making changes + +- **Generic vs. agent code**: put agent-specific logic in `src/agents//`; + keep the server/processor/braintrust layers agent-agnostic. +- **Config**: two layers. The agent-specific layer (`src/agents/codex/settings.ts`) + reads the user's `config.json` from `PLUGIN_DATA`, maps its friendly camelCase + keys onto `BRAINTRUST_*` / `BRAINTRUST_EVENT_SERVER_*` env vars (env wins over + the file), and is run by the hook client before it boots the server — so the + spawned server inherits the resolved env. The generic layer (`src/config.ts`) + then reads **only** those env vars and never touches `config.json`. Keep it + that way: config-file parsing is deliberately agent-specific. User-facing + settings are documented in the [README](./README.md); update it (and the + `Settings` map in `settings.ts`) when adding one. +- **Hooks**: `hooks/hooks.json` lists the lifecycle events wired to the client. + Most are forwarded but not yet turned into spans (the processor no-ops the + ones it doesn't handle). +- **Build**: edit `scripts/build.ts` for targets/output; the launcher scripts + (`bin/codex-hook.sh`, `bin/codex-hook.cmd`) for download/exec behavior. + +## Commands + +Run from `plugins/trace-codex/`: + +- `bun test` — run the test suite (tests live next to sources as `*.test.ts`). +- `bun run typecheck` — `tsc --noEmit`. +- `bun run lint` / `bun run check` — Biome lint / lint+format. +- `bun run build` — compile all target binaries (`BUILD_HOST_ONLY=1` for host + only). +- `bun run dev` — run the server in watch mode. + +Add or update tests alongside any behavior change; keep typecheck and lint +clean. From 8d38c8e563c3a9a11c735163060c8ef65038734c Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Sun, 21 Jun 2026 23:11:41 -0600 Subject: [PATCH 03/17] trace tool calls --- .../src/agents/codex/event-processor.test.ts | 138 ++++++++++++++ .../src/agents/codex/event-processor.ts | 175 +++++++++++++++++- .../src/agents/codex/test-helpers.ts | 8 + 3 files changed, 319 insertions(+), 2 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index ab63e57..dffc16b 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -7,6 +7,8 @@ import { CodexEventProcessor } from "./event-processor.ts"; import { assertProducesTrace, configEvent, + postToolUse, + preToolUse, sessionStart, stop, userPromptSubmit, @@ -256,4 +258,140 @@ describe("CodexEventProcessor", () => { }, ); }); + + test("a paired tool call becomes a 'tool' span under its turn", async () => { + await assertProducesTrace( + [ + sessionStart({ model: "gpt-5.5", source: "startup" }), + userPromptSubmit({ prompt: "run it", turn_id: "t1" }), + preToolUse({ + turn_id: "t1", + tool_name: "Bash", + tool_use_id: "call_1", + tool_input: { command: "ls" }, + }), + postToolUse({ + turn_id: "t1", + tool_name: "Bash", + tool_use_id: "call_1", + tool_response: "file.txt", + }), + stop({ turn_id: "t1", last_assistant_message: "done" }), + ], + { + span_attributes: { name: "codex session", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + input: "run it", + output: "done", + ended: true, + children: [ + { + span_attributes: { name: "Bash", type: "tool" }, + input: { command: "ls" }, + output: "file.txt", + metadata: { tool_name: "Bash", tool_use_id: "call_1", turn_id: "t1" }, + ended: true, + children: [], + }, + ], + }, + ], + }, + ); + }); + + test("multiple tool calls in one turn become sibling tool spans", async () => { + await assertProducesTrace( + [ + sessionStart({ model: "gpt-5.5", source: "startup" }), + userPromptSubmit({ prompt: "do two things", turn_id: "t1" }), + preToolUse({ turn_id: "t1", tool_name: "Bash", tool_use_id: "call_1" }), + postToolUse({ turn_id: "t1", tool_use_id: "call_1", tool_response: "a" }), + preToolUse({ turn_id: "t1", tool_name: "apply_patch", tool_use_id: "call_2" }), + postToolUse({ turn_id: "t1", tool_use_id: "call_2", tool_response: "b" }), + stop({ turn_id: "t1", last_assistant_message: "done" }), + ], + { + span_attributes: { name: "codex session", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "Bash", type: "tool" }, output: "a", ended: true }, + { + span_attributes: { name: "apply_patch", type: "tool" }, + output: "b", + ended: true, + }, + ], + }, + ], + }, + ); + }); + + test("an unpaired PreToolUse (no PostToolUse) is closed when its turn ends", async () => { + await assertProducesTrace( + [ + sessionStart({ model: "gpt-5.5", source: "startup" }), + userPromptSubmit({ prompt: "run it", turn_id: "t1" }), + preToolUse({ + turn_id: "t1", + tool_name: "Bash", + tool_use_id: "call_1", + tool_input: { command: "ls" }, + }), + // No PostToolUse for call_1 (e.g. interrupted), then the turn stops. + stop({ turn_id: "t1", last_assistant_message: "done" }), + ], + { + span_attributes: { name: "codex session", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "Bash", type: "tool" }, + input: { command: "ls" }, + // No output (PostToolUse never arrived), but the span is ended. + ended: true, + children: [], + }, + ], + }, + ], + }, + ); + }); + + test("a tool call with no matching open turn span is skipped", async () => { + await assertProducesTrace( + [ + sessionStart({ model: "gpt-5.5", source: "startup" }), + userPromptSubmit({ prompt: "run it", turn_id: "t1" }), + // turn_id "tX" was never opened; the tool span must be skipped. + preToolUse({ turn_id: "tX", tool_name: "Bash", tool_use_id: "call_1" }), + postToolUse({ turn_id: "tX", tool_use_id: "call_1", tool_response: "x" }), + stop({ turn_id: "t1", last_assistant_message: "done" }), + ], + { + span_attributes: { name: "codex session", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [], + }, + ], + }, + ); + }); }); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index dd95c50..764d191 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -3,8 +3,10 @@ // builds its own per-session SpanFactory. On SessionStart it opens a Braintrust // root span. Each turn is a child span: UserPromptSubmit opens it (input = the // user prompt), and the matching Stop closes it (output = the final assistant -// message). The first Stop also ends the root span. Buffered spans are delivered -// via flush(). +// message). Tool calls are grandchild spans: PreToolUse opens a "tool" span under +// the matching turn (input = tool_input), and the matching PostToolUse (paired by +// tool_use_id) closes it (output = tool_response). The first Stop also ends the +// root span. Buffered spans are delivered via flush(). import { defaultSpanFactoryProvider, @@ -20,6 +22,8 @@ import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; const SESSION_START = "SessionStart"; const USER_PROMPT_SUBMIT = "UserPromptSubmit"; +const PRE_TOOL_USE = "PreToolUse"; +const POST_TOOL_USE = "PostToolUse"; const STOP = "Stop"; /** @@ -46,6 +50,11 @@ export class CodexEventProcessor implements EventProcessor { // Turn spans currently open, keyed by turn_id. A turn is opened by // UserPromptSubmit and closed (ended) by the matching Stop. private readonly openTurns = new Map(); + // Tool spans currently open, keyed by tool_use_id. A tool span is opened by + // PreToolUse (as a child of its turn) and closed by the matching PostToolUse. + // The owning turn_id is tracked so any still-open tool spans are ended when + // their turn (or the root) ends. + private readonly openTools = new Map(); constructor( queueId: string | null, @@ -86,6 +95,14 @@ export class CodexEventProcessor implements EventProcessor { this.startTurnSpan(event); return; } + if (event.eventName === PRE_TOOL_USE) { + this.startToolSpan(event); + return; + } + if (event.eventName === POST_TOOL_USE) { + this.endToolSpan(event); + return; + } if (event.eventName === STOP) { this.endTurnSpan(event); this.endRootSpan(); @@ -204,6 +221,143 @@ export class CodexEventProcessor implements EventProcessor { } } + // Open a child span for a tool call on PreToolUse, parented to the turn span + // matching turn_id. Keyed by tool_use_id so the matching PostToolUse can close + // it. The tool_input is the span's input. If the turn span isn't open (which + // shouldn't happen — tool calls always occur within a turn) or there's no + // tool_use_id to pair on, we log a warning and skip rather than guess a parent. + private startToolSpan(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + const toolUseId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined; + const toolName = typeof data.tool_name === "string" ? data.tool_name : undefined; + const toolInput = data.tool_input; + + if (toolUseId === undefined) { + this.logger.warn("codex processor: PreToolUse without tool_use_id; skipping tool span", { + queueId: this.queueId, + turnId, + }); + return; + } + + if (turnId === undefined) { + this.logger.warn("codex processor: PreToolUse without turn_id; skipping tool span", { + queueId: this.queueId, + toolUseId, + }); + return; + } + + const turnSpan = this.openTurns.get(turnId); + if (turnSpan === undefined) { + this.logger.warn("codex processor: PreToolUse with no open turn span; skipping tool span", { + queueId: this.queueId, + turnId, + toolUseId, + }); + return; + } + + if (this.openTools.has(toolUseId)) { + this.logger.warn("codex processor: duplicate tool_use_id; keeping existing tool span", { + queueId: this.queueId, + toolUseId, + }); + return; + } + + try { + const toolSpan = turnSpan.startSpan({ + name: toolName ?? "tool", + type: "tool", + event: { + input: toolInput, + metadata: { tool_name: toolName, tool_use_id: toolUseId, turn_id: turnId }, + }, + }); + this.openTools.set(toolUseId, { span: toolSpan, turnId }); + this.logger.info("codex processor: opened tool span", { + queueId: this.queueId, + turnId, + toolUseId, + spanId: toolSpan.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open tool span", { + queueId: this.queueId, + turnId, + toolUseId, + error: String(err), + }); + } + } + + // Close the tool span matching the PostToolUse's tool_use_id, recording the + // tool_response as the span's output. + private endToolSpan(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const toolUseId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined; + const output = data.tool_response; + + if (toolUseId === undefined) { + this.logger.debug("codex processor: PostToolUse without tool_use_id; no tool span to close", { + queueId: this.queueId, + }); + return; + } + + const entry = this.openTools.get(toolUseId); + if (entry === undefined) { + this.logger.debug("codex processor: PostToolUse with no open tool span", { + queueId: this.queueId, + toolUseId, + }); + return; + } + + try { + entry.span.log({ output }); + entry.span.end(); + this.logger.info("codex processor: ended tool span", { queueId: this.queueId, toolUseId }); + } catch (err) { + this.logger.error("codex processor: failed to end tool span", { + queueId: this.queueId, + toolUseId, + error: String(err), + }); + } finally { + this.openTools.delete(toolUseId); + } + } + + // End (without output) any still-open tool spans owned by the given turn. A + // tool span can be left open when its PostToolUse never arrives (e.g. the call + // was interrupted by a permission prompt and re-issued under a new + // tool_use_id). Called when the turn ends so dangling tool spans get closed. + private endOpenToolSpansForTurn(turnId: string): void { + for (const [toolUseId, entry] of this.openTools) { + if (entry.turnId !== turnId) continue; + try { + entry.span.end(); + this.logger.info("codex processor: ended dangling tool span on turn end", { + queueId: this.queueId, + turnId, + toolUseId, + }); + } catch (err) { + this.logger.error("codex processor: failed to end dangling tool span", { + queueId: this.queueId, + turnId, + toolUseId, + error: String(err), + }); + } finally { + this.openTools.delete(toolUseId); + } + } + } + // Close the turn span matching the Stop's turn_id, recording the final // assistant message as the span's output. private endTurnSpan(event: EnqueueEvent): void { @@ -228,6 +382,9 @@ export class CodexEventProcessor implements EventProcessor { return; } + // Close any tool spans still open under this turn before ending the turn. + this.endOpenToolSpansForTurn(turnId); + try { turnSpan.log({ output }); turnSpan.end(); @@ -248,6 +405,20 @@ export class CodexEventProcessor implements EventProcessor { private endRootSpan(): void { if (this.rootSpan === null || this.rootEnded) return; this.rootEnded = true; + // Backstop: close any tool spans still open (e.g. ones whose turn never saw + // a matching Stop) so nothing is left dangling when the session ends. + for (const [toolUseId, entry] of this.openTools) { + try { + entry.span.end(); + } catch (err) { + this.logger.error("codex processor: failed to end dangling tool span on root end", { + queueId: this.queueId, + toolUseId, + error: String(err), + }); + } + } + this.openTools.clear(); try { this.rootSpan.end(); this.logger.info("codex processor: ended root span", { queueId: this.queueId }); diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index eaa0e57..3fca991 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -37,6 +37,14 @@ export function userPromptSubmit(data: Record = {}): EnqueueEve return codexEvent("UserPromptSubmit", { session_id: "session-1", ...data }); } +export function preToolUse(data: Record = {}): EnqueueEvent { + return codexEvent("PreToolUse", { session_id: "session-1", ...data }); +} + +export function postToolUse(data: Record = {}): EnqueueEvent { + return codexEvent("PostToolUse", { session_id: "session-1", ...data }); +} + export function stop(data: Record = {}): EnqueueEvent { return codexEvent("Stop", { session_id: "session-1", ...data }); } From fe6f46e00535b4fbee05647e5d34850c9f680222 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 11:18:44 -0600 Subject: [PATCH 04/17] use codex session log to implement trace-codex hook data lacks LLM spans and timestamps, so we must use the session transcript to build a useful trace --- .../src/agents/codex/event-processor.test.ts | 748 +++++++++++++----- .../src/agents/codex/event-processor.ts | 733 ++++++++++++----- .../src/agents/codex/test-helpers.ts | 237 +++++- .../src/agents/codex/transcript-reader.ts | 155 ++++ .../src/agents/codex/transcript.test.ts | 47 ++ .../src/agents/codex/transcript.ts | 81 ++ plugins/trace-codex/src/test-helpers.ts | 14 + 7 files changed, 1567 insertions(+), 448 deletions(-) create mode 100644 plugins/trace-codex/src/agents/codex/transcript-reader.ts create mode 100644 plugins/trace-codex/src/agents/codex/transcript.test.ts create mode 100644 plugins/trace-codex/src/agents/codex/transcript.ts diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index dffc16b..0f5cac8 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -1,171 +1,478 @@ import { describe, expect, test } from "bun:test"; import type { ReportingConfig, SpanFactory } from "../../braintrust/logger.ts"; import type { EnqueueEvent } from "../../server/routes.ts"; -import { createTestLogger, withCapturedTrace } from "../../test-helpers.ts"; +import { createTestLogger, spansToTree, withCapturedTrace } from "../../test-helpers.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; import { assertProducesTrace, configEvent, - postToolUse, - preToolUse, + customToolCall, + customToolCallOutput, + FakeTranscriptReader, + functionCall, + functionCallOutput, + sessionMeta, sessionStart, stop, - userPromptSubmit, + taskComplete, + taskStarted, + transcript, + turnContext, + userMessage, } from "./test-helpers.ts"; +import type { + TranscriptReader, + TranscriptReadResult, + TranscriptWaitResult, + WaitForOptions, +} from "./transcript-reader.ts"; -describe("CodexEventProcessor", () => { - test("SessionStart produces a root span", async () => { +// Helpers below build a session as an ordered list of transcript writes and +// hooks. The final `stop()` hook triggers the catch-up that turns the buffered +// transcript records into spans (waiting for the task_complete sentinel). + +describe("CodexEventProcessor: root span", () => { + test("session_meta opens a root span named after the cwd basename", async () => { await assertProducesTrace( - [sessionStart({ model: "gpt-5.5", cwd: "/work", source: "startup" })], + [ + sessionStart({ source: "startup" }), + sessionMeta({ cwd: "/whatever/myapp", id: "session-1" }), + turnContext({ model: "gpt-5.5" }), + stop({ turn_id: "t1" }), + ], { - span_attributes: { name: "codex: work", type: "task" }, - input: { model: "gpt-5.5", cwd: "/work", source: "startup" }, - metadata: { session_id: "session-1", model: "gpt-5.5" }, + span_attributes: { name: "codex: myapp", type: "task" }, + metadata: { session_id: "session-1", model: "gpt-5.5", source: "startup" }, children: [], }, ); }); - test("root span is named after the project directory (basename of cwd)", async () => { - await assertProducesTrace([sessionStart({ cwd: "/whatever/myapp" })], { - span_attributes: { name: "codex: myapp", type: "task" }, - children: [], - }); - }); - test("root span name handles a trailing slash in cwd", async () => { - await assertProducesTrace([sessionStart({ cwd: "/whatever/myapp/" })], { - span_attributes: { name: "codex: myapp", type: "task" }, - children: [], - }); + await assertProducesTrace( + [sessionStart(), sessionMeta({ cwd: "/whatever/myapp/" }), stop({ turn_id: "t1" })], + { span_attributes: { name: "codex: myapp", type: "task" }, children: [] }, + ); }); test("root span falls back to 'codex session' when cwd is missing", async () => { - await assertProducesTrace([sessionStart({})], { + await assertProducesTrace([sessionStart(), sessionMeta({}), stop({ turn_id: "t1" })], { span_attributes: { name: "codex session", type: "task" }, children: [], }); }); - test("a session with no Stop stays active (no end time)", async () => { + test("model is backfilled from turn_context", async () => { await assertProducesTrace( - [sessionStart({ model: "gpt-5.5", source: "startup" }), userPromptSubmit({ prompt: "hi" })], + [sessionStart(), sessionMeta({ cwd: "/work" }), turnContext({ model: "gpt-5.5" }), stop({})], { - span_attributes: { name: "codex session", type: "task" }, - ended: false, + span_attributes: { name: "codex: work", type: "task" }, + metadata: { model: "gpt-5.5" }, children: [], }, ); }); - test("Stop ends the root span", async () => { + test("root carries source and permission_mode from the SessionStart hook", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "hi" }), + sessionStart({ source: "resume", permission_mode: "acceptEdits" }), + sessionMeta({ cwd: "/work" }), stop({}), ], { - span_attributes: { name: "codex session", type: "task" }, - ended: true, + span_attributes: { name: "codex: work", type: "task" }, + metadata: { source: "resume", permission_mode: "acceptEdits" }, children: [], }, ); }); - test("ends on the first Stop; later events do not reopen it", async () => { + test("duplicate session_meta still yields a single root span", async () => { + await assertProducesTrace( + [sessionStart(), sessionMeta({ cwd: "/work" }), sessionMeta({ cwd: "/work" }), stop({})], + { span_attributes: { name: "codex: work", type: "task" }, children: [] }, + ); + }); + + test("Stop ends the root span", async () => { + await assertProducesTrace([sessionStart(), sessionMeta({ cwd: "/work" }), stop({})], { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [], + }); + }); +}); + +describe("CodexEventProcessor: turn spans", () => { + test("task_started/task_complete become a turn span with prompt input and assistant output", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "hi", turn_id: "t1" }), - stop({ turn_id: "t1", last_assistant_message: "hello" }), - userPromptSubmit({ prompt: "another thing", turn_id: "t2" }), - stop({ turn_id: "t2", last_assistant_message: "ok" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "what's your name?" }), + taskComplete({ turn_id: "t1", last_agent_message: "I'm Codex." }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, children: [ - { span_attributes: { name: "turn: t1", type: "task" } }, - { span_attributes: { name: "turn: t2", type: "task" } }, + { + span_attributes: { name: "turn: t1", type: "task" }, + input: "what's your name?", + output: "I'm Codex.", + metadata: { turn_id: "t1" }, + ended: true, + children: [], + }, ], }, ); }); - test("a turn becomes a child span with prompt input and assistant output", async () => { + test("two turns become two ordered child spans", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "what's your name?", turn_id: "t1" }), - stop({ turn_id: "t1", last_assistant_message: "I'm Codex." }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "one" }), + taskComplete({ turn_id: "t1", last_agent_message: "first" }), + taskStarted({ turn_id: "t2" }), + userMessage({ message: "two" }), + taskComplete({ turn_id: "t2", last_agent_message: "second" }), + stop({ turn_id: "t2" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, children: [ + { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true }, + { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true }, + ], + }, + ); + }); + + test("a turn whose task_complete lands after its Stop is closed by the final flush catch-up", async () => { + // Regression: the last turn's task_complete can be written just after its + // Stop hook fires, so the Stop's bounded wait misses it. The final flush + // read must still pick it up and close the turn. Here task_complete is + // ordered AFTER stop() in the list (so it is only present at flush time). + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "one" }), + taskComplete({ turn_id: "t1", last_agent_message: "first" }), + stop({ turn_id: "t1" }), + taskStarted({ turn_id: "t2" }), + userMessage({ message: "two" }), + // Tool call/output present at Stop time, but task_complete is not yet. + functionCall({ turn_id: "t2", name: "exec_command", call_id: "c2" }), + functionCallOutput({ call_id: "c2", output: "ok" }), + stop({ turn_id: "t2" }), + // task_complete written after the Stop — only seen by the flush catch-up. + taskComplete({ turn_id: "t2", last_agent_message: "second" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true }, { - span_attributes: { name: "turn: t1", type: "task" }, - input: "what's your name?", - output: "I'm Codex.", + span_attributes: { name: "turn: t2", type: "task" }, + output: "second", + ended: true, + children: [{ span_attributes: { name: "exec_command", type: "tool" }, ended: true }], + }, + ], + }, + ); + }); + + test("each turn closes when its own Stop arrives (per-turn Stops)", async () => { + // Mirrors real Codex: Stop fires per turn. Turn 1 is processed/closed on its + // own Stop; turn 2 opens after and closes on the second Stop. Regression for + // the bug where the second turn never closed. + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "one" }), + taskComplete({ turn_id: "t1", last_agent_message: "first" }), + stop({ turn_id: "t1" }), + taskStarted({ turn_id: "t2" }), + userMessage({ message: "two" }), + taskComplete({ turn_id: "t2", last_agent_message: "second" }), + stop({ turn_id: "t2" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, output: "first", ended: true }, + { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true }, + ], + }, + ); + }); + + test("a turn still open when the root ends is closed by the root-end backstop", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "hi" }), + // No task_complete; the Stop hook ends the root and closes the turn. + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, input: "hi", ended: true }, + ], + }, + ); + }); +}); + +describe("CodexEventProcessor: span timing (from transcript timestamps)", () => { + // 2026-01-01T00:00:10Z = 1767225610, :12Z = 1767225612, :15Z = 1767225615. + test("turn and tool spans get start/end from their transcript record timestamps", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:12Z", + payload: { + type: "function_call", + name: "exec_command", + call_id: "c1", metadata: { turn_id: "t1" }, + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:13Z", + payload: { type: "function_call_output", call_id: "c1", output: "ok" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:15Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + // Root ends on first Stop at the turn's completion time. + metrics: { start: 1767225610, end: 1767225615 }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + metrics: { start: 1767225611, end: 1767225615 }, ended: true, + children: [ + { + span_attributes: { name: "exec_command", type: "tool" }, + metrics: { start: 1767225612, end: 1767225613 }, + ended: true, + }, + ], }, ], }, ); }); +}); - test("an open turn with no matching Stop stays active", async () => { +describe("CodexEventProcessor: tool spans", () => { + test("a function_call/output pair becomes a tool span under its turn", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "hi", turn_id: "t1" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "run it" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "call_1", + arguments: '{"cmd":"ls"}', + }), + functionCallOutput({ call_id: "call_1", output: "file.txt" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, - ended: false, + span_attributes: { name: "codex: work", type: "task" }, + ended: true, children: [ { span_attributes: { name: "turn: t1", type: "task" }, - input: "hi", - ended: false, + output: "done", + ended: true, + children: [ + { + span_attributes: { name: "exec_command", type: "tool" }, + input: '{"cmd":"ls"}', + output: "file.txt", + metadata: { tool_name: "exec_command", call_id: "call_1", turn_id: "t1" }, + ended: true, + children: [], + }, + ], }, ], }, ); }); - test("a Stop with no matching turn does not create a turn span", async () => { + test("custom_tool_call (apply_patch) becomes a tool span using the raw name", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - stop({ turn_id: "nope", last_assistant_message: "orphan" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + customToolCall({ + turn_id: "t1", + name: "apply_patch", + call_id: "call_2", + input: "*** Begin Patch", + }), + customToolCallOutput({ call_id: "call_2", output: "Success" }), + taskComplete({ turn_id: "t1", last_agent_message: "ok" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, - children: [], + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "apply_patch", type: "tool" }, + input: "*** Begin Patch", + output: "Success", + ended: true, + }, + ], + }, + ], }, ); }); - test("duplicate SessionStart still yields a single root span", async () => { + test("multiple tool calls in one turn become ordered sibling tool spans", async () => { await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - sessionStart({ model: "gpt-5.5", source: "startup" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1" }), + functionCallOutput({ call_id: "c1", output: "a" }), + customToolCall({ turn_id: "t1", name: "apply_patch", call_id: "c2" }), + customToolCallOutput({ call_id: "c2", output: "b" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, - children: [], + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "exec_command", type: "tool" }, output: "a", ended: true }, + { span_attributes: { name: "apply_patch", type: "tool" }, output: "b", ended: true }, + ], + }, + ], + }, + ); + }); + + test("an unpaired tool call (no output) is closed when its turn ends", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1", arguments: "{}" }), + // No output for c1; the turn completes anyway. + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "exec_command", type: "tool" }, + input: "{}", + ended: true, + children: [], + }, + ], + }, + ], + }, + ); + }); + + test("a tool call with no matching open turn span is skipped", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + // turn_id "tX" was never opened; the tool span must be skipped. + functionCall({ turn_id: "tX", name: "exec_command", call_id: "c1" }), + functionCallOutput({ call_id: "c1", output: "x" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [] }, + ], }, ); }); +}); - test("a config event builds the per-session factory with its config", () => { - // Spy provider records the config it was asked to build a factory for. +describe("CodexEventProcessor: config / master switch", () => { + test("a config event builds the per-session factory with its config", async () => { let seen: ReportingConfig | undefined; const fake: SpanFactory = { startSpan: () => @@ -179,7 +486,8 @@ describe("CodexEventProcessor", () => { return fake; }; - const processor = new CodexEventProcessor("sess-1", createTestLogger(), provider); + const reader = new FakeTranscriptReader(); + const processor = new CodexEventProcessor("sess-1", createTestLogger(), provider, reader); const cfg: EnqueueEvent = { queueId: "sess-1", eventSource: "codex-hook", @@ -192,9 +500,16 @@ describe("CodexEventProcessor", () => { traceToBraintrust: true, }, }; - processor.process(cfg); - // Factory is built lazily on first span (the SessionStart). - processor.process(sessionStart({ session_id: "sess-1", model: "gpt-5.5" })); + await processor.process(cfg); + // Factory is built lazily on first span (session_meta). + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:00Z", + type: "session_meta", + payload: { id: "sess-1", cwd: "/work" }, + }), + ); + await processor.process(sessionStart({ session_id: "sess-1" })); expect(seen).toEqual({ project: "team-project", @@ -208,9 +523,14 @@ describe("CodexEventProcessor", () => { test("root span metadata carries the configured project", async () => { await assertProducesTrace( - [configEvent({ project: "team-project" }), sessionStart({ model: "gpt-5.5" })], + [ + configEvent({ project: "team-project" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + stop({}), + ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, metadata: { project: "team-project" }, children: [], }, @@ -218,20 +538,32 @@ describe("CodexEventProcessor", () => { }); test("when tracing is disabled, no spans are produced", async () => { + const reader = new FakeTranscriptReader(); const trace = withCapturedTrace(); try { - const processor = new CodexEventProcessor("s", createTestLogger(), () => trace.spanFactory); + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); // Config event WITHOUT traceToBraintrust (defaults off). - processor.process({ + await processor.process({ queueId: "s", eventSource: "codex-hook", eventSourceVersion: null, eventName: CODEX_CONFIG_EVENT, eventData: { project: "p" }, }); - processor.process(sessionStart({ session_id: "s" })); - processor.process(userPromptSubmit({ session_id: "s", turn_id: "t1", prompt: "hi" })); - processor.process(stop({ session_id: "s", turn_id: "t1", last_assistant_message: "yo" })); + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:00Z", + type: "session_meta", + payload: { id: "s", cwd: "/work" }, + }), + ); + await processor.process(sessionStart({ session_id: "s" })); + await processor.process(stop({ session_id: "s", turn_id: "t1" })); await processor.flush(); const spans = await trace.drain(); @@ -248,150 +580,166 @@ describe("CodexEventProcessor", () => { project: "team-project", additionalMetadata: { team: "platform", model: "SHOULD_BE_OVERRIDDEN" }, }), - sessionStart({ model: "gpt-5.5" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + stop({}), ], { - span_attributes: { name: "codex session", type: "task" }, - // team comes from additionalMetadata; model is the standard key (wins). + span_attributes: { name: "codex: work", type: "task" }, metadata: { team: "platform", model: "gpt-5.5", project: "team-project" }, children: [], }, ); }); +}); - test("a paired tool call becomes a 'tool' span under its turn", async () => { +describe("CodexEventProcessor: transcript catch-up plumbing", () => { + test("each hook reads from the advancing offset (each line read once)", async () => { + const reader = new FakeTranscriptReader(); await assertProducesTrace( [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "run it", turn_id: "t1" }), - preToolUse({ - turn_id: "t1", - tool_name: "Bash", - tool_use_id: "call_1", - tool_input: { command: "ls" }, - }), - postToolUse({ - turn_id: "t1", - tool_name: "Bash", - tool_use_id: "call_1", - tool_response: "file.txt", - }), - stop({ turn_id: "t1", last_assistant_message: "done" }), + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "hi" }), + taskComplete({ turn_id: "t1", last_agent_message: "yo" }), + stop({ turn_id: "t1" }), ], { - span_attributes: { name: "codex session", type: "task" }, + span_attributes: { name: "codex: work", type: "task" }, ended: true, - children: [ - { - span_attributes: { name: "turn: t1", type: "task" }, - input: "run it", - output: "done", - ended: true, - children: [ - { - span_attributes: { name: "Bash", type: "tool" }, - input: { command: "ls" }, - output: "file.txt", - metadata: { tool_name: "Bash", tool_use_id: "call_1", turn_id: "t1" }, - ended: true, - children: [], - }, - ], - }, - ], + children: [{ span_attributes: { name: "turn: t1", type: "task" }, ended: true }], }, + { reader }, ); - }); - test("multiple tool calls in one turn become sibling tool spans", async () => { - await assertProducesTrace( - [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "do two things", turn_id: "t1" }), - preToolUse({ turn_id: "t1", tool_name: "Bash", tool_use_id: "call_1" }), - postToolUse({ turn_id: "t1", tool_use_id: "call_1", tool_response: "a" }), - preToolUse({ turn_id: "t1", tool_name: "apply_patch", tool_use_id: "call_2" }), - postToolUse({ turn_id: "t1", tool_use_id: "call_2", tool_response: "b" }), - stop({ turn_id: "t1", last_assistant_message: "done" }), - ], - { - span_attributes: { name: "codex session", type: "task" }, - ended: true, - children: [ - { - span_attributes: { name: "turn: t1", type: "task" }, - ended: true, - children: [ - { span_attributes: { name: "Bash", type: "tool" }, output: "a", ended: true }, - { - span_attributes: { name: "apply_patch", type: "tool" }, - output: "b", - ended: true, - }, - ], - }, - ], - }, - ); + // Reads: SessionStart (readFrom), Stop (waitFor). The injected config event + // has no transcript_path, so it does not read. + expect(reader.readOffsets.length).toBeGreaterThanOrEqual(2); + for (let i = 1; i < reader.readOffsets.length; i++) { + expect(reader.readOffsets[i]).toBeGreaterThanOrEqual(reader.readOffsets[i - 1] as number); + } + expect(reader.readOffsets[0]).toBe(0); }); - test("an unpaired PreToolUse (no PostToolUse) is closed when its turn ends", async () => { + test("a malformed transcript line is skipped without breaking handling", async () => { + const reader = new FakeTranscriptReader(); + reader.append("this is not valid json"); await assertProducesTrace( - [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "run it", turn_id: "t1" }), - preToolUse({ - turn_id: "t1", - tool_name: "Bash", - tool_use_id: "call_1", - tool_input: { command: "ls" }, - }), - // No PostToolUse for call_1 (e.g. interrupted), then the turn stops. - stop({ turn_id: "t1", last_assistant_message: "done" }), - ], - { - span_attributes: { name: "codex session", type: "task" }, - ended: true, - children: [ - { - span_attributes: { name: "turn: t1", type: "task" }, - ended: true, - children: [ - { - span_attributes: { name: "Bash", type: "tool" }, - input: { command: "ls" }, - // No output (PostToolUse never arrived), but the span is ended. - ended: true, - children: [], - }, - ], - }, - ], - }, + [sessionStart(), sessionMeta({ cwd: "/work" }), stop({ turn_id: "t1" })], + { span_attributes: { name: "codex: work", type: "task" }, ended: true, children: [] }, + { reader }, ); }); - test("a tool call with no matching open turn span is skipped", async () => { - await assertProducesTrace( - [ - sessionStart({ model: "gpt-5.5", source: "startup" }), - userPromptSubmit({ prompt: "run it", turn_id: "t1" }), - // turn_id "tX" was never opened; the tool span must be skipped. - preToolUse({ turn_id: "tX", tool_name: "Bash", tool_use_id: "call_1" }), - postToolUse({ turn_id: "tX", tool_use_id: "call_1", tool_response: "x" }), - stop({ turn_id: "t1", last_assistant_message: "done" }), - ], - { - span_attributes: { name: "codex session", type: "task" }, - ended: true, - children: [ - { - span_attributes: { name: "turn: t1", type: "task" }, - ended: true, - children: [], - }, - ], - }, + test("flush polls the transcript until an open turn's task_complete appears", async () => { + // A reader that withholds its last appended line until a later read, modeling + // a task_complete that lands after the terminal Stop. flush() must poll until + // it appears and then close the turn. + class DelayedRevealReader implements TranscriptReader { + private lines: string[] = []; + private withheld: string | null = null; + private readsUntilReveal = 0; + + append(line: string): void { + this.lines.push(line); + } + + /** Append a line that only becomes visible after `afterReads` more reads. */ + withhold(line: string, afterReads: number): void { + this.withheld = line; + this.readsUntilReveal = afterReads; + } + + private visible(): string[] { + return this.lines; + } + + readFrom(_path: string, offset: number): TranscriptReadResult { + // Reveal the withheld line once enough reads have happened. + if (this.withheld !== null) { + if (this.readsUntilReveal <= 0) { + this.lines.push(this.withheld); + this.withheld = null; + } else { + this.readsUntilReveal -= 1; + } + } + const visible = this.visible(); + const buf = visible.length > 0 ? `${visible.join("\n")}\n` : ""; + const from = offset > buf.length ? 0 : offset; + const slice = buf.slice(from); + const out: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + out.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines: out, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + return { lines, offset: next, sentinelFound: lines.some(predicate) }; + } + } + + const reader = new DelayedRevealReader(); + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: "s", cwd: "/work" }, + }), + ); + reader.append( + JSON.stringify({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "t1" }, + }), + ); + // task_complete is withheld: not visible at the Stop's waitFor, appears later. + reader.withhold( + JSON.stringify({ + timestamp: "2026-01-01T00:00:15Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + 1, ); + + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + await processor.process(sessionStart({ session_id: "s" })); + // Stop's bounded wait won't see task_complete yet; turn stays open. + await processor.process(stop({ session_id: "s", turn_id: "t1" })); + // flush polls until the withheld task_complete is revealed and closes t1. + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + const turn = tree?.children[0]; + expect(turn?.name).toBe("turn: t1"); + expect(turn?.metrics?.end).toBe(1767225615); // 2026-01-01T00:00:15Z + } finally { + trace.cleanup(); + } }); }); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 764d191..d215f91 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -1,12 +1,28 @@ -// Processor for Codex events. A leading config event (same queueId) configures -// how this session reports to Braintrust (project/credentials), so the processor -// builds its own per-session SpanFactory. On SessionStart it opens a Braintrust -// root span. Each turn is a child span: UserPromptSubmit opens it (input = the -// user prompt), and the matching Stop closes it (output = the final assistant -// message). Tool calls are grandchild spans: PreToolUse opens a "tool" span under -// the matching turn (input = tool_input), and the matching PostToolUse (paired by -// tool_use_id) closes it (output = tool_response). The first Stop also ends the -// root span. Buffered spans are delivered via flush(). +// Processor for a single Codex session. Spans are built from the session +// transcript file (the "rollout"), which is the source of truth for everything +// inside a session: the root, each turn, and each tool call, in true execution +// order. The Codex lifecycle hooks drive when we read the transcript and supply +// a few fields the transcript lacks. +// +// Two ordered inputs: +// - Hooks: `process(event)` is called once per hook. A leading config event +// (same queueId) configures reporting. Each hook triggers a transcript +// "catch-up". A SessionStart hook contributes `source`/`permission_mode` to +// the root span (the transcript does not record these). A Stop hook is the +// turn-terminal signal: before catching up we wait (bounded) for the turn's +// `task_complete` to be written, so the final spans aren't truncated. +// - Transcript: each new record is turned into spans by `processTranscriptEvent`: +// session_meta -> open the root span (start time = its timestamp) +// task_started -> open a turn span (child of root) +// user_message -> set the open turn's input (the prompt) +// function_call / custom_tool_call / tool_search_call +// -> open a tool span (child of its turn), keyed by call_id +// *_output -> close the matching tool span (output) +// task_complete -> set the turn's output and close it +// +// Span start times come from transcript timestamps, so sibling spans (LLM and +// tool calls, later) render in execution order. Buffered spans are delivered via +// flush(). Everything is wrapped so tracing can never break a Codex turn. import { defaultSpanFactoryProvider, @@ -19,13 +35,52 @@ import type { Logger } from "../../log.ts"; import type { EventProcessor } from "../../processor/event-processor.ts"; import type { EnqueueEvent } from "../../server/routes.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; +import { + EVT_TASK_COMPLETE, + EVT_TASK_STARTED, + EVT_USER_MESSAGE, + ITEM_CUSTOM_TOOL_CALL, + ITEM_CUSTOM_TOOL_CALL_OUTPUT, + ITEM_FUNCTION_CALL, + ITEM_FUNCTION_CALL_OUTPUT, + ITEM_TOOL_SEARCH_CALL, + ITEM_TOOL_SEARCH_OUTPUT, + parseTranscriptLine, + RECORD_EVENT_MSG, + RECORD_RESPONSE_ITEM, + RECORD_SESSION_META, + RECORD_TURN_CONTEXT, + type TranscriptRecord, +} from "./transcript.ts"; +import { defaultTranscriptReader, type TranscriptReader } from "./transcript-reader.ts"; const SESSION_START = "SessionStart"; -const USER_PROMPT_SUBMIT = "UserPromptSubmit"; -const PRE_TOOL_USE = "PreToolUse"; -const POST_TOOL_USE = "PostToolUse"; const STOP = "Stop"; +// Bound on how long a Stop hook waits for the turn's task_complete to appear in +// the transcript before giving up and emitting whatever exists. Small so a slow +// or stuck writer can never stall the Codex turn. +const SENTINEL_TIMEOUT_MS = 2000; +const SENTINEL_INTERVAL_MS = 25; + +// Bound on how long flush() polls the transcript for still-open turns to close +// (their task_complete can land slightly after the terminal Stop). Larger than +// the per-hook wait since flush is off the Codex hot path, but still bounded so +// a stuck writer can't hang shutdown. +const FLUSH_TIMEOUT_MS = 5000; +const FLUSH_INTERVAL_MS = 50; + +const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +/** Tool-call response_item subtypes that OPEN a tool span. */ +const TOOL_CALL_TYPES = new Set([ITEM_FUNCTION_CALL, ITEM_CUSTOM_TOOL_CALL, ITEM_TOOL_SEARCH_CALL]); +/** Tool-call response_item subtypes that CLOSE a tool span (by call_id). */ +const TOOL_OUTPUT_TYPES = new Set([ + ITEM_FUNCTION_CALL_OUTPUT, + ITEM_CUSTOM_TOOL_CALL_OUTPUT, + ITEM_TOOL_SEARCH_OUTPUT, +]); + /** * The final path segment of `cwd`, used to name the root span (e.g. * "/whatever/myapp" -> "myapp"). Handles trailing separators and both POSIX and @@ -38,6 +93,19 @@ function projectDirName(cwd: string | undefined): string | undefined { return segments.length > 0 ? segments[segments.length - 1] : undefined; } +/** Parse an ISO timestamp to Unix seconds (Braintrust's start time unit). */ +function isoToUnixSeconds(ts: unknown): number | undefined { + if (typeof ts !== "string") return undefined; + const ms = Date.parse(ts); + return Number.isNaN(ms) ? undefined : ms / 1000; +} + +/** Extra root-span fields the transcript doesn't record; supplied by the hook. */ +interface RootEnrichment { + source?: string; + permissionMode?: string; +} + export class CodexEventProcessor implements EventProcessor { private readonly logger: Logger; private readonly queueId: string | null; @@ -48,22 +116,41 @@ export class CodexEventProcessor implements EventProcessor { private rootSpan: Span | null = null; private rootEnded = false; // Turn spans currently open, keyed by turn_id. A turn is opened by - // UserPromptSubmit and closed (ended) by the matching Stop. + // task_started and closed by the matching task_complete. private readonly openTurns = new Map(); - // Tool spans currently open, keyed by tool_use_id. A tool span is opened by - // PreToolUse (as a child of its turn) and closed by the matching PostToolUse. - // The owning turn_id is tracked so any still-open tool spans are ended when - // their turn (or the root) ends. + // Tool spans currently open, keyed by call_id. Opened by a tool-call + // response_item and closed by the matching *_output. The owning turn_id is + // tracked so any still-open tool spans are ended when their turn ends. private readonly openTools = new Map(); + // Root enrichment from the SessionStart hook (source/permission_mode), which + // the transcript lacks. Applied when the root span is created, or patched onto + // the root if it already exists. + private rootEnrichment: RootEnrichment = {}; + // Model slug, learned from the first turn_context (session_meta has none). + private model: string | undefined; + // End time (Unix seconds) of the most recently completed turn. Used to end the + // root span at the right time when the session is finalized (on flush), since + // Codex has no session-end hook and the SDK records a span's end time once. + private lastTurnEndTime: number | undefined; + // Reads new transcript content on each hook (the "catch-up"). + private readonly transcriptReader: TranscriptReader; + // Byte offset into the transcript already consumed. In-memory only; if this + // processor is evicted and recreated, the transcript is re-read from 0. + private transcriptOffset = 0; + // Transcript path from the latest hook, so flush() can do a final catch-up + // (the last turn's task_complete can land just after its Stop hook fires). + private transcriptPath: string | null = null; constructor( queueId: string | null, logger: Logger, spanFactoryProvider: SpanFactoryProvider = defaultSpanFactoryProvider, + transcriptReader: TranscriptReader = defaultTranscriptReader, ) { this.queueId = queueId; this.logger = logger; this.spanFactoryProvider = spanFactoryProvider; + this.transcriptReader = transcriptReader; } /** The session's SpanFactory, built from the config event on first use. */ @@ -74,7 +161,16 @@ export class CodexEventProcessor implements EventProcessor { return this.spanFactoryInstance; } - process(event: EnqueueEvent): void { + /** Whether this session reports to Braintrust at all (master switch). */ + private get tracingEnabled(): boolean { + return this.reportingConfig?.traceToBraintrust === true; + } + + // Dispatcher. The config event short-circuits (it configures reporting and + // must run before any spans). Every other hook (a) records any hook-only data + // it carries, (b) catches up the transcript into spans, and on Stop (c) ends + // the root span. Wrapped so transcript work can never break the turn. + async process(event: EnqueueEvent): Promise { if (event.eventName === CODEX_CONFIG_EVENT) { this.configure(event); return; @@ -87,38 +183,131 @@ export class CodexEventProcessor implements EventProcessor { }); return; } + if (event.eventName === SESSION_START) { - this.startRootSpan(event); - return; - } - if (event.eventName === USER_PROMPT_SUBMIT) { - this.startTurnSpan(event); - return; + this.recordRootEnrichment(event); } - if (event.eventName === PRE_TOOL_USE) { - this.startToolSpan(event); - return; + + try { + await this.catchUpTranscript(event); + } catch (err) { + this.logger.error("codex processor: transcript catch-up failed", { + queueId: this.queueId, + eventName: event.eventName, + error: String(err), + }); } - if (event.eventName === POST_TOOL_USE) { - this.endToolSpan(event); - return; + + // End the root span on the first Stop. The catch-up above has already + // processed this turn's task_complete, so lastTurnEndTime is the turn's + // completion time — used as the root's end time so it isn't stamped with a + // late wall-clock value. + if (event.eventName === STOP) { + this.endRootSpan(this.lastTurnEndTime); } + } + + // Read transcript lines appended since our last read and turn each into spans. + // The transcript path comes from the hook payload. On a Stop hook we first + // wait (bounded) for the turn's task_complete to be written, so the turn's + // final records aren't missed due to a write race. Never throws. + private async catchUpTranscript(event: EnqueueEvent): Promise { + const data = (event.eventData ?? {}) as Record; + const path = typeof data.transcript_path === "string" ? data.transcript_path : undefined; + if (path === undefined) return; + this.transcriptPath = path; + + let lines: string[]; + let offset: number; if (event.eventName === STOP) { - this.endTurnSpan(event); - this.endRootSpan(); - return; + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + const result = await this.transcriptReader.waitFor( + path, + this.transcriptOffset, + (line) => this.isTaskCompleteFor(line, turnId), + { timeoutMs: SENTINEL_TIMEOUT_MS, intervalMs: SENTINEL_INTERVAL_MS }, + ); + lines = result.lines; + offset = result.offset; + if (!result.sentinelFound) { + this.logger.warn("codex processor: Stop sentinel not found; emitting partial turn", { + queueId: this.queueId, + turnId, + }); + } + } else { + const result = this.transcriptReader.readFrom(path, this.transcriptOffset); + lines = result.lines; + offset = result.offset; } - // Other events will attach child spans to the root span in later phases. - this.logger.debug("codex processor: no-op", { - queueId: this.queueId, - eventName: event.eventName, - }); + + this.transcriptOffset = offset; + this.consumeLines(lines); + } + + // Parse and process a batch of transcript lines into spans, advancing nothing + // (the caller owns the offset). Used by catch-up and by the final flush read. + private consumeLines(lines: string[]): void { + for (const line of lines) { + const record = parseTranscriptLine(line); + if (record === null) continue; + this.processTranscriptEvent(record); + } + } + + // Cheap check used as the Stop sentinel predicate: is this line a + // task_complete for `turnId` (or any task_complete when turnId is unknown)? + private isTaskCompleteFor(line: string, turnId: string | undefined): boolean { + const record = parseTranscriptLine(line); + if (record === null) return false; + if (record.type !== RECORD_EVENT_MSG || record.payload?.type !== EVT_TASK_COMPLETE) { + return false; + } + if (turnId === undefined) return true; + return record.payload?.turn_id === turnId; } + // Turn one parsed transcript record into span operations. Unknown record types + // are ignored. Never throws; per-record failures are logged. + private processTranscriptEvent(record: TranscriptRecord): void { + try { + if (record.type === RECORD_SESSION_META) { + this.startRootSpan(record); + return; + } + if (record.type === RECORD_TURN_CONTEXT) { + this.noteModel(record); + return; + } + if (record.type === RECORD_EVENT_MSG) { + const ptype = record.payload?.type; + if (ptype === EVT_TASK_STARTED) this.startTurnSpan(record); + else if (ptype === EVT_USER_MESSAGE) this.setTurnInput(record); + else if (ptype === EVT_TASK_COMPLETE) this.endTurnSpan(record); + return; + } + if (record.type === RECORD_RESPONSE_ITEM) { + const ptype = record.payload?.type ?? ""; + if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(record); + else if (TOOL_OUTPUT_TYPES.has(ptype)) this.endToolSpan(record); + return; + } + } catch (err) { + this.logger.error("codex processor: failed to process transcript record", { + queueId: this.queueId, + type: record.type, + payloadType: record.payload?.type, + error: String(err), + }); + } + } + + // ========================================================================== + // Hook-driven configuration / enrichment + // ========================================================================== + // Record the session's reporting config from the config event. The - // SpanFactory is built lazily (on first span) from this config. Arriving - // before the root span is created, this lets the session report to its own - // project/account. + // SpanFactory is built lazily (on first span) from this config. private configure(event: EnqueueEvent): void { if (this.spanFactoryInstance !== null) { this.logger.warn("codex processor: config event after factory built; ignoring", { @@ -149,46 +338,116 @@ export class CodexEventProcessor implements EventProcessor { }); } - /** Whether this session reports to Braintrust at all (master switch). */ - private get tracingEnabled(): boolean { - return this.reportingConfig?.traceToBraintrust === true; + // Capture source/permission_mode from a SessionStart hook (the transcript + // lacks these). If the root span already exists, patch them onto it now. + private recordRootEnrichment(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const source = typeof data.source === "string" ? data.source : undefined; + const permissionMode = + typeof data.permission_mode === "string" ? data.permission_mode : undefined; + if (source !== undefined) this.rootEnrichment.source = source; + if (permissionMode !== undefined) this.rootEnrichment.permissionMode = permissionMode; + + if (this.rootSpan !== null) { + try { + this.rootSpan.log({ + metadata: { source: this.rootEnrichment.source, permission_mode: permissionMode }, + }); + } catch (err) { + this.logger.error("codex processor: failed to enrich root span", { + queueId: this.queueId, + error: String(err), + }); + } + } } - async flush(): Promise { - if (this.rootSpan === null) return; + // ========================================================================== + // Transcript-driven span building + // ========================================================================== + + private startRootSpan(record: TranscriptRecord): void { + if (this.rootSpan !== null) { + this.logger.warn("codex processor: duplicate session_meta; keeping existing root span", { + queueId: this.queueId, + }); + return; + } + const payload = (record.payload ?? {}) as Record; + const cwd = typeof payload.cwd === "string" ? payload.cwd : undefined; + const sessionId = typeof payload.id === "string" ? payload.id : this.queueId; + const startTime = isoToUnixSeconds(record.timestamp); + + const projectDir = projectDirName(cwd); + const spanName = projectDir ? `codex: ${projectDir}` : "codex session"; + try { - await this.rootSpan.flush(); - this.logger.debug("codex processor: flush ok", { queueId: this.queueId }); + this.rootSpan = this.spanFactory.startSpan({ + name: spanName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input: { model: this.model, cwd, source: this.rootEnrichment.source }, + metadata: { + // User-provided extras first, so the standard keys below win. + ...this.reportingConfig?.additionalMetadata, + session_id: sessionId, + model: this.model, + cwd, + source: this.rootEnrichment.source, + permission_mode: this.rootEnrichment.permissionMode, + cli_version: typeof payload.cli_version === "string" ? payload.cli_version : undefined, + project: this.reportingConfig?.project, + }, + }, + }); + this.logger.info("codex processor: opened root span", { + queueId: this.queueId, + spanId: this.rootSpan.id, + }); } catch (err) { - this.logger.error("codex processor: flush failed", { + this.logger.error("codex processor: failed to open root span", { queueId: this.queueId, error: String(err), }); } } - // Open a child span for a turn on UserPromptSubmit. Keyed by turn_id so the - // matching Stop can close it. The user prompt is the span's input. - private startTurnSpan(event: EnqueueEvent): void { + // Learn the model from turn_context (session_meta has none). Backfill it onto + // the root input/metadata the first time we see it. + private noteModel(record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const model = typeof payload.model === "string" ? payload.model : undefined; + if (model === undefined || this.model !== undefined) return; + this.model = model; + if (this.rootSpan !== null) { + try { + this.rootSpan.log({ input: { model }, metadata: { model } }); + } catch (err) { + this.logger.error("codex processor: failed to backfill model on root", { + queueId: this.queueId, + error: String(err), + }); + } + } + } + + // Open a turn span on task_started, child of the root, keyed by turn_id. + private startTurnSpan(record: TranscriptRecord): void { if (this.rootSpan === null) { - this.logger.warn("codex processor: turn without a root span; ignoring", { + this.logger.warn("codex processor: task_started without a root span; ignoring", { queueId: this.queueId, }); return; } - - const data = (event.eventData ?? {}) as Record; - const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - const prompt = typeof data.prompt === "string" ? data.prompt : undefined; - const model = typeof data.model === "string" ? data.model : undefined; - + const payload = (record.payload ?? {}) as Record; + const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined; if (turnId === undefined) { - this.logger.warn("codex processor: UserPromptSubmit without turn_id; skipping turn span", { + this.logger.warn("codex processor: task_started without turn_id; skipping turn span", { queueId: this.queueId, }); return; } - if (this.openTurns.has(turnId)) { this.logger.warn("codex processor: duplicate turn_id; keeping existing turn span", { queueId: this.queueId, @@ -196,15 +455,13 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - + const startTime = isoToUnixSeconds(record.timestamp); try { const turnSpan = this.rootSpan.startSpan({ name: `turn: ${turnId}`, type: "task", - event: { - input: prompt, - metadata: { turn_id: turnId, model }, - }, + ...(startTime !== undefined ? { startTime } : {}), + event: { metadata: { turn_id: turnId, model: this.model } }, }); this.openTurns.set(turnId, turnSpan); this.logger.info("codex processor: opened turn span", { @@ -221,206 +478,245 @@ export class CodexEventProcessor implements EventProcessor { } } - // Open a child span for a tool call on PreToolUse, parented to the turn span - // matching turn_id. Keyed by tool_use_id so the matching PostToolUse can close - // it. The tool_input is the span's input. If the turn span isn't open (which - // shouldn't happen — tool calls always occur within a turn) or there's no - // tool_use_id to pair on, we log a warning and skip rather than guess a parent. - private startToolSpan(event: EnqueueEvent): void { - const data = (event.eventData ?? {}) as Record; - const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - const toolUseId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined; - const toolName = typeof data.tool_name === "string" ? data.tool_name : undefined; - const toolInput = data.tool_input; + // Set the open turn's input from a user_message event (the prompt). The + // user_message carries no turn_id, so it applies to the most recently opened + // still-open turn. + private setTurnInput(record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const prompt = typeof payload.message === "string" ? payload.message : undefined; + if (prompt === undefined) return; + const turnSpan = this.latestOpenTurn(); + if (turnSpan === undefined) { + this.logger.debug("codex processor: user_message with no open turn span", { + queueId: this.queueId, + }); + return; + } + try { + turnSpan.log({ input: prompt }); + } catch (err) { + this.logger.error("codex processor: failed to set turn input", { + queueId: this.queueId, + error: String(err), + }); + } + } - if (toolUseId === undefined) { - this.logger.warn("codex processor: PreToolUse without tool_use_id; skipping tool span", { + // Close the turn span on task_complete, recording the final assistant message + // as output. Also closes any tool spans still open under that turn. + private endTurnSpan(record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined; + const output = + typeof payload.last_agent_message === "string" ? payload.last_agent_message : undefined; + if (turnId === undefined) { + this.logger.debug("codex processor: task_complete without turn_id", { + queueId: this.queueId, + }); + return; + } + const turnSpan = this.openTurns.get(turnId); + if (turnSpan === undefined) { + this.logger.debug("codex processor: task_complete with no open turn span", { queueId: this.queueId, turnId, }); return; } + const endTime = isoToUnixSeconds(record.timestamp); + this.endOpenToolSpansForTurn(turnId, endTime); + try { + turnSpan.log({ output }); + turnSpan.end(endTime !== undefined ? { endTime } : undefined); + if (endTime !== undefined) this.lastTurnEndTime = endTime; + this.logger.info("codex processor: ended turn span", { queueId: this.queueId, turnId }); + } catch (err) { + this.logger.error("codex processor: failed to end turn span", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } finally { + this.openTurns.delete(turnId); + } + } + // Open a tool span (child of its turn) for a tool-call response_item. Keyed by + // call_id so the matching *_output can close it. Skipped (with a warning) when + // there's no call_id or no open turn for its metadata.turn_id. + private startToolSpan(record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; + const name = typeof payload.name === "string" ? payload.name : undefined; + const meta = (payload.metadata ?? {}) as Record; + const turnId = typeof meta.turn_id === "string" ? meta.turn_id : undefined; + // function_call uses `arguments` (a JSON string); custom_tool_call uses `input`. + const input = payload.arguments ?? payload.input; + + if (callId === undefined) { + this.logger.warn("codex processor: tool call without call_id; skipping tool span", { + queueId: this.queueId, + turnId, + }); + return; + } if (turnId === undefined) { - this.logger.warn("codex processor: PreToolUse without turn_id; skipping tool span", { + this.logger.warn("codex processor: tool call without turn_id; skipping tool span", { queueId: this.queueId, - toolUseId, + callId, }); return; } - const turnSpan = this.openTurns.get(turnId); if (turnSpan === undefined) { - this.logger.warn("codex processor: PreToolUse with no open turn span; skipping tool span", { + this.logger.warn("codex processor: tool call with no open turn span; skipping", { queueId: this.queueId, turnId, - toolUseId, + callId, }); return; } - - if (this.openTools.has(toolUseId)) { - this.logger.warn("codex processor: duplicate tool_use_id; keeping existing tool span", { + if (this.openTools.has(callId)) { + this.logger.warn("codex processor: duplicate call_id; keeping existing tool span", { queueId: this.queueId, - toolUseId, + callId, }); return; } - + const startTime = isoToUnixSeconds(record.timestamp); try { const toolSpan = turnSpan.startSpan({ - name: toolName ?? "tool", + name: name ?? "tool", type: "tool", + ...(startTime !== undefined ? { startTime } : {}), event: { - input: toolInput, - metadata: { tool_name: toolName, tool_use_id: toolUseId, turn_id: turnId }, + input, + metadata: { tool_name: name, call_id: callId, turn_id: turnId }, }, }); - this.openTools.set(toolUseId, { span: toolSpan, turnId }); + this.openTools.set(callId, { span: toolSpan, turnId }); this.logger.info("codex processor: opened tool span", { queueId: this.queueId, turnId, - toolUseId, + callId, spanId: toolSpan.id, }); } catch (err) { this.logger.error("codex processor: failed to open tool span", { queueId: this.queueId, turnId, - toolUseId, + callId, error: String(err), }); } } - // Close the tool span matching the PostToolUse's tool_use_id, recording the - // tool_response as the span's output. - private endToolSpan(event: EnqueueEvent): void { - const data = (event.eventData ?? {}) as Record; - const toolUseId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined; - const output = data.tool_response; - - if (toolUseId === undefined) { - this.logger.debug("codex processor: PostToolUse without tool_use_id; no tool span to close", { + // Close the tool span matching a *_output record's call_id, recording the + // output. + private endToolSpan(record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; + const output = payload.output ?? payload.result; + if (callId === undefined) { + this.logger.debug("codex processor: tool output without call_id", { queueId: this.queueId, }); return; } - - const entry = this.openTools.get(toolUseId); + const entry = this.openTools.get(callId); if (entry === undefined) { - this.logger.debug("codex processor: PostToolUse with no open tool span", { + this.logger.debug("codex processor: tool output with no open tool span", { queueId: this.queueId, - toolUseId, + callId, }); return; } - + const endTime = isoToUnixSeconds(record.timestamp); try { entry.span.log({ output }); - entry.span.end(); - this.logger.info("codex processor: ended tool span", { queueId: this.queueId, toolUseId }); + entry.span.end(endTime !== undefined ? { endTime } : undefined); + this.logger.info("codex processor: ended tool span", { queueId: this.queueId, callId }); } catch (err) { this.logger.error("codex processor: failed to end tool span", { queueId: this.queueId, - toolUseId, + callId, error: String(err), }); } finally { - this.openTools.delete(toolUseId); + this.openTools.delete(callId); } } - // End (without output) any still-open tool spans owned by the given turn. A - // tool span can be left open when its PostToolUse never arrives (e.g. the call - // was interrupted by a permission prompt and re-issued under a new - // tool_use_id). Called when the turn ends so dangling tool spans get closed. - private endOpenToolSpansForTurn(turnId: string): void { - for (const [toolUseId, entry] of this.openTools) { + // End any still-open tool spans owned by the given turn (e.g. a call whose + // output never arrived) when the turn ends. Uses the turn's end time so a + // dangling tool span doesn't get a wall-clock end far in the future. + private endOpenToolSpansForTurn(turnId: string, endTime: number | undefined): void { + for (const [callId, entry] of this.openTools) { if (entry.turnId !== turnId) continue; try { - entry.span.end(); + entry.span.end(endTime !== undefined ? { endTime } : undefined); this.logger.info("codex processor: ended dangling tool span on turn end", { queueId: this.queueId, turnId, - toolUseId, + callId, }); } catch (err) { this.logger.error("codex processor: failed to end dangling tool span", { queueId: this.queueId, turnId, - toolUseId, + callId, error: String(err), }); } finally { - this.openTools.delete(toolUseId); + this.openTools.delete(callId); } } } - // Close the turn span matching the Stop's turn_id, recording the final - // assistant message as the span's output. - private endTurnSpan(event: EnqueueEvent): void { - const data = (event.eventData ?? {}) as Record; - const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - const output = - typeof data.last_assistant_message === "string" ? data.last_assistant_message : undefined; - - if (turnId === undefined) { - this.logger.debug("codex processor: Stop without turn_id; no turn span to close", { - queueId: this.queueId, - }); - return; - } - - const turnSpan = this.openTurns.get(turnId); - if (turnSpan === undefined) { - this.logger.debug("codex processor: Stop with no open turn span", { - queueId: this.queueId, - turnId, - }); - return; - } - - // Close any tool spans still open under this turn before ending the turn. - this.endOpenToolSpansForTurn(turnId); - - try { - turnSpan.log({ output }); - turnSpan.end(); - this.logger.info("codex processor: ended turn span", { queueId: this.queueId, turnId }); - } catch (err) { - this.logger.error("codex processor: failed to end turn span", { - queueId: this.queueId, - turnId, - error: String(err), - }); - } finally { - this.openTurns.delete(turnId); - } + // The most recently opened turn that is still open (used to attach a + // user_message, which carries no turn_id). + private latestOpenTurn(): Span | undefined { + let last: Span | undefined; + for (const span of this.openTurns.values()) last = span; + return last; } - // End the root span on the first Stop event. Subsequent Stops are ignored - // (the SDK keeps the first end time anyway). - private endRootSpan(): void { + // End the root span on the first Stop hook. Subsequent Stops are ignored + // (the SDK records a span's end time once). `endTime` is the completing turn's + // end time so the root isn't stamped with a late wall-clock value; falls back + // to the SDK's now() if unknown. Any still-open turn/tool spans are closed at + // the same time so nothing dangles. + private endRootSpan(endTime: number | undefined): void { if (this.rootSpan === null || this.rootEnded) return; this.rootEnded = true; - // Backstop: close any tool spans still open (e.g. ones whose turn never saw - // a matching Stop) so nothing is left dangling when the session ends. - for (const [toolUseId, entry] of this.openTools) { + const endArgs = endTime !== undefined ? { endTime } : undefined; + // Backstop: close any spans still open so nothing dangles. + for (const [callId, entry] of this.openTools) { try { - entry.span.end(); + entry.span.end(endArgs); } catch (err) { this.logger.error("codex processor: failed to end dangling tool span on root end", { queueId: this.queueId, - toolUseId, + callId, error: String(err), }); } } this.openTools.clear(); + for (const [turnId, span] of this.openTurns) { + try { + span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling turn span on root end", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + } + this.openTurns.clear(); try { - this.rootSpan.end(); + this.rootSpan.end(endArgs); this.logger.info("codex processor: ended root span", { queueId: this.queueId }); } catch (err) { this.logger.error("codex processor: failed to end root span", { @@ -430,56 +726,59 @@ export class CodexEventProcessor implements EventProcessor { } } - private startRootSpan(event: EnqueueEvent): void { - if (this.rootSpan !== null) { - this.logger.warn("codex processor: duplicate SessionStart; keeping existing root span", { - queueId: this.queueId, - }); - return; - } - - const data = (event.eventData ?? {}) as Record; - const model = typeof data.model === "string" ? data.model : undefined; - const cwd = typeof data.cwd === "string" ? data.cwd : undefined; - const source = typeof data.source === "string" ? data.source : undefined; - const permissionMode = - typeof data.permission_mode === "string" ? data.permission_mode : undefined; - - // Name the root span after the directory Codex was launched from, e.g. - // "codex: myapp" for /whatever/myapp. Falls back to "codex session" when cwd - // is unknown. - const projectDir = projectDirName(cwd); - const spanName = projectDir ? `codex: ${projectDir}` : "codex session"; - + async flush(): Promise { + if (this.rootSpan === null) return; + // Final catch-up: a turn's task_complete can land slightly after its Stop + // hook fires, so the Stop's bounded wait may miss it and leave the turn open. + // flush() happens after the terminal Stop's /flush (and on idle/eviction). + // Poll the transcript until every open turn has closed (or a bounded timeout + // elapses), processing new records as they appear. This processes the + // transcript (it does not unilaterally end spans), staying consistent with + // "the transcript is the truth". + await this.drainOpenTurns(); try { - this.rootSpan = this.spanFactory.startSpan({ - name: spanName, - type: "task", - event: { - input: { model, cwd, source }, - metadata: { - // User-provided extras first, so the standard keys below win on - // conflict. - ...this.reportingConfig?.additionalMetadata, - session_id: this.queueId, - model, - cwd, - source, - permission_mode: permissionMode, - event_source_version: event.eventSourceVersion, - project: this.reportingConfig?.project, - }, - }, - }); - this.logger.info("codex processor: opened root span", { - queueId: this.queueId, - spanId: this.rootSpan.id, - }); + await this.rootSpan.flush(); + this.logger.debug("codex processor: flush ok", { queueId: this.queueId }); } catch (err) { - this.logger.error("codex processor: failed to open root span", { + this.logger.error("codex processor: flush failed", { queueId: this.queueId, error: String(err), }); } } + + // Read and process new transcript records, retrying on a bounded interval, + // until no turn spans remain open or the timeout elapses. Always does at least + // one read so a task_complete already on disk closes its turn immediately. On + // timeout, leaves whatever is still open for endRootSpan's backstop to close. + private async drainOpenTurns(): Promise { + if (this.transcriptPath === null) return; + const deadline = Date.now() + FLUSH_TIMEOUT_MS; + for (;;) { + try { + const { lines, offset } = this.transcriptReader.readFrom( + this.transcriptPath, + this.transcriptOffset, + ); + this.transcriptOffset = offset; + this.consumeLines(lines); + } catch (err) { + this.logger.error("codex processor: final catch-up read failed", { + queueId: this.queueId, + error: String(err), + }); + return; + } + if (this.openTurns.size === 0 || Date.now() >= deadline) { + if (this.openTurns.size > 0) { + this.logger.warn("codex processor: turns still open at flush timeout", { + queueId: this.queueId, + openTurns: this.openTurns.size, + }); + } + return; + } + await sleep(FLUSH_INTERVAL_MS); + } + } } diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index 3fca991..19dba96 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -1,7 +1,13 @@ -// Codex-specific test helpers: event builders that mimic Codex hook payloads, -// and assertProducesTrace, which runs events through the CodexEventProcessor and -// checks the resulting Braintrust trace. Generic trace/span helpers are imported -// from the shared test-helpers. +// Codex-specific test helpers. Spans are built from transcript records; hooks +// drive lifecycle (config, root enrichment, the Stop sentinel). A test expresses +// both sources in one ordered list. transcript(...) (and the typed builders that +// wrap it) represent lines being appended to the transcript file; hook builders +// (sessionStart, stop, configEvent) represent hook events arriving. +// +// Semantics: a transcript entry buffers a line. The NEXT hook triggers a +// catch-up that reads all buffered-but-unread lines, then handles the hook — +// modeling how the real processor catches up on each hook. A Stop hook waits for +// the turn's task_complete to be present before catching up. import type { EnqueueEvent } from "../../server/routes.ts"; import { @@ -13,40 +19,44 @@ import { } from "../../test-helpers.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; +import type { + TranscriptReader, + TranscriptReadResult, + TranscriptWaitResult, + WaitForOptions, +} from "./transcript-reader.ts"; + +/** Synthetic transcript path attached to hook events so catch-up reads run. */ +export const TEST_TRANSCRIPT_PATH = "/test/transcript.jsonl"; + +/** Monotonic default timestamp so transcript records sort in creation order. */ +let tsCounter = 0; +function nextTimestamp(): string { + tsCounter += 1; + return new Date(Date.UTC(2026, 0, 1, 0, 0, tsCounter)).toISOString(); +} // ============================================================================ -// Event builders +// Hook event builders // ============================================================================ -function codexEvent(eventName: string, eventData: Record): EnqueueEvent { +function codexHook(eventName: string, eventData: Record): EnqueueEvent { const queueId = typeof eventData.session_id === "string" ? eventData.session_id : "session-1"; return { queueId, eventSource: "codex-hook", eventSourceVersion: null, eventName, - eventData: { hook_event_name: eventName, ...eventData }, + eventData: { hook_event_name: eventName, transcript_path: TEST_TRANSCRIPT_PATH, ...eventData }, }; } export function sessionStart(data: Record = {}): EnqueueEvent { - return codexEvent("SessionStart", { session_id: "session-1", ...data }); -} - -export function userPromptSubmit(data: Record = {}): EnqueueEvent { - return codexEvent("UserPromptSubmit", { session_id: "session-1", ...data }); -} - -export function preToolUse(data: Record = {}): EnqueueEvent { - return codexEvent("PreToolUse", { session_id: "session-1", ...data }); -} - -export function postToolUse(data: Record = {}): EnqueueEvent { - return codexEvent("PostToolUse", { session_id: "session-1", ...data }); + return codexHook("SessionStart", { session_id: "session-1", ...data }); } export function stop(data: Record = {}): EnqueueEvent { - return codexEvent("Stop", { session_id: "session-1", ...data }); + return codexHook("Stop", { session_id: "session-1", ...data }); } /** A config event enabling tracing; pass extra reporting config via `data`. */ @@ -61,25 +71,190 @@ export function configEvent(data: Record = {}): EnqueueEvent { } // ============================================================================ -// assertProducesTrace: run events through a CodexEventProcessor and assert the -// resulting trace. One session => one trace. +// Transcript entry builders +// ============================================================================ + +/** Marks a transcript line being written to the (fake) transcript file. */ +export interface TranscriptEntry { + kind: "transcript"; + record: Record; +} + +/** + * Build a raw transcript entry from a full record envelope + * `{ timestamp, type, payload }`. `timestamp` defaults to a monotonic value. + */ +export function transcript(record: Record = {}): TranscriptEntry { + return { + kind: "transcript", + record: { timestamp: nextTimestamp(), ...record }, + }; +} + +/** session_meta record: opens the root span (cwd, session id, cli version). */ +export function sessionMeta(payload: Record = {}): TranscriptEntry { + return transcript({ type: "session_meta", payload: { id: "session-1", ...payload } }); +} + +/** turn_context record: carries the model for a turn. */ +export function turnContext(payload: Record = {}): TranscriptEntry { + return transcript({ type: "turn_context", payload }); +} + +/** task_started event: opens a turn span (by turn_id). */ +export function taskStarted(payload: Record = {}): TranscriptEntry { + return transcript({ type: "event_msg", payload: { type: "task_started", ...payload } }); +} + +/** user_message event: the prompt; sets the open turn's input. */ +export function userMessage(payload: Record = {}): TranscriptEntry { + return transcript({ type: "event_msg", payload: { type: "user_message", ...payload } }); +} + +/** task_complete event: closes a turn span (output = last_agent_message). */ +export function taskComplete(payload: Record = {}): TranscriptEntry { + return transcript({ type: "event_msg", payload: { type: "task_complete", ...payload } }); +} + +/** function_call response_item: opens a tool span (by call_id, under turn). */ +export function functionCall(payload: Record = {}): TranscriptEntry { + const { turn_id, metadata, ...rest } = payload as { + turn_id?: string; + metadata?: Record; + } & Record; + return transcript({ + type: "response_item", + payload: { + type: "function_call", + ...(turn_id !== undefined ? { metadata: { turn_id, ...metadata } } : { metadata }), + ...rest, + }, + }); +} + +/** function_call_output response_item: closes a tool span (by call_id). */ +export function functionCallOutput(payload: Record = {}): TranscriptEntry { + return transcript({ + type: "response_item", + payload: { type: "function_call_output", ...payload }, + }); +} + +/** custom_tool_call response_item (e.g. apply_patch): opens a tool span. */ +export function customToolCall(payload: Record = {}): TranscriptEntry { + const { turn_id, metadata, ...rest } = payload as { + turn_id?: string; + metadata?: Record; + } & Record; + return transcript({ + type: "response_item", + payload: { + type: "custom_tool_call", + ...(turn_id !== undefined ? { metadata: { turn_id, ...metadata } } : { metadata }), + ...rest, + }, + }); +} + +/** custom_tool_call_output response_item: closes a tool span (by call_id). */ +export function customToolCallOutput(payload: Record = {}): TranscriptEntry { + return transcript({ + type: "response_item", + payload: { type: "custom_tool_call_output", ...payload }, + }); +} + +/** One entry in a mixed trace list: a hook event or a transcript write. */ +export type TraceEntry = EnqueueEvent | TranscriptEntry; + +function isTranscriptEntry(entry: TraceEntry): entry is TranscriptEntry { + return (entry as TranscriptEntry).kind === "transcript"; +} + +// ============================================================================ +// FakeTranscriptReader: an in-memory transcript fed by transcript(...) entries. +// ============================================================================ + +/** + * Models the transcript file as a growing byte buffer. `append` writes a + * complete line; `readFrom` returns complete lines from the given offset. + * `waitFor` resolves immediately from the current buffer (the harness has + * already written everything available before the hook runs), so tests stay + * synchronous and don't actually sleep. + */ +export class FakeTranscriptReader implements TranscriptReader { + private buffer = ""; + /** Offsets passed to readFrom/waitFor, in call order (for assertions). */ + readonly readOffsets: number[] = []; + + append(line: string): void { + this.buffer += `${line}\n`; + } + + readFrom(_path: string, offset: number): TranscriptReadResult { + this.readOffsets.push(offset); + const from = offset > this.buffer.length ? 0 : offset; + const slice = this.buffer.slice(from); + const lines: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + lines.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + const sentinelFound = lines.some(predicate); + return { lines, offset: next, sentinelFound }; + } +} + +// ============================================================================ +// assertProducesTrace: run a mixed list of hook events and transcript writes +// through a CodexEventProcessor and assert the resulting trace. // ============================================================================ export async function assertProducesTrace( - events: EnqueueEvent[], + entries: TraceEntry[], expected: ExpectedSpan, - opts: { queueId?: string } = {}, + opts: { queueId?: string; reader?: FakeTranscriptReader } = {}, ): Promise { - const queueId = opts.queueId ?? events[0]?.queueId ?? "session-1"; + const hookEvents = entries.filter((e): e is EnqueueEvent => !isTranscriptEntry(e)); + const queueId = opts.queueId ?? hookEvents[0]?.queueId ?? "session-1"; + // Tracing is off by default; prepend a tracing-enabled config event unless the // caller already provided one, so trace assertions exercise the span path. - const hasConfig = events.some((e) => e.eventName === CODEX_CONFIG_EVENT); - const toProcess = hasConfig ? events : [configEvent({ session_id: queueId }), ...events]; + const hasConfig = hookEvents.some((e) => e.eventName === CODEX_CONFIG_EVENT); + const toRun: TraceEntry[] = hasConfig + ? entries + : [configEvent({ session_id: queueId }), ...entries]; + + const reader = opts.reader ?? new FakeTranscriptReader(); const trace = withCapturedTrace(); try { - const processor = new CodexEventProcessor(queueId, createTestLogger(), () => trace.spanFactory); - for (const event of toProcess) { - await processor.process(event); + const processor = new CodexEventProcessor( + queueId, + createTestLogger(), + () => trace.spanFactory, + reader, + ); + for (const entry of toRun) { + if (isTranscriptEntry(entry)) { + reader.append(JSON.stringify(entry.record)); + } else { + await processor.process(entry); + } } await processor.flush(); diff --git a/plugins/trace-codex/src/agents/codex/transcript-reader.ts b/plugins/trace-codex/src/agents/codex/transcript-reader.ts new file mode 100644 index 0000000..ef2ad70 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/transcript-reader.ts @@ -0,0 +1,155 @@ +// Reads new content from a Codex transcript file starting at a byte offset. +// +// The processor "catches up" on the transcript whenever a hook arrives: it asks +// the reader for everything appended since the last read, parses the new lines, +// and advances its offset. This interface is the seam that keeps the processor +// free of direct filesystem coupling — production reads the real file, while +// unit tests inject a fake fed by `transcript(...)` test entries. +// +// On a terminal hook (Stop) the processor must not race the transcript writer: +// the turn's closing record may not be flushed to disk yet. `waitFor` polls the +// file, bounded by a timeout, until a sentinel line appears (or it gives up). +// This is the one deliberate blocking point, and it always returns rather than +// hanging the turn. + +import { readFileSync, statSync } from "node:fs"; +import type { Logger } from "../../log.ts"; + +/** Result of a read: the complete new lines and the advanced byte offset. */ +export interface TranscriptReadResult { + /** Complete lines (newline-terminated in the file) read since `offset`. */ + lines: string[]; + /** + * The new offset to read from next time. A partial trailing line (no + * newline yet) is NOT consumed and not advanced past, so it is re-read once + * it is fully written. + */ + offset: number; +} + +/** Options bounding a `waitFor` poll. */ +export interface WaitForOptions { + /** Give up after this many milliseconds total. */ + timeoutMs: number; + /** Delay between polls. */ + intervalMs: number; +} + +/** Result of waiting: the lines read so far and whether the sentinel was seen. */ +export interface TranscriptWaitResult extends TranscriptReadResult { + /** True if `predicate` matched one of the read lines before the timeout. */ + sentinelFound: boolean; +} + +export interface TranscriptReader { + /** + * Read complete lines from `path` starting at byte `offset`. Returns the new + * lines plus the offset just past the last newline consumed. Must never + * throw: on a missing/unreadable file it returns no lines and the same + * offset, so the next hook simply tries again. + */ + readFrom(path: string, offset: number): TranscriptReadResult; + + /** + * Read from `offset`, then keep polling until `predicate` matches one of the + * cumulative lines read or the timeout elapses. Returns all lines read across + * polls, the advanced offset, and whether the sentinel was found. Never + * throws and always returns within the timeout, so the caller can fail soft + * (emit whatever exists) rather than stall the turn. + */ + waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + opts: WaitForOptions, + ): Promise; +} + +/** + * Split a buffer into complete lines (those ending in "\n") and report how many + * bytes were consumed. A trailing fragment without a newline is left for the + * next read. + */ +function splitCompleteLines(buf: Buffer): { lines: string[]; consumed: number } { + const lines: string[] = []; + let lineStart = 0; + let consumed = 0; + for (let i = 0; i < buf.length; i++) { + if (buf[i] === 0x0a /* \n */) { + lines.push(buf.toString("utf8", lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines, consumed }; +} + +const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Production reader: reads the file, slices from `offset`, and returns complete + * lines plus the advanced offset. Reads the whole file and slices for + * simplicity; transcripts are modest and this runs off the hook hot path. + */ +export function createFileTranscriptReader(diagLogger?: Logger): TranscriptReader { + function readFrom(path: string, offset: number): TranscriptReadResult { + try { + const size = statSync(path).size; + // File shrank or truncated (e.g. replaced): reset to start. + const from = offset > size ? 0 : offset; + if (from >= size) { + return { lines: [], offset: from }; + } + const buf = readFileSync(path); + const slice = buf.subarray(from); + const { lines, consumed } = splitCompleteLines(slice); + return { lines, offset: from + consumed }; + } catch (err) { + diagLogger?.debug("transcript reader: read failed; skipping", { + path, + offset, + error: String(err), + }); + return { lines: [], offset }; + } + } + + async function waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + opts: WaitForOptions, + ): Promise { + const deadline = Date.now() + opts.timeoutMs; + const allLines: string[] = []; + let cursor = offset; + let sentinelFound = false; + + // Poll until the sentinel appears or we run out of time. Always does at + // least one read so a sentinel already on disk is found immediately. + for (;;) { + const { lines, offset: next } = readFrom(path, cursor); + cursor = next; + for (const line of lines) { + allLines.push(line); + if (predicate(line)) sentinelFound = true; + } + if (sentinelFound || Date.now() >= deadline) break; + await sleep(opts.intervalMs); + } + + if (!sentinelFound) { + diagLogger?.debug("transcript reader: sentinel not found before timeout", { + path, + offset, + timeoutMs: opts.timeoutMs, + }); + } + return { lines: allLines, offset: cursor, sentinelFound }; + } + + return { readFrom, waitFor }; +} + +/** Default production reader (no diagnostics). */ +export const defaultTranscriptReader: TranscriptReader = createFileTranscriptReader(); diff --git a/plugins/trace-codex/src/agents/codex/transcript.test.ts b/plugins/trace-codex/src/agents/codex/transcript.test.ts new file mode 100644 index 0000000..c49404b --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/transcript.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, test } from "bun:test"; +import { parseTranscriptLine } from "./transcript.ts"; + +describe("parseTranscriptLine", () => { + test("parses a well-formed record envelope", () => { + const line = JSON.stringify({ + timestamp: "2026-06-22T04:50:25.725Z", + type: "response_item", + payload: { type: "message", role: "assistant", content: "hi" }, + }); + const record = parseTranscriptLine(line); + expect(record).not.toBeNull(); + expect(record?.type).toBe("response_item"); + expect(record?.payload?.type).toBe("message"); + expect(record?.timestamp).toBe("2026-06-22T04:50:25.725Z"); + }); + + test("preserves unknown record types (forward compatible)", () => { + const line = JSON.stringify({ type: "some_future_type", payload: { type: "whatever" } }); + const record = parseTranscriptLine(line); + expect(record?.type).toBe("some_future_type"); + expect(record?.payload?.type).toBe("whatever"); + }); + + test("preserves extra top-level fields", () => { + const line = JSON.stringify({ type: "compacted", payload: {}, window_id: 42 }); + const record = parseTranscriptLine(line); + expect((record as Record)?.window_id).toBe(42); + }); + + test("returns null for a blank line", () => { + expect(parseTranscriptLine("")).toBeNull(); + expect(parseTranscriptLine(" ")).toBeNull(); + }); + + test("returns null (no throw) for malformed JSON", () => { + expect(parseTranscriptLine('{"type":"response_item"')).toBeNull(); + expect(parseTranscriptLine("not json at all")).toBeNull(); + }); + + test("returns null for JSON that isn't an object", () => { + expect(parseTranscriptLine("[1,2,3]")).toBeNull(); + expect(parseTranscriptLine("42")).toBeNull(); + expect(parseTranscriptLine('"a string"')).toBeNull(); + expect(parseTranscriptLine("null")).toBeNull(); + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/transcript.ts b/plugins/trace-codex/src/agents/codex/transcript.ts new file mode 100644 index 0000000..e1d1a76 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/transcript.ts @@ -0,0 +1,81 @@ +// Pure parser/types for Codex transcript ("rollout") records. +// +// Codex writes an append-only JSONL transcript per session at the path every +// hook payload reports as `transcript_path`. Each line is one record with a +// stable envelope: `{ timestamp, type, payload }`. The transcript is the only +// source for LLM calls, token usage, and the true execution order of LLM and +// tool calls within a turn. +// +// This module only parses lines into typed records; it builds no spans and has +// no Braintrust dependency. The format is explicitly NOT a stable interface +// (Codex may change it across versions), so parsing is permissive and +// fail-soft: unknown record types are preserved as-is, and a malformed line +// yields null rather than throwing. + +/** + * The common envelope every transcript line shares. `payload` is intentionally + * loose: we narrow it per `type`/`payload.type` only where we need to, and we + * tolerate shapes we don't recognize. + */ +export interface TranscriptRecord { + /** ISO-8601 timestamp Codex wrote the record (its clock). */ + timestamp?: string; + /** Top-level record kind, e.g. "response_item", "event_msg", "turn_context". */ + type?: string; + /** Record-specific body. Narrow via `payload.type` for response_item/event_msg. */ + payload?: TranscriptPayload; + /** Anything else Codex includes; preserved for forward compatibility. */ + [key: string]: unknown; +} + +/** Loose payload shape. `type` discriminates response_item / event_msg subtypes. */ +export interface TranscriptPayload { + type?: string; + [key: string]: unknown; +} + +// Known top-level record types. Kept as constants (not a closed union) so +// unknown types still parse and flow through as plain records. +export const RECORD_SESSION_META = "session_meta"; +export const RECORD_TURN_CONTEXT = "turn_context"; +export const RECORD_RESPONSE_ITEM = "response_item"; +export const RECORD_EVENT_MSG = "event_msg"; +export const RECORD_COMPACTED = "compacted"; + +// Known response_item payload subtypes. +export const ITEM_MESSAGE = "message"; +export const ITEM_REASONING = "reasoning"; +export const ITEM_FUNCTION_CALL = "function_call"; +export const ITEM_FUNCTION_CALL_OUTPUT = "function_call_output"; +export const ITEM_CUSTOM_TOOL_CALL = "custom_tool_call"; +export const ITEM_CUSTOM_TOOL_CALL_OUTPUT = "custom_tool_call_output"; +export const ITEM_TOOL_SEARCH_CALL = "tool_search_call"; +export const ITEM_TOOL_SEARCH_OUTPUT = "tool_search_output"; + +// Known event_msg payload subtypes. +export const EVT_TOKEN_COUNT = "token_count"; +export const EVT_AGENT_MESSAGE = "agent_message"; +export const EVT_USER_MESSAGE = "user_message"; +export const EVT_TASK_STARTED = "task_started"; +export const EVT_TASK_COMPLETE = "task_complete"; +export const EVT_CONTEXT_COMPACTED = "context_compacted"; + +/** + * Parse a single transcript line into a TranscriptRecord. Returns null for + * blank lines and for anything that isn't a JSON object, so callers can skip it + * without breaking the session. Never throws. + */ +export function parseTranscriptLine(line: string): TranscriptRecord | null { + const trimmed = line.trim(); + if (trimmed.length === 0) return null; + try { + const parsed = JSON.parse(trimmed) as unknown; + if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) { + return null; + } + return parsed as TranscriptRecord; + } catch { + // Malformed JSON (e.g. a partially-written line): skip it. + return null; + } +} diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts index 639d4a4..a5de277 100644 --- a/plugins/trace-codex/src/test-helpers.ts +++ b/plugins/trace-codex/src/test-helpers.ts @@ -190,6 +190,8 @@ export interface ExpectedSpan { metadata?: Record; /** If set, assert whether the span has an end time (true) or not (false). */ ended?: boolean; + /** If set, assert exact start/end metric values (Unix seconds). */ + metrics?: { start?: number; end?: number }; /** Exact list of children (length and order are checked). */ children?: ExpectedSpan[]; } @@ -244,6 +246,18 @@ export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: diffs.push(`${path}.ended: expected ${expected.ended}, got ${isEnded}`); } } + if (expected.metrics !== undefined) { + if (expected.metrics.start !== undefined && actual.metrics?.start !== expected.metrics.start) { + diffs.push( + `${path}.metrics.start: expected ${expected.metrics.start}, got ${String(actual.metrics?.start)}`, + ); + } + if (expected.metrics.end !== undefined && actual.metrics?.end !== expected.metrics.end) { + diffs.push( + `${path}.metrics.end: expected ${expected.metrics.end}, got ${String(actual.metrics?.end)}`, + ); + } + } if (expected.children !== undefined) { if (actual.children.length !== expected.children.length) { diffs.push( From 7874cd8cc220a1faab824bff5711a789bf5875b7 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 12:53:44 -0600 Subject: [PATCH 05/17] make flush blocking configurable --- plugins/trace-codex/README.md | 1 + plugins/trace-codex/config.json.example | 1 + .../src/agents/codex/event-processor.test.ts | 9 +- .../src/agents/codex/event-processor.ts | 121 ++++++++++++------ .../src/agents/codex/settings.test.ts | 17 +++ .../trace-codex/src/agents/codex/settings.ts | 14 +- plugins/trace-codex/src/client/client.ts | 28 ++-- plugins/trace-codex/src/index.ts | 9 ++ .../trace-codex/src/server/enqueue-client.ts | 27 ++-- plugins/trace-codex/src/server/routes.test.ts | 48 ++++++- plugins/trace-codex/src/server/routes.ts | 39 ++++-- plugins/trace-codex/src/server/server.ts | 1 + 12 files changed, 235 insertions(+), 80 deletions(-) diff --git a/plugins/trace-codex/README.md b/plugins/trace-codex/README.md index ddd21d9..c7aa39b 100644 --- a/plugins/trace-codex/README.md +++ b/plugins/trace-codex/README.md @@ -30,6 +30,7 @@ Every setting can be provided as a `config.json` key or as an environment variab | `project` | `BRAINTRUST_PROJECT` | _(unset)_ | Project to log traces into. | | `apiUrl` | `BRAINTRUST_API_URL` | api.braintrust.dev | Braintrust API URL. | | `additionalMetadata` | `BRAINTRUST_ADDITIONAL_METADATA` | _(unset)_ | JSON object of extra metadata merged into the root span. Standard keys (`session_id`, `model`, `project`, etc.) take precedence on conflict. | +| `blockOnStop` | `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` | `false` | When `true`, the hook blocks on each turn's `Stop` until the server confirms all spans are flushed. Use in programmatic/CI runs to guarantee traces are delivered before Codex exits. | | `recordFile` | `BRAINTRUST_EVENT_SERVER_RECORD_FILE` | _(unset)_ | If set, record every event to this NDJSON file (for `replay`). | ### Advanced Options diff --git a/plugins/trace-codex/config.json.example b/plugins/trace-codex/config.json.example index 8f05530..c998275 100644 --- a/plugins/trace-codex/config.json.example +++ b/plugins/trace-codex/config.json.example @@ -6,6 +6,7 @@ "appUrl": "https://www.braintrust.dev", "project": "my-codex-project", "additionalMetadata": { "team": "platform", "env": "dev" }, + "blockOnStop": false, "recordFile": "", "port": 52734, "idleTimeoutMs": 300000, diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 0f5cac8..789b653 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -633,10 +633,11 @@ describe("CodexEventProcessor: transcript catch-up plumbing", () => { ); }); - test("flush polls the transcript until an open turn's task_complete appears", async () => { - // A reader that withholds its last appended line until a later read, modeling - // a task_complete that lands after the terminal Stop. flush() must poll until - // it appears and then close the turn. + test("a turn whose task_complete lands after its Stop is closed by the bounded wait", async () => { + // Regression for the live bug: Codex writes task_complete seconds after the + // Stop hook fires. The Stop's initial read doesn't see it (turn stays open), + // so the bounded waitFor must keep reading until it appears, then close the + // turn — with the transcript-derived end time, not a wall-clock value. class DelayedRevealReader implements TranscriptReader { private lines: string[] = []; private withheld: string | null = null; diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index d215f91..a94576f 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -58,16 +58,19 @@ const SESSION_START = "SessionStart"; const STOP = "Stop"; // Bound on how long a Stop hook waits for the turn's task_complete to appear in -// the transcript before giving up and emitting whatever exists. Small so a slow -// or stuck writer can never stall the Codex turn. -const SENTINEL_TIMEOUT_MS = 2000; +// the transcript before giving up and emitting whatever exists. Codex can write +// task_complete several seconds after firing the Stop hook, so this is generous; +// the wait returns as soon as the sentinel is seen, and only the last turn of a +// session ever relies on it (earlier turns are closed by the next hook's +// catch-up). Still bounded so a stuck writer can never stall the Codex turn. +const SENTINEL_TIMEOUT_MS = 10_000; const SENTINEL_INTERVAL_MS = 25; // Bound on how long flush() polls the transcript for still-open turns to close -// (their task_complete can land slightly after the terminal Stop). Larger than -// the per-hook wait since flush is off the Codex hot path, but still bounded so -// a stuck writer can't hang shutdown. -const FLUSH_TIMEOUT_MS = 5000; +// (their task_complete can land seconds after the terminal Stop). flush() is off +// the Codex hot path, so this is generous; it returns immediately once no turns +// remain open, and is still bounded so a stuck writer can't hang shutdown. +const FLUSH_TIMEOUT_MS = 15_000; const FLUSH_INTERVAL_MS = 50; const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); @@ -118,6 +121,11 @@ export class CodexEventProcessor implements EventProcessor { // Turn spans currently open, keyed by turn_id. A turn is opened by // task_started and closed by the matching task_complete. private readonly openTurns = new Map(); + // turn_ids whose Stop hook has fired but whose task_complete hasn't been seen + // yet (Codex can write task_complete seconds after the Stop hook). flush() + // only blocks-and-polls when one of these is still open; a turn that is merely + // mid-progress (no Stop yet) does not make an idle flush wait. + private readonly turnsAwaitingCompletion = new Set(); // Tool spans currently open, keyed by call_id. Opened by a tool-call // response_item and closed by the matching *_output. The owning turn_id is // tracked so any still-open tool spans are ended when their turn ends. @@ -188,6 +196,14 @@ export class CodexEventProcessor implements EventProcessor { this.recordRootEnrichment(event); } + // A Stop signals its turn is finishing; mark it so flush knows to wait for + // this turn's task_complete (which Codex may write after the Stop hook). + if (event.eventName === STOP) { + const data = (event.eventData ?? {}) as Record; + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + if (turnId !== undefined) this.turnsAwaitingCompletion.add(turnId); + } + try { await this.catchUpTranscript(event); } catch (err) { @@ -208,41 +224,44 @@ export class CodexEventProcessor implements EventProcessor { } // Read transcript lines appended since our last read and turn each into spans. - // The transcript path comes from the hook payload. On a Stop hook we first - // wait (bounded) for the turn's task_complete to be written, so the turn's - // final records aren't missed due to a write race. Never throws. + // The transcript path comes from the hook payload. Never throws. + // + // On a Stop, first consume whatever's already on disk. If that closes the + // turn, we're done. Otherwise Codex hasn't written this turn's task_complete + // yet (it can lag the Stop hook by seconds), so wait (bounded) for it — but + // only then, so a Stop whose turn is already complete returns immediately. private async catchUpTranscript(event: EnqueueEvent): Promise { const data = (event.eventData ?? {}) as Record; const path = typeof data.transcript_path === "string" ? data.transcript_path : undefined; if (path === undefined) return; this.transcriptPath = path; - let lines: string[]; - let offset: number; - if (event.eventName === STOP) { - const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - const result = await this.transcriptReader.waitFor( - path, - this.transcriptOffset, - (line) => this.isTaskCompleteFor(line, turnId), - { timeoutMs: SENTINEL_TIMEOUT_MS, intervalMs: SENTINEL_INTERVAL_MS }, - ); - lines = result.lines; - offset = result.offset; - if (!result.sentinelFound) { - this.logger.warn("codex processor: Stop sentinel not found; emitting partial turn", { - queueId: this.queueId, - turnId, - }); - } - } else { - const result = this.transcriptReader.readFrom(path, this.transcriptOffset); - lines = result.lines; - offset = result.offset; + // Initial read of everything available now. + const initial = this.transcriptReader.readFrom(path, this.transcriptOffset); + this.transcriptOffset = initial.offset; + this.consumeLines(initial.lines); + + if (event.eventName !== STOP) return; + + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + // If the turn already closed during the initial read, no wait is needed. + if (turnId === undefined || !this.openTurns.has(turnId)) return; + + // Wait (bounded) for this turn's task_complete to be written. + const result = await this.transcriptReader.waitFor( + path, + this.transcriptOffset, + (line) => this.isTaskCompleteFor(line, turnId), + { timeoutMs: SENTINEL_TIMEOUT_MS, intervalMs: SENTINEL_INTERVAL_MS }, + ); + this.transcriptOffset = result.offset; + this.consumeLines(result.lines); + if (!result.sentinelFound) { + this.logger.warn("codex processor: Stop sentinel not found; emitting partial turn", { + queueId: this.queueId, + turnId, + }); } - - this.transcriptOffset = offset; - this.consumeLines(lines); } // Parse and process a batch of transcript lines into spans, advancing nothing @@ -538,6 +557,7 @@ export class CodexEventProcessor implements EventProcessor { }); } finally { this.openTurns.delete(turnId); + this.turnsAwaitingCompletion.delete(turnId); } } @@ -681,6 +701,16 @@ export class CodexEventProcessor implements EventProcessor { return last; } + // Number of turns whose Stop has fired but which are still open (awaiting + // their task_complete). flush() polls only while this is > 0. + private countPendingTurns(): number { + let n = 0; + for (const turnId of this.turnsAwaitingCompletion) { + if (this.openTurns.has(turnId)) n += 1; + } + return n; + } + // End the root span on the first Stop hook. Subsequent Stops are ignored // (the SDK records a span's end time once). `endTime` is the completing turn's // end time so the root isn't stamped with a late wall-clock value; falls back @@ -748,9 +778,13 @@ export class CodexEventProcessor implements EventProcessor { } // Read and process new transcript records, retrying on a bounded interval, - // until no turn spans remain open or the timeout elapses. Always does at least - // one read so a task_complete already on disk closes its turn immediately. On - // timeout, leaves whatever is still open for endRootSpan's backstop to close. + // until no turn that has seen its Stop is still open (its task_complete can + // land seconds after the Stop hook) or the timeout elapses. Always does at + // least one read so a task_complete already on disk closes its turn + // immediately. Crucially, this does NOT wait for turns that are merely + // mid-progress (no Stop yet) — so an idle flush during an active turn returns + // promptly instead of blocking. On timeout, leaves whatever is still open for + // endRootSpan's backstop to close. private async drainOpenTurns(): Promise { if (this.transcriptPath === null) return; const deadline = Date.now() + FLUSH_TIMEOUT_MS; @@ -769,11 +803,14 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - if (this.openTurns.size === 0 || Date.now() >= deadline) { - if (this.openTurns.size > 0) { - this.logger.warn("codex processor: turns still open at flush timeout", { + // Only turns whose Stop has fired but whose task_complete hasn't arrived + // keep us waiting. (endTurnSpan removes them from both sets as they close.) + const pending = this.countPendingTurns(); + if (pending === 0 || Date.now() >= deadline) { + if (pending > 0) { + this.logger.warn("codex processor: turns awaiting completion at flush timeout", { queueId: this.queueId, - openTurns: this.openTurns.size, + pending, }); } return; diff --git a/plugins/trace-codex/src/agents/codex/settings.test.ts b/plugins/trace-codex/src/agents/codex/settings.test.ts index 1ace67b..afa5bfe 100644 --- a/plugins/trace-codex/src/agents/codex/settings.test.ts +++ b/plugins/trace-codex/src/agents/codex/settings.test.ts @@ -70,6 +70,16 @@ describe("loadSettingsFile", () => { expect(loadSettingsFile(path)).toEqual({}); }); + test("parses blockOnStop boolean; ignores non-boolean", () => { + expect(loadSettingsFile(write(JSON.stringify({ blockOnStop: true })))).toEqual({ + blockOnStop: true, + }); + expect(loadSettingsFile(write(JSON.stringify({ blockOnStop: false })))).toEqual({ + blockOnStop: false, + }); + expect(loadSettingsFile(write(JSON.stringify({ blockOnStop: "yes" })))).toEqual({}); + }); + test("missing file returns {}", () => { expect(loadSettingsFile(join(dir, "absent.json"))).toEqual({}); }); @@ -125,6 +135,13 @@ describe("applySettingsToEnv", () => { expect(env.TRACE_TO_BRAINTRUST).toBe("true"); expect(env.BRAINTRUST_ADDITIONAL_METADATA).toBe('{"team":"platform"}'); }); + + test("maps blockOnStop to BRAINTRUST_PLUGIN_BLOCK_ON_STOP", () => { + const env: NodeJS.ProcessEnv = {}; + const applied = applySettingsToEnv({ blockOnStop: true }, env); + expect(env.BRAINTRUST_PLUGIN_BLOCK_ON_STOP).toBe("true"); + expect(applied).toEqual(["blockOnStop"]); + }); }); describe("settingsFilePath", () => { diff --git a/plugins/trace-codex/src/agents/codex/settings.ts b/plugins/trace-codex/src/agents/codex/settings.ts index f1cc7f0..cb64a32 100644 --- a/plugins/trace-codex/src/agents/codex/settings.ts +++ b/plugins/trace-codex/src/agents/codex/settings.ts @@ -25,6 +25,13 @@ export interface Settings { project?: string; /** Master switch: when false or unset, no traces are reported to Braintrust. */ traceToBraintrust?: boolean; + /** + * When true, the hook blocks on the Stop event until the server confirms all + * events are processed and spans are flushed. Useful in programmatic/CI runs + * to guarantee traces are delivered before Codex exits. Defaults to false + * (fire-and-forget flush so the turn isn't stalled). + */ + blockOnStop?: boolean; /** Extra metadata merged into the root span (standard keys win on conflict). */ additionalMetadata?: Record; /** If set, record every event to this NDJSON file (for replay). */ @@ -44,6 +51,7 @@ export const SETTINGS_TO_ENV: Record = { appUrl: "BRAINTRUST_APP_URL", project: "BRAINTRUST_PROJECT", traceToBraintrust: "TRACE_TO_BRAINTRUST", + blockOnStop: "BRAINTRUST_PLUGIN_BLOCK_ON_STOP", additionalMetadata: "BRAINTRUST_ADDITIONAL_METADATA", recordFile: "BRAINTRUST_EVENT_SERVER_RECORD_FILE", port: "BRAINTRUST_EVENT_SERVER_PORT", @@ -55,6 +63,8 @@ const SETTING_KEYS = Object.keys(SETTINGS_TO_ENV) as Array; const NUMBER_KEYS = new Set(["port", "idleTimeoutMs", "idleCheckIntervalMs"]); +const BOOLEAN_KEYS = new Set(["traceToBraintrust", "blockOnStop"]); + /** * The plugin's writable data directory, where config.json lives. Resolved * independently of the server Config layer so settings detection stays its own @@ -104,8 +114,8 @@ export function loadSettingsFile(path: string): Settings { if (typeof value === "number" && Number.isFinite(value)) { (settings as Record)[key] = value; } - } else if (key === "traceToBraintrust") { - if (typeof value === "boolean") settings.traceToBraintrust = value; + } else if (BOOLEAN_KEYS.has(key)) { + if (typeof value === "boolean") (settings as Record)[key] = value; } else if (key === "additionalMetadata") { if (typeof value === "object" && !Array.isArray(value)) { settings.additionalMetadata = value as Record; diff --git a/plugins/trace-codex/src/client/client.ts b/plugins/trace-codex/src/client/client.ts index f4b1735..e918823 100644 --- a/plugins/trace-codex/src/client/client.ts +++ b/plugins/trace-codex/src/client/client.ts @@ -34,11 +34,18 @@ async function readStdin(): Promise { export interface HookClientOptions { /** * Event names that terminate a turn/session. After enqueuing one of these, - * the client asks the server to flush synchronously (POST /flush) so the - * final spans are delivered before this process — and any background server — - * is torn down. Defaults to none (fire-and-forget). + * the client asks the server to flush (POST /flush) so buffered spans are + * delivered. Defaults to none (fire-and-forget). */ terminalEvents?: readonly string[]; + /** + * Whether the terminal-event flush blocks until the server confirms all + * events are processed and spans are delivered (block=true), versus a + * fire-and-forget signal that returns immediately (block=false). Defaults to + * false; set true (via BRAINTRUST_PLUGIN_BLOCK_ON_STOP) to guarantee delivery + * before Codex exits, e.g. in programmatic/CI runs. + */ + blockOnStop?: boolean; } export async function runHookClient( @@ -79,12 +86,15 @@ export async function runHookClient( } } - // On a terminal event, block until the server confirms the queue has drained - // and buffered spans are flushed. Without this, a CI job (or any short-lived - // host) can end right after the agent's last turn — before the background - // server's idle timeout fires — and lose the final spans. + // On a terminal event, flush. When blockOnStop is set, block until the server + // confirms the queue has drained and buffered spans are delivered — important + // for a CI job (or any short-lived host) that ends right after the agent's + // last turn, before the background server's idle timeout fires, which would + // otherwise lose the final spans. Otherwise fire-and-forget so the turn isn't + // stalled; the background server delivers spans on its own. if (sawTerminal) { - const flushed = await postFlush(config, logger); - logger.debug("flush requested on terminal event", { flushed }); + const block = options.blockOnStop ?? false; + const flushed = await postFlush(config, logger, { block }); + logger.debug("flush requested on terminal event", { flushed, block }); } } diff --git a/plugins/trace-codex/src/index.ts b/plugins/trace-codex/src/index.ts index d5be092..d6f5ce1 100644 --- a/plugins/trace-codex/src/index.ts +++ b/plugins/trace-codex/src/index.ts @@ -22,6 +22,13 @@ function agentFactories(): Map { return new Map(AGENTS.map((a) => [a.eventSource, a.createProcessor])); } +/** Parse a boolean env var: true only for "true"/"1" (case-insensitive). */ +function parseBoolEnv(value: string | undefined): boolean { + if (!value) return false; + const v = value.trim().toLowerCase(); + return v === "true" || v === "1"; +} + async function main(): Promise { const mode = process.argv[2] ?? "hook"; @@ -64,6 +71,8 @@ async function main(): Promise { try { await runHookClient(config, logger, agent.buildEvents, { terminalEvents: agent.terminalEvents, + // Settings (above) have mapped BRAINTRUST_PLUGIN_BLOCK_ON_STOP into env. + blockOnStop: parseBoolEnv(process.env.BRAINTRUST_PLUGIN_BLOCK_ON_STOP), }); } catch (err) { logger.error("unexpected hook client error", { error: String(err) }); diff --git a/plugins/trace-codex/src/server/enqueue-client.ts b/plugins/trace-codex/src/server/enqueue-client.ts index a947c0e..f857ebd 100644 --- a/plugins/trace-codex/src/server/enqueue-client.ts +++ b/plugins/trace-codex/src/server/enqueue-client.ts @@ -31,24 +31,31 @@ export async function postEnqueue( } /** - * Ask the server to process everything enqueued so far and flush buffered spans - * to the backend, then wait for it to confirm. Returns true on a 2xx response. - * Never throws. + * Ask the server to flush. Returns true on a 2xx response. Never throws. * - * The hook calls this after a terminal event so the final spans are delivered - * before the process tree is torn down (e.g. when a CI job ends right after the - * agent's last turn, before the background server's idle timeout fires). + * When `block` is true (the default), the server waits until everything + * enqueued so far has been processed and buffered spans have reached the backend + * before responding. The hook uses this after a terminal event so the final + * spans are delivered before the process tree is torn down (e.g. a CI job ending + * right after the agent's last turn, before the idle timeout fires). + * + * When `block` is false, the server just signals every processor to flush its + * buffered state and responds immediately, without waiting for the queue to + * drain — for periodic/best-effort flushes that must not stall the caller. */ export async function postFlush( config: Pick, logger: Logger, + options: { block?: boolean } = {}, ): Promise { + const block = options.block ?? true; try { - const res = await fetch(`${serverBaseUrl(config)}/flush`, { + const res = await fetch(`${serverBaseUrl(config)}/flush?block=${block}`, { method: "POST", - // The server bounds its own wait (FLUSH_TIMEOUT_MS); allow a little more - // here so we receive its response rather than aborting the request. - signal: AbortSignal.timeout(12_000), + // A blocking flush waits on the server (bounded by FLUSH_TIMEOUT_MS); allow + // a little more here so we receive its response rather than aborting. A + // non-blocking flush returns promptly, so a short timeout suffices. + signal: AbortSignal.timeout(block ? 12_000 : 2_000), }); if (!res.ok) { logger.warn("flush rejected", { status: res.status }); diff --git a/plugins/trace-codex/src/server/routes.test.ts b/plugins/trace-codex/src/server/routes.test.ts index c38cc1d..5f1d1b7 100644 --- a/plugins/trace-codex/src/server/routes.test.ts +++ b/plugins/trace-codex/src/server/routes.test.ts @@ -14,6 +14,7 @@ function makeDeps(overrides: Partial = {}): RouteDeps { state: new ServerState(TEST_VERSION), logger, queue: new EventQueue({ logger }), + flushProcessors: async () => {}, onShutdownRequested: () => {}, ...overrides, }; @@ -119,7 +120,7 @@ describe("POST /flush", () => { eventData: { hook_event_name: "Stop" }, }; - test("waits for the queue to drain, then returns ok", async () => { + test("block=true waits for the queue to drain, then returns ok", async () => { const processed: string[] = []; const logger = createTestLogger(); const queue = new EventQueue({ @@ -132,17 +133,54 @@ describe("POST /flush", () => { queue.start(); const deps = makeDeps({ logger, queue }); - // Enqueue a Stop, then flush. The flush response must come only after the - // event has been processed. + // Enqueue a Stop, then flush(block=true). The flush response must come only + // after the event has been processed. await handleRequest(post("/enqueue", validEvent), deps); - const res = await handleRequest(post("/flush"), deps); + const res = await handleRequest(post("/flush?block=true"), deps); expect(res.status).toBe(200); - expect(await res.json()).toEqual({ ok: true }); + expect(await res.json()).toEqual({ ok: true, blocked: true }); expect(processed).toEqual(["Stop"]); await queue.stop(); }); + + test("block=false signals processors and returns immediately (no queue drain)", async () => { + let flushSignals = 0; + // The consumer is never started, so the enqueued event stays unprocessed and + // the queue never drains. A non-blocking flush must still return promptly, + // having signalled the processors. + const logger = createTestLogger(); + const queue = new EventQueue({ logger }); + const deps = makeDeps({ + logger, + queue, + flushProcessors: async () => { + flushSignals += 1; + }, + }); + + await handleRequest(post("/enqueue", validEvent), deps); + expect(queue.size).toBe(1); // still queued, not drained + const res = await handleRequest(post("/flush?block=false"), deps); + + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ ok: true, blocked: false }); + expect(flushSignals).toBe(1); + expect(queue.size).toBe(1); // flush did not wait for / cause a drain + }); + + test("defaults to non-blocking when no block param is given", async () => { + let flushSignals = 0; + const deps = makeDeps({ + flushProcessors: async () => { + flushSignals += 1; + }, + }); + const res = await handleRequest(post("/flush"), deps); + expect(await res.json()).toEqual({ ok: true, blocked: false }); + expect(flushSignals).toBe(1); + }); }); describe("unknown routes", () => { diff --git a/plugins/trace-codex/src/server/routes.ts b/plugins/trace-codex/src/server/routes.ts index 3ae78bf..81655e3 100644 --- a/plugins/trace-codex/src/server/routes.ts +++ b/plugins/trace-codex/src/server/routes.ts @@ -28,6 +28,11 @@ export interface RouteDeps { logger: Logger; /** Queue that /enqueue pushes events onto. */ queue: EventQueue; + /** + * Signal every active processor to flush its buffered state now, without + * waiting for the queue to drain. Used by the non-blocking /flush path. + */ + flushProcessors: () => Promise; /** Invoked when /shutdown is hit, after the response is constructed. */ onShutdownRequested: () => void; } @@ -45,7 +50,7 @@ const SERVICE_UNAVAILABLE = () => json({ error: "shutting_down" }, 503); const FLUSH_TIMEOUT_MS = 10_000; export async function handleRequest(req: Request, deps: RouteDeps): Promise { - const { state, logger, queue, onShutdownRequested } = deps; + const { state, logger, queue, flushProcessors, onShutdownRequested } = deps; const url = new URL(req.url); const path = url.pathname; @@ -82,13 +87,24 @@ export async function handleRequest(req: Request, deps: RouteDeps): Promise] (block defaults to false) + // + // block=true: wait until everything enqueued so far has been processed AND + // buffered spans have been flushed to the backend, then respond. Used on + // terminal events / shutdown in short-lived environments (e.g. CI) where the + // background server won't survive to flush on idle, so the final spans must + // be delivered before the process tree is torn down. + // + // block=false: don't wait for the queue to drain. Just send the flush signal + // through to every active processor (each delivers its buffered spans) and + // respond immediately. Use this for periodic/best-effort flushes that must + // not stall the caller. if (path === "/flush" && req.method === "POST") { + const block = parseBoolParam(url.searchParams.get("block")); + if (!block) { + await flushProcessors(); + return json({ ok: true, blocked: false }); + } const flushed = await Promise.race([ queue.drained().then(() => true), // Bound the wait so a hung backend flush can't hold the request lock (and @@ -99,7 +115,7 @@ export async function handleRequest(req: Request, deps: RouteDeps): Promise; diff --git a/plugins/trace-codex/src/server/server.ts b/plugins/trace-codex/src/server/server.ts index 06a79b0..0601f5f 100644 --- a/plugins/trace-codex/src/server/server.ts +++ b/plugins/trace-codex/src/server/server.ts @@ -99,6 +99,7 @@ export function startServer( state, logger: log, queue, + flushProcessors: () => registry.flushAll(), onShutdownRequested: () => { void stop(); }, From 5412dc1fb02525d609b2ae8fb9fa200aea610d55 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 14:19:55 -0600 Subject: [PATCH 06/17] llm spans --- .../src/agents/codex/event-processor.test.ts | 206 +++++++++++++- .../src/agents/codex/event-processor.ts | 266 ++++++++++++++++-- .../src/agents/codex/test-helpers.ts | 38 +++ plugins/trace-codex/src/test-helpers.ts | 21 +- 4 files changed, 495 insertions(+), 36 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 789b653..86df597 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -6,20 +6,24 @@ import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; import { assertProducesTrace, + assistantMessage, configEvent, customToolCall, customToolCallOutput, FakeTranscriptReader, functionCall, functionCallOutput, + reasoning, sessionMeta, sessionStart, stop, taskComplete, taskStarted, + tokenCount, transcript, turnContext, userMessage, + userMessageItem, } from "./test-helpers.ts"; import type { TranscriptReader, @@ -246,8 +250,8 @@ describe("CodexEventProcessor: turn spans", () => { }); describe("CodexEventProcessor: span timing (from transcript timestamps)", () => { - // 2026-01-01T00:00:10Z = 1767225610, :12Z = 1767225612, :15Z = 1767225615. - test("turn and tool spans get start/end from their transcript record timestamps", async () => { + // :10Z=1767225610, :11=...611, :12=...612, :13=...613, :14=...614, :15=...615. + test("llm and tool spans use their own transcript record timestamps", async () => { await assertProducesTrace( [ sessionStart(), @@ -261,9 +265,20 @@ describe("CodexEventProcessor: span timing (from transcript timestamps)", () => timestamp: "2026-01-01T00:00:11Z", payload: { type: "task_started", turn_id: "t1" }, }), + // LLM call: opens at the assistant message (:12), requests a tool (:13), + // result arrives (:14), and the call closes at token_count (:15). transcript({ type: "response_item", timestamp: "2026-01-01T00:00:12Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "go" }], + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:13Z", payload: { type: "function_call", name: "exec_command", @@ -273,12 +288,17 @@ describe("CodexEventProcessor: span timing (from transcript timestamps)", () => }), transcript({ type: "response_item", - timestamp: "2026-01-01T00:00:13Z", + timestamp: "2026-01-01T00:00:14Z", payload: { type: "function_call_output", call_id: "c1", output: "ok" }, }), transcript({ type: "event_msg", timestamp: "2026-01-01T00:00:15Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:16Z", payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, }), stop({ turn_id: "t1" }), @@ -286,17 +306,24 @@ describe("CodexEventProcessor: span timing (from transcript timestamps)", () => { span_attributes: { name: "codex: work", type: "task" }, ended: true, - // Root ends on first Stop at the turn's completion time. - metrics: { start: 1767225610, end: 1767225615 }, + metrics: { start: 1767225610, end: 1767225616 }, children: [ { span_attributes: { name: "turn: t1", type: "task" }, - metrics: { start: 1767225611, end: 1767225615 }, + metrics: { start: 1767225611, end: 1767225616 }, ended: true, children: [ { + // LLM call: opened at the message (:12), closed at token_count (:15). + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225612, end: 1767225615 }, + ended: true, + }, + { + // Tool span uses its own record timestamps: function_call (:13) + // to function_call_output (:14). Ordered after the llm by start. span_attributes: { name: "exec_command", type: "tool" }, - metrics: { start: 1767225612, end: 1767225613 }, + metrics: { start: 1767225613, end: 1767225614 }, ended: true, }, ], @@ -471,6 +498,171 @@ describe("CodexEventProcessor: tool spans", () => { }); }); +describe("CodexEventProcessor: llm spans", () => { + test("a model call becomes an llm span (output + token metrics) under its turn", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "yo!" }), + reasoning(), + assistantMessage("Hey. What are we working on?"), + tokenCount({ input_tokens: 100, output_tokens: 14, total_tokens: 114 }), + taskComplete({ turn_id: "t1", last_agent_message: "Hey. What are we working on?" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "Hey. What are we working on?" }, + // Codex token keys mapped to Braintrust standard metric names. + metrics: { prompt_tokens: 100, completion_tokens: 14, tokens: 114 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + test("llm input is reconstructed from prior conversation messages", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + // The prompt arrives as a response_item user message (history source). + userMessageItem("yo!"), + assistantMessage("Hey."), + tokenCount({ input_tokens: 10, output_tokens: 2, total_tokens: 12 }), + taskComplete({ turn_id: "t1", last_agent_message: "Hey." }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + // Input = the user prompt seen before the call opened (chat msgs). + input: [{ role: "user", content: "yo!" }], + output: { role: "assistant", content: "Hey." }, + }, + ], + }, + ], + }, + ); + }); + + test("multiple model calls interleave with tool spans in execution order", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "do it" }), + // First model call: asks for a tool. + assistantMessage("calling tool"), + functionCall({ turn_id: "t1", name: "exec_command", call_id: "c1" }), + tokenCount({ input_tokens: 10, output_tokens: 5, total_tokens: 15 }), + functionCallOutput({ call_id: "c1", output: "result" }), + // Second model call: final answer. + assistantMessage("done"), + tokenCount({ input_tokens: 20, output_tokens: 3, total_tokens: 23 }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + // Ordered siblings: llm(call 1), tool, llm(call 2). + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + // The call produced an assistant message plus a tool call. + output: [ + { role: "assistant", content: "calling tool" }, + { + role: "assistant", + content: null, + tool_calls: [ + { + id: "c1", + type: "function", + function: { name: "exec_command", arguments: "{}" }, + }, + ], + }, + ], + ended: true, + }, + { span_attributes: { name: "exec_command", type: "tool" }, ended: true }, + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "done" }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + test("a model call with no closing token_count is closed when the turn ends", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "hi" }), + assistantMessage("partial"), + // No token_count; turn completes anyway. + taskComplete({ turn_id: "t1", last_agent_message: "partial" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "partial" }, + ended: true, + }, + ], + }, + ], + }, + ); + }); +}); + describe("CodexEventProcessor: config / master switch", () => { test("a config event builds the per-session factory with its config", async () => { let seen: ReportingConfig | undefined; diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index a94576f..72a90da 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -38,11 +38,14 @@ import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; import { EVT_TASK_COMPLETE, EVT_TASK_STARTED, + EVT_TOKEN_COUNT, EVT_USER_MESSAGE, ITEM_CUSTOM_TOOL_CALL, ITEM_CUSTOM_TOOL_CALL_OUTPUT, ITEM_FUNCTION_CALL, ITEM_FUNCTION_CALL_OUTPUT, + ITEM_MESSAGE, + ITEM_REASONING, ITEM_TOOL_SEARCH_CALL, ITEM_TOOL_SEARCH_OUTPUT, parseTranscriptLine, @@ -109,6 +112,72 @@ interface RootEnrichment { permissionMode?: string; } +/** + * An OpenAI-style chat message. Used to reconstruct each LLM call's input (the + * transcript does not log the literal request) and to shape its output, so + * Braintrust renders these spans as proper LLM calls. Tool calls become an + * assistant message with `tool_calls`; tool results become a `tool` message. + */ +interface ChatMessage { + role: "developer" | "user" | "assistant" | "tool"; + content: string | null; + tool_calls?: Array<{ + id: string; + type: "function"; + function: { name: string; arguments: string }; + }>; + tool_call_id?: string; +} + +/** State for the LLM span currently in progress (one model call). */ +interface OpenLlm { + span: Span; + turnId: string | undefined; + /** Assistant messages produced by this call (text and/or tool calls). */ + output: ChatMessage[]; +} + +/** Map Codex token usage keys onto Braintrust's standard metric names. */ +function tokenMetrics(usage: Record): Record { + const metrics: Record = {}; + const num = (v: unknown): number | undefined => + typeof v === "number" && Number.isFinite(v) ? v : undefined; + const map: Array<[string, string]> = [ + ["input_tokens", "prompt_tokens"], + ["output_tokens", "completion_tokens"], + ["total_tokens", "tokens"], + ["cached_input_tokens", "prompt_cached_tokens"], + ["reasoning_output_tokens", "completion_reasoning_tokens"], + ]; + for (const [from, to] of map) { + const v = num(usage[from]); + if (v !== undefined) metrics[to] = v; + } + return metrics; +} + +/** + * Shape an LLM call's output for Braintrust. Braintrust renders a single + * assistant message or an array of messages as a chat completion; a single + * message is returned directly, multiple are returned as an array. + */ +function llmOutput(messages: ChatMessage[]): unknown { + if (messages.length === 1) return messages[0]; + return messages; +} + +/** Extract the concatenated text from a response_item message's content array. */ +function messageText(content: unknown): string | undefined { + if (!Array.isArray(content)) return undefined; + const parts: string[] = []; + for (const part of content) { + if (part && typeof part === "object" && typeof (part as { text?: unknown }).text === "string") { + parts.push((part as { text: string }).text); + } + } + return parts.length > 0 ? parts.join("") : undefined; +} + export class CodexEventProcessor implements EventProcessor { private readonly logger: Logger; private readonly queueId: string | null; @@ -126,9 +195,16 @@ export class CodexEventProcessor implements EventProcessor { // only blocks-and-polls when one of these is still open; a turn that is merely // mid-progress (no Stop yet) does not make an idle flush wait. private readonly turnsAwaitingCompletion = new Set(); + // Conversation history accumulated from transcript records (developer/user/ + // assistant messages and tool calls/outputs), as OpenAI-style chat messages. + // The transcript does not log the outbound API request, so we reconstruct each + // LLM call's input as the history captured up to the moment its span opens. + private readonly conversationHistory: ChatMessage[] = []; + // The currently-open LLM span, if a model call is in progress. One LLM call = + // a run of reasoning/message (+ tool) items terminated by a token_count. + private openLlm: OpenLlm | null = null; // Tool spans currently open, keyed by call_id. Opened by a tool-call - // response_item and closed by the matching *_output. The owning turn_id is - // tracked so any still-open tool spans are ended when their turn ends. + // response_item and closed by the matching *_output; ordered by start time. private readonly openTools = new Map(); // Root enrichment from the SessionStart hook (source/permission_mode), which // the transcript lacks. Applied when the root span is created, or patched onto @@ -303,11 +379,14 @@ export class CodexEventProcessor implements EventProcessor { if (ptype === EVT_TASK_STARTED) this.startTurnSpan(record); else if (ptype === EVT_USER_MESSAGE) this.setTurnInput(record); else if (ptype === EVT_TASK_COMPLETE) this.endTurnSpan(record); + else if (ptype === EVT_TOKEN_COUNT) this.closeLlmSpan(record); return; } if (record.type === RECORD_RESPONSE_ITEM) { const ptype = record.payload?.type ?? ""; - if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(record); + if (ptype === ITEM_MESSAGE) this.handleMessageItem(record); + else if (ptype === ITEM_REASONING) this.handleReasoningItem(record); + else if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(record); else if (TOOL_OUTPUT_TYPES.has(ptype)) this.endToolSpan(record); return; } @@ -504,6 +583,10 @@ export class CodexEventProcessor implements EventProcessor { const payload = (record.payload ?? {}) as Record; const prompt = typeof payload.message === "string" ? payload.message : undefined; if (prompt === undefined) return; + // Note: we do NOT add the prompt to conversationHistory here — the same + // prompt also arrives as a response_item user message (handleMessageItem), + // which is the canonical source for history. This event only sets the turn's + // input for display. const turnSpan = this.latestOpenTurn(); if (turnSpan === undefined) { this.logger.debug("codex processor: user_message with no open turn span", { @@ -543,6 +626,10 @@ export class CodexEventProcessor implements EventProcessor { return; } const endTime = isoToUnixSeconds(record.timestamp); + // Backstop: close any spans still open without their own closing record + // (a trailing model call with no token_count, or a tool with no output) at + // the turn's end time. + this.endOpenLlmSpan(endTime); this.endOpenToolSpansForTurn(turnId, endTime); try { turnSpan.log({ output }); @@ -561,18 +648,155 @@ export class CodexEventProcessor implements EventProcessor { } } - // Open a tool span (child of its turn) for a tool-call response_item. Keyed by - // call_id so the matching *_output can close it. Skipped (with a warning) when - // there's no call_id or no open turn for its metadata.turn_id. + // ========================================================================== + // LLM spans (transcript-driven) + // + // One LLM call = a run of reasoning/message (+ tool) items terminated by a + // token_count. The span opens lazily on the first model-output item (assistant + // message or reasoning) and closes at the next token_count, which carries the + // call's token usage. Tool calls the model emits are sibling tool spans under + // the turn (handled elsewhere); the assistant text/reasoning is the LLM span's + // output, and its input is the conversation reconstructed up to the open. + // ========================================================================== + + // Ensure an LLM span is open for the current model call, opening one (child of + // the active turn) on first use with input = the conversation so far. + private ensureLlmSpan(record: TranscriptRecord): void { + if (this.openLlm !== null) return; + const turnSpan = this.latestOpenTurn(); + if (turnSpan === undefined) { + this.logger.debug("codex processor: model output with no open turn; skipping llm span", { + queueId: this.queueId, + }); + return; + } + const turnId = this.latestOpenTurnId(); + const startTime = isoToUnixSeconds(record.timestamp); + // Snapshot the conversation as this call's input (the transcript doesn't log + // the real request, so this is a best-effort reconstruction). + const input = this.conversationHistory.map((item) => ({ ...item })); + try { + const span = turnSpan.startSpan({ + name: this.model ?? "llm", + type: "llm", + ...(startTime !== undefined ? { startTime } : {}), + event: { input, metadata: { model: this.model, turn_id: turnId } }, + }); + this.openLlm = { span, turnId, output: [] }; + this.logger.info("codex processor: opened llm span", { + queueId: this.queueId, + turnId, + spanId: span.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open llm span", { + queueId: this.queueId, + error: String(err), + }); + } + } + + // Handle a response_item message: an assistant message is this call's output + // (opens/feeds the LLM span); user/developer messages are input context. All + // are appended to conversation history as chat messages for reconstructing + // later calls' inputs. + private handleMessageItem(record: TranscriptRecord): void { + const payload = (record.payload ?? {}) as Record; + const rawRole = typeof payload.role === "string" ? payload.role : "user"; + const role: ChatMessage["role"] = + rawRole === "assistant" || rawRole === "developer" || rawRole === "tool" ? rawRole : "user"; + const text = messageText(payload.content); + if (text === undefined) return; + + const msg: ChatMessage = { role, content: text }; + if (role === "assistant") { + this.ensureLlmSpan(record); + this.openLlm?.output.push(msg); + } + this.conversationHistory.push(msg); + } + + // Handle a reasoning item: it belongs to the current model call but its raw + // content is encrypted/opaque, so we only use it to ensure the LLM span is + // open. (No readable text to surface here.) + private handleReasoningItem(record: TranscriptRecord): void { + this.ensureLlmSpan(record); + } + + // Close the current LLM call on a token_count (the segment boundary): end the + // LLM span with token-usage metrics and its accumulated output. (Tool spans + // are opened/closed by their own records, so nothing to flush here.) + private closeLlmSpan(record: TranscriptRecord): void { + if (this.openLlm === null) return; + const payload = (record.payload ?? {}) as Record; + const info = (payload.info ?? {}) as Record; + const usage = (info.last_token_usage ?? {}) as Record; + const metrics = tokenMetrics(usage); + const endTime = isoToUnixSeconds(record.timestamp); + const { span, output } = this.openLlm; + try { + span.log({ output: llmOutput(output), metrics }); + span.end(endTime !== undefined ? { endTime } : undefined); + this.logger.info("codex processor: ended llm span", { queueId: this.queueId }); + } catch (err) { + this.logger.error("codex processor: failed to end llm span", { + queueId: this.queueId, + error: String(err), + }); + } finally { + this.openLlm = null; + } + } + + // End any open LLM span without a closing token_count (e.g. on turn/root end). + private endOpenLlmSpan(endTime: number | undefined): void { + if (this.openLlm === null) return; + const { span, output } = this.openLlm; + try { + if (output.length > 0) span.log({ output: llmOutput(output) }); + span.end(endTime !== undefined ? { endTime } : undefined); + } catch (err) { + this.logger.error("codex processor: failed to end dangling llm span", { + queueId: this.queueId, + error: String(err), + }); + } finally { + this.openLlm = null; + } + } + + // Open a tool span (child of its turn) for a tool-call response_item, keyed by + // call_id so the matching *_output can close it. Each tool span is opened and + // closed by its own records (no cross-segment buffering), which keeps it robust + // against the transcript being read in pieces. Tool spans are ordered by their + // transcript start time, so they render between the LLM calls around them. private startToolSpan(record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; const name = typeof payload.name === "string" ? payload.name : undefined; const meta = (payload.metadata ?? {}) as Record; - const turnId = typeof meta.turn_id === "string" ? meta.turn_id : undefined; + const turnId = typeof meta.turn_id === "string" ? meta.turn_id : this.latestOpenTurnId(); // function_call uses `arguments` (a JSON string); custom_tool_call uses `input`. const input = payload.arguments ?? payload.input; + // A tool call is part of the assistant's response: record it as an assistant + // message carrying a tool_call, both in the open LLM call's output and in the + // conversation history (input context for later calls). + const argsString = typeof input === "string" ? input : JSON.stringify(input ?? {}); + const toolCallMsg: ChatMessage = { + role: "assistant", + content: null, + tool_calls: [ + { + id: callId ?? "", + type: "function", + function: { name: name ?? "tool", arguments: argsString }, + }, + ], + }; + this.openLlm?.output.push(toolCallMsg); + this.conversationHistory.push(toolCallMsg); + if (callId === undefined) { this.logger.warn("codex processor: tool call without call_id; skipping tool span", { queueId: this.queueId, @@ -632,11 +856,16 @@ export class CodexEventProcessor implements EventProcessor { } // Close the tool span matching a *_output record's call_id, recording the - // output. + // output, and add the result to conversation history as a `tool` message. private endToolSpan(record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; const output = payload.output ?? payload.result; + this.conversationHistory.push({ + role: "tool", + content: typeof output === "string" ? output : JSON.stringify(output ?? null), + tool_call_id: callId ?? "", + }); if (callId === undefined) { this.logger.debug("codex processor: tool output without call_id", { queueId: this.queueId, @@ -668,18 +897,12 @@ export class CodexEventProcessor implements EventProcessor { } // End any still-open tool spans owned by the given turn (e.g. a call whose - // output never arrived) when the turn ends. Uses the turn's end time so a - // dangling tool span doesn't get a wall-clock end far in the future. + // output never arrived) when the turn ends, at the turn's end time. private endOpenToolSpansForTurn(turnId: string, endTime: number | undefined): void { for (const [callId, entry] of this.openTools) { if (entry.turnId !== turnId) continue; try { entry.span.end(endTime !== undefined ? { endTime } : undefined); - this.logger.info("codex processor: ended dangling tool span on turn end", { - queueId: this.queueId, - turnId, - callId, - }); } catch (err) { this.logger.error("codex processor: failed to end dangling tool span", { queueId: this.queueId, @@ -694,13 +917,20 @@ export class CodexEventProcessor implements EventProcessor { } // The most recently opened turn that is still open (used to attach a - // user_message, which carries no turn_id). + // user_message or an LLM span, which carry no turn_id). private latestOpenTurn(): Span | undefined { let last: Span | undefined; for (const span of this.openTurns.values()) last = span; return last; } + // The turn_id of the most recently opened still-open turn. + private latestOpenTurnId(): string | undefined { + let last: string | undefined; + for (const turnId of this.openTurns.keys()) last = turnId; + return last; + } + // Number of turns whose Stop has fired but which are still open (awaiting // their task_complete). flush() polls only while this is > 0. private countPendingTurns(): number { @@ -720,7 +950,9 @@ export class CodexEventProcessor implements EventProcessor { if (this.rootSpan === null || this.rootEnded) return; this.rootEnded = true; const endArgs = endTime !== undefined ? { endTime } : undefined; - // Backstop: close any spans still open so nothing dangles. + // Backstop: close any open LLM span, any still-open tool spans, then any + // still-open turn spans, so nothing dangles. + this.endOpenLlmSpan(endTime); for (const [callId, entry] of this.openTools) { try { entry.span.end(endArgs); diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index 19dba96..6ec8163 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -164,6 +164,44 @@ export function customToolCallOutput(payload: Record = {}): Tra }); } +/** A message response_item with the given role and text. */ +export function message(role: string, text: string): TranscriptEntry { + const itemType = role === "assistant" ? "output_text" : "input_text"; + return transcript({ + type: "response_item", + payload: { type: "message", role, content: [{ type: itemType, text }] }, + }); +} + +/** assistant message response_item (model output; opens/feeds an llm span). */ +export function assistantMessage(text: string): TranscriptEntry { + return message("assistant", text); +} + +/** user message response_item (input context). */ +export function userMessageItem(text: string): TranscriptEntry { + return message("user", text); +} + +/** reasoning response_item; pass summary strings to surface reasoning text. */ +export function reasoning(summary: string[] = []): TranscriptEntry { + return transcript({ + type: "response_item", + payload: { type: "reasoning", summary, encrypted_content: "opaque" }, + }); +} + +/** + * token_count event: closes the current llm span. `usage` maps the Codex token + * keys (input_tokens, output_tokens, total_tokens, ...) onto last_token_usage. + */ +export function tokenCount(usage: Record = {}): TranscriptEntry { + return transcript({ + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: usage } }, + }); +} + /** One entry in a mixed trace list: a hook event or a transcript write. */ export type TraceEntry = EnqueueEvent | TranscriptEntry; diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts index a5de277..8bd8ac0 100644 --- a/plugins/trace-codex/src/test-helpers.ts +++ b/plugins/trace-codex/src/test-helpers.ts @@ -85,7 +85,7 @@ export interface CapturedSpan { input?: unknown; output?: unknown; metadata?: Record; - metrics?: { start?: number; end?: number }; + metrics?: { start?: number; end?: number } & Record; } export interface CapturedTrace { @@ -131,7 +131,7 @@ export interface SpanTree { input?: unknown; output?: unknown; metadata?: Record; - metrics?: { start?: number; end?: number }; + metrics?: { start?: number; end?: number } & Record; children: SpanTree[]; } @@ -191,7 +191,7 @@ export interface ExpectedSpan { /** If set, assert whether the span has an end time (true) or not (false). */ ended?: boolean; /** If set, assert exact start/end metric values (Unix seconds). */ - metrics?: { start?: number; end?: number }; + metrics?: { start?: number; end?: number } & Record; /** Exact list of children (length and order are checked). */ children?: ExpectedSpan[]; } @@ -247,15 +247,12 @@ export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: } } if (expected.metrics !== undefined) { - if (expected.metrics.start !== undefined && actual.metrics?.start !== expected.metrics.start) { - diffs.push( - `${path}.metrics.start: expected ${expected.metrics.start}, got ${String(actual.metrics?.start)}`, - ); - } - if (expected.metrics.end !== undefined && actual.metrics?.end !== expected.metrics.end) { - diffs.push( - `${path}.metrics.end: expected ${expected.metrics.end}, got ${String(actual.metrics?.end)}`, - ); + for (const [key, value] of Object.entries(expected.metrics)) { + if (value === undefined) continue; + const actualValue = actual.metrics?.[key]; + if (actualValue !== value) { + diffs.push(`${path}.metrics.${key}: expected ${value}, got ${String(actualValue)}`); + } } } if (expected.children !== undefined) { From 7e07e6493a2d14e3aeb0058480c6b758b3d9d25f Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 14:50:00 -0600 Subject: [PATCH 07/17] subagents --- .../src/agents/codex/event-processor.test.ts | 148 ++++ .../src/agents/codex/event-processor.ts | 754 +++++++++++++----- .../src/agents/codex/test-helpers.ts | 58 ++ 3 files changed, 742 insertions(+), 218 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 86df597..15a2b53 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -13,10 +13,15 @@ import { FakeTranscriptReader, functionCall, functionCallOutput, + MultiFileTranscriptReader, + postToolUse, + preToolUse, reasoning, sessionMeta, sessionStart, stop, + subagentStart, + subagentStop, taskComplete, taskStarted, tokenCount, @@ -936,3 +941,146 @@ describe("CodexEventProcessor: transcript catch-up plumbing", () => { } }); }); + +describe("CodexEventProcessor: subagents", () => { + const MAIN = "/test/main.jsonl"; + const SUB = "/test/sub.jsonl"; + + // A main session that spawns one subagent. The subagent writes to its own + // transcript file; the processor must nest its full hierarchy (subagent root + // -> turn -> llm/tool) under the spawn_agent tool span that launched it, and + // never let the two files' offsets cross-contaminate. + test("a subagent renders as a nested trace under its spawn_agent tool", async () => { + const reader = new MultiFileTranscriptReader(); + const w = (path: string, record: Record) => + reader.append(path, JSON.stringify(transcript(record).record)); + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + + // Main session: root + a turn that calls spawn_agent. + w(MAIN, { type: "session_meta", payload: { id: "s", cwd: "/work/app" } }); + w(MAIN, { type: "event_msg", payload: { type: "task_started", turn_id: "main-1" } }); + w(MAIN, { type: "turn_context", payload: { model: "gpt-5.5" } }); + w(MAIN, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(MAIN, { + type: "response_item", + payload: { + type: "function_call", + name: "spawn_agent", + call_id: "call_spawn", + arguments: "{}", + metadata: { turn_id: "main-1" }, + }, + }); + w(MAIN, { + type: "response_item", + payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" }, + }); + w(MAIN, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN })); + await processor.process( + preToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + }), + ); + // PostToolUse reveals the agent_id; maps it to the spawn tool span. + await processor.process( + postToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + tool_response: JSON.stringify({ agent_id: "agent-1" }), + }), + ); + + // Subagent: its own transcript + lifecycle hooks (carry agent_id). + w(SUB, { type: "session_meta", payload: { id: "agent-1" } }); + w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-1" } }); + w(SUB, { type: "turn_context", payload: { model: "gpt-5.5" } }); + w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(SUB, { + type: "response_item", + payload: { + type: "function_call", + name: "exec_command", + call_id: "call_exec", + arguments: "{}", + metadata: { turn_id: "sub-1" }, + }, + }); + w(SUB, { + type: "response_item", + payload: { type: "function_call_output", call_id: "call_exec", output: "ok" }, + }); + w(SUB, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + w(SUB, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" }, + }); + await processor.process( + subagentStart({ + session_id: "s", + transcript_path: SUB, + agent_id: "agent-1", + agent_type: "default", + }), + ); + await processor.process( + subagentStop({ + session_id: "s", + transcript_path: MAIN, + agent_transcript_path: SUB, + agent_id: "agent-1", + }), + ); + + // Main session completes. + w(MAIN, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "all done" }, + }); + await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" })); + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + expect(tree?.name).toBe("codex: app"); + const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); + expect(mainTurn).toBeDefined(); + + // The spawn_agent tool span is under the main turn, and the subagent root + // is nested under it (not a sibling of the main turn, not duplicated). + const spawn = mainTurn?.children.find((c) => c.name === "spawn_agent"); + expect(spawn?.type).toBe("tool"); + const subRoot = spawn?.children.find((c) => c.name === "subagent: agent-1"); + expect(subRoot).toBeDefined(); + expect(subRoot?.type).toBe("task"); + + const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); + expect(subTurn).toBeDefined(); + const exec = subTurn?.children.find((c) => c.name === "exec_command"); + expect(exec?.type).toBe("tool"); + + // No subagent turn leaked to the top level (the original bug). + expect(tree?.children.some((c) => c.name === "turn: sub-1")).toBe(false); + } finally { + trace.cleanup(); + } + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 72a90da..76f42f8 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -59,6 +59,10 @@ import { defaultTranscriptReader, type TranscriptReader } from "./transcript-rea const SESSION_START = "SessionStart"; const STOP = "Stop"; +const SUBAGENT_START = "SubagentStart"; +const SUBAGENT_STOP = "SubagentStop"; +const POST_TOOL_USE = "PostToolUse"; +const SPAWN_AGENT_TOOL = "spawn_agent"; // Bound on how long a Stop hook waits for the turn's task_complete to appear in // the transcript before giving up and emitting whatever exists. Codex can write @@ -137,6 +141,53 @@ interface OpenLlm { output: ChatMessage[]; } +/** + * All parsing state tied to a single transcript file. The main session has one + * scope (its parent is the root span); each subagent gets its own scope (its + * parent is the subagent root span, a child of the spawn_agent tool that created + * it). Keeping offset + open spans per file is essential: a session and its + * subagents are interleaved on one hook stream but write to separate transcript + * files, so a single shared offset would read the wrong file's bytes. + */ +interface TranscriptScope { + /** Absolute transcript path this scope reads from. */ + readonly path: string; + /** "main" session vs. a spawned subagent. */ + readonly kind: "main" | "subagent"; + /** + * Parent span for this scope's turn spans. For the main scope this is the + * root span; for a subagent it is the subagent root span. Resolved lazily so a + * subagent scope can be registered before its parent span is built. + */ + readonly parentSpan: () => Span | null; + /** Byte offset already consumed from `path`. */ + offset: number; + /** Turn spans currently open in this scope, keyed by turn_id. */ + readonly openTurns: Map; + /** turn_ids whose Stop has fired but whose task_complete hasn't been seen. */ + readonly turnsAwaitingCompletion: Set; + /** OpenAI-style chat history, used to reconstruct each LLM call's input. */ + readonly conversationHistory: ChatMessage[]; + /** The currently-open LLM span in this scope, if a model call is in progress. */ + openLlm: OpenLlm | null; + /** Tool spans currently open in this scope, keyed by call_id. */ + readonly openTools: Map; + /** Model slug, learned from this scope's first turn_context. */ + model: string | undefined; + /** End time (Unix seconds) of the most recently completed turn in this scope. */ + lastTurnEndTime: number | undefined; + /** + * For a subagent scope: its root span (the span the subagent's turns hang + * under) and whether it has been ended. Null/false for the main scope, which + * uses the processor's root span instead. The root span is created lazily from + * the subagent's own session_meta record (so it gets a real start time); + * `pendingSubagent` holds what's needed to create it until then. + */ + subagentRootSpan: Span | null; + subagentEnded: boolean; + pendingSubagent?: { agentId: string; agentType: string | undefined; parent: Span }; +} + /** Map Codex token usage keys onto Braintrust's standard metric names. */ function tokenMetrics(usage: Record): Record { const metrics: Record = {}; @@ -187,43 +238,28 @@ export class CodexEventProcessor implements EventProcessor { private reportingConfig: ReportingConfig | undefined; private rootSpan: Span | null = null; private rootEnded = false; - // Turn spans currently open, keyed by turn_id. A turn is opened by - // task_started and closed by the matching task_complete. - private readonly openTurns = new Map(); - // turn_ids whose Stop hook has fired but whose task_complete hasn't been seen - // yet (Codex can write task_complete seconds after the Stop hook). flush() - // only blocks-and-polls when one of these is still open; a turn that is merely - // mid-progress (no Stop yet) does not make an idle flush wait. - private readonly turnsAwaitingCompletion = new Set(); - // Conversation history accumulated from transcript records (developer/user/ - // assistant messages and tool calls/outputs), as OpenAI-style chat messages. - // The transcript does not log the outbound API request, so we reconstruct each - // LLM call's input as the history captured up to the moment its span opens. - private readonly conversationHistory: ChatMessage[] = []; - // The currently-open LLM span, if a model call is in progress. One LLM call = - // a run of reasoning/message (+ tool) items terminated by a token_count. - private openLlm: OpenLlm | null = null; - // Tool spans currently open, keyed by call_id. Opened by a tool-call - // response_item and closed by the matching *_output; ordered by start time. - private readonly openTools = new Map(); // Root enrichment from the SessionStart hook (source/permission_mode), which // the transcript lacks. Applied when the root span is created, or patched onto // the root if it already exists. private rootEnrichment: RootEnrichment = {}; - // Model slug, learned from the first turn_context (session_meta has none). - private model: string | undefined; - // End time (Unix seconds) of the most recently completed turn. Used to end the - // root span at the right time when the session is finalized (on flush), since - // Codex has no session-end hook and the SDK records a span's end time once. - private lastTurnEndTime: number | undefined; // Reads new transcript content on each hook (the "catch-up"). private readonly transcriptReader: TranscriptReader; - // Byte offset into the transcript already consumed. In-memory only; if this - // processor is evicted and recreated, the transcript is re-read from 0. - private transcriptOffset = 0; - // Transcript path from the latest hook, so flush() can do a final catch-up - // (the last turn's task_complete can land just after its Stop hook fires). - private transcriptPath: string | null = null; + // One parsing scope per transcript file (the main session plus one per + // subagent), keyed by transcript path. Each owns its own offset and open + // spans, so interleaved hooks for different files never cross-contaminate. + private readonly scopes = new Map(); + // The main session's scope (its turns hang under the root span). Created on + // the first hook carrying the main transcript path. + private mainScope: TranscriptScope | null = null; + // call_id -> the spawn_agent tool span, recorded when a spawn_agent tool span + // opens in the main transcript. The spawn_agent PostToolUse hook carries the + // resulting agent_id and the call_id (tool_use_id), letting us map agent_id to + // its spawn tool span (below) even after the tool span has closed. + private readonly spawnToolSpansByCallId = new Map(); + // agent_id -> the spawn_agent tool span that created that subagent. Used to + // parent a subagent's root span under the spawning tool. Resolved from the + // spawn_agent PostToolUse (agent_id + call_id) via spawnToolSpansByCallId. + private readonly spawnToolSpansByAgentId = new Map(); constructor( queueId: string | null, @@ -252,8 +288,9 @@ export class CodexEventProcessor implements EventProcessor { // Dispatcher. The config event short-circuits (it configures reporting and // must run before any spans). Every other hook (a) records any hook-only data - // it carries, (b) catches up the transcript into spans, and on Stop (c) ends - // the root span. Wrapped so transcript work can never break the turn. + // it carries (root enrichment, subagent lifecycle), (b) catches up the + // relevant transcript into spans, and on a main-session Stop (c) ends the root + // span. Wrapped so transcript work can never break the turn. async process(event: EnqueueEvent): Promise { if (event.eventName === CODEX_CONFIG_EVENT) { this.configure(event); @@ -268,70 +305,96 @@ export class CodexEventProcessor implements EventProcessor { return; } + const data = (event.eventData ?? {}) as Record; + const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; + if (event.eventName === SESSION_START) { this.recordRootEnrichment(event); } + // A subagent is starting: open its root span (under the spawn_agent tool that + // created it) and register a scope for its transcript so its hooks build a + // full turn/llm/tool hierarchy inside that span. + if (event.eventName === SUBAGENT_START) { + this.handleSubagentStart(event); + } + + // Resolve the scope this hook belongs to (main vs. a specific subagent) by + // its transcript path, so each file keeps its own offset and open spans. + const scope = this.scopeForEvent(event); + // A Stop signals its turn is finishing; mark it so flush knows to wait for // this turn's task_complete (which Codex may write after the Stop hook). - if (event.eventName === STOP) { - const data = (event.eventData ?? {}) as Record; + if ((event.eventName === STOP || event.eventName === SUBAGENT_STOP) && scope !== null) { const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; - if (turnId !== undefined) this.turnsAwaitingCompletion.add(turnId); + if (turnId !== undefined) scope.turnsAwaitingCompletion.add(turnId); } - try { - await this.catchUpTranscript(event); - } catch (err) { - this.logger.error("codex processor: transcript catch-up failed", { - queueId: this.queueId, - eventName: event.eventName, - error: String(err), - }); + if (scope !== null) { + try { + await this.catchUpScope(scope, event); + } catch (err) { + this.logger.error("codex processor: transcript catch-up failed", { + queueId: this.queueId, + eventName: event.eventName, + error: String(err), + }); + } + } + + // After catching up the main transcript (which creates the spawn_agent tool + // span), map the resulting subagent's agent_id to that span so a following + // SubagentStart can nest the subagent under it. The PostToolUse's tool_use_id + // is the spawn_agent function_call's call_id; its tool_response carries the + // agent_id. + if (event.eventName === POST_TOOL_USE && agentId === undefined) { + this.recordSpawnedAgent(event); + } + + // A subagent has finished: close its turns and root span. + if (event.eventName === SUBAGENT_STOP && agentId !== undefined) { + this.handleSubagentStop(event, agentId); } - // End the root span on the first Stop. The catch-up above has already - // processed this turn's task_complete, so lastTurnEndTime is the turn's - // completion time — used as the root's end time so it isn't stamped with a - // late wall-clock value. - if (event.eventName === STOP) { - this.endRootSpan(this.lastTurnEndTime); + // End the (main) root span on the first main-session Stop. A SubagentStop + // must NOT end the root — the parent session is still running. + if (event.eventName === STOP && agentId === undefined) { + this.endRootSpan(this.mainScope?.lastTurnEndTime); } } - // Read transcript lines appended since our last read and turn each into spans. - // The transcript path comes from the hook payload. Never throws. + // Read transcript lines appended to this scope's file since its last read and + // turn each into spans. Never throws. // - // On a Stop, first consume whatever's already on disk. If that closes the - // turn, we're done. Otherwise Codex hasn't written this turn's task_complete - // yet (it can lag the Stop hook by seconds), so wait (bounded) for it — but - // only then, so a Stop whose turn is already complete returns immediately. - private async catchUpTranscript(event: EnqueueEvent): Promise { + // On a Stop/SubagentStop, first consume whatever's already on disk. If that + // closes the turn, we're done. Otherwise Codex hasn't written this turn's + // task_complete yet (it can lag the Stop hook by seconds), so wait (bounded) + // for it — but only then, so a Stop whose turn is already complete returns + // immediately. + private async catchUpScope(scope: TranscriptScope, event: EnqueueEvent): Promise { const data = (event.eventData ?? {}) as Record; - const path = typeof data.transcript_path === "string" ? data.transcript_path : undefined; - if (path === undefined) return; - this.transcriptPath = path; // Initial read of everything available now. - const initial = this.transcriptReader.readFrom(path, this.transcriptOffset); - this.transcriptOffset = initial.offset; - this.consumeLines(initial.lines); + const initial = this.transcriptReader.readFrom(scope.path, scope.offset); + scope.offset = initial.offset; + this.consumeLines(scope, initial.lines); - if (event.eventName !== STOP) return; + const isStop = event.eventName === STOP || event.eventName === SUBAGENT_STOP; + if (!isStop) return; const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; // If the turn already closed during the initial read, no wait is needed. - if (turnId === undefined || !this.openTurns.has(turnId)) return; + if (turnId === undefined || !scope.openTurns.has(turnId)) return; // Wait (bounded) for this turn's task_complete to be written. const result = await this.transcriptReader.waitFor( - path, - this.transcriptOffset, + scope.path, + scope.offset, (line) => this.isTaskCompleteFor(line, turnId), { timeoutMs: SENTINEL_TIMEOUT_MS, intervalMs: SENTINEL_INTERVAL_MS }, ); - this.transcriptOffset = result.offset; - this.consumeLines(result.lines); + scope.offset = result.offset; + this.consumeLines(scope, result.lines); if (!result.sentinelFound) { this.logger.warn("codex processor: Stop sentinel not found; emitting partial turn", { queueId: this.queueId, @@ -340,13 +403,232 @@ export class CodexEventProcessor implements EventProcessor { } } - // Parse and process a batch of transcript lines into spans, advancing nothing - // (the caller owns the offset). Used by catch-up and by the final flush read. - private consumeLines(lines: string[]): void { + // ========================================================================== + // Transcript scopes + // ========================================================================== + + // Resolve (creating if needed) the scope a hook belongs to, by its transcript + // path. A subagent hook (carrying agent_id) uses the subagent's transcript; + // its scope is created by SubagentStart and looked up here. Everything else is + // the main session. Returns null if the hook carries no usable transcript path + // or a subagent scope hasn't been registered yet. + private scopeForEvent(event: EnqueueEvent): TranscriptScope | null { + const data = (event.eventData ?? {}) as Record; + const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; + // SubagentStop reports the subagent file under agent_transcript_path (its + // transcript_path points at the PARENT session), so prefer that. + const path = + agentId !== undefined && typeof data.agent_transcript_path === "string" + ? data.agent_transcript_path + : typeof data.transcript_path === "string" + ? data.transcript_path + : undefined; + if (path === undefined) return null; + + if (agentId === undefined) return this.ensureMainScope(path); + + // Subagent hook: its scope must already exist (registered by SubagentStart). + const existing = this.scopes.get(path); + if (existing !== undefined) return existing; + this.logger.debug("codex processor: subagent hook before its scope exists; skipping", { + queueId: this.queueId, + agentId, + }); + return null; + } + + // The main session's scope, created on first use. Its turns hang under the + // processor's root span. + private ensureMainScope(path: string): TranscriptScope { + if (this.mainScope !== null) return this.mainScope; + const scope = this.newScope(path, "main", () => this.rootSpan); + this.mainScope = scope; + this.scopes.set(path, scope); + return scope; + } + + private newScope( + path: string, + kind: "main" | "subagent", + parentSpan: () => Span | null, + ): TranscriptScope { + return { + path, + kind, + parentSpan, + offset: 0, + openTurns: new Map(), + turnsAwaitingCompletion: new Set(), + conversationHistory: [], + openLlm: null, + openTools: new Map(), + model: undefined, + lastTurnEndTime: undefined, + subagentRootSpan: null, + subagentEnded: false, + }; + } + + // ========================================================================== + // Subagent lifecycle + // ========================================================================== + + // Record the agent_id produced by a spawn_agent tool, mapping it to that + // tool's span (located by call_id), so a later SubagentStart can parent the + // subagent's root span under the spawning tool. No-op for non-spawn tools or + // when the tool span isn't known. + private recordSpawnedAgent(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const toolName = typeof data.tool_name === "string" ? data.tool_name : undefined; + if (toolName !== SPAWN_AGENT_TOOL) return; + const callId = typeof data.tool_use_id === "string" ? data.tool_use_id : undefined; + if (callId === undefined) return; + const response = data.tool_response; + let agentId: string | undefined; + if (typeof response === "string") { + try { + const parsed = JSON.parse(response) as Record; + if (typeof parsed.agent_id === "string") agentId = parsed.agent_id; + } catch { + // tool_response wasn't JSON; nothing to map. + } + } else if (response !== null && typeof response === "object") { + const ar = (response as Record).agent_id; + if (typeof ar === "string") agentId = ar; + } + if (agentId === undefined) return; + const span = this.spawnToolSpansByCallId.get(callId); + if (span === undefined) { + this.logger.debug("codex processor: spawn_agent PostToolUse with no known tool span", { + queueId: this.queueId, + callId, + agentId, + }); + return; + } + this.spawnToolSpansByAgentId.set(agentId, span); + } + + // A subagent is starting. Register a scope reading the subagent's own + // transcript so its turns/llm/tools build a full hierarchy. The subagent root + // span is created lazily from the subagent's session_meta (so it gets a real + // start time); we stash the parent here — the spawn_agent tool span that + // created it (matched by agent_id from the spawn_agent PostToolUse), or the + // main root span as a fallback if that mapping isn't known. + private handleSubagentStart(event: EnqueueEvent): void { + const data = (event.eventData ?? {}) as Record; + const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; + const path = typeof data.transcript_path === "string" ? data.transcript_path : undefined; + if (agentId === undefined || path === undefined) { + this.logger.warn("codex processor: SubagentStart missing agent_id/transcript_path", { + queueId: this.queueId, + }); + return; + } + if (this.scopes.has(path)) return; // already registered + + const parent = this.spawnToolSpansByAgentId.get(agentId) ?? this.rootSpan; + if (parent === null) { + this.logger.warn("codex processor: SubagentStart with no parent span; skipping", { + queueId: this.queueId, + agentId, + }); + return; + } + const agentType = typeof data.agent_type === "string" ? data.agent_type : undefined; + const scope = this.newScope(path, "subagent", () => scope.subagentRootSpan); + scope.pendingSubagent = { agentId, agentType, parent }; + this.scopes.set(path, scope); + this.logger.info("codex processor: registered subagent scope", { + queueId: this.queueId, + agentId, + }); + } + + // Create a subagent's root span from its session_meta record (child of the + // spawn_agent tool span, with the record's timestamp as start time). Called + // once per subagent scope, when its session_meta is first consumed. + private startSubagentRootSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const pending = scope.pendingSubagent; + if (pending === undefined || scope.subagentRootSpan !== null) return; + const startTime = isoToUnixSeconds(record.timestamp); + try { + scope.subagentRootSpan = pending.parent.startSpan({ + name: `subagent: ${pending.agentId}`, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { metadata: { agent_id: pending.agentId, agent_type: pending.agentType } }, + }); + this.logger.info("codex processor: opened subagent root span", { + queueId: this.queueId, + agentId: pending.agentId, + spanId: scope.subagentRootSpan.id, + }); + } catch (err) { + this.logger.error("codex processor: failed to open subagent root span", { + queueId: this.queueId, + agentId: pending.agentId, + error: String(err), + }); + } + } + + // A subagent has finished (its task_complete has been consumed by the catch-up + // in process()). Close any spans still open in its scope, then end its root + // span at the subagent's last turn end time. + private handleSubagentStop(event: EnqueueEvent, agentId: string): void { + const data = (event.eventData ?? {}) as Record; + const path = + typeof data.agent_transcript_path === "string" ? data.agent_transcript_path : undefined; + const scope = path !== undefined ? this.scopes.get(path) : undefined; + if (scope === undefined || scope.subagentRootSpan === null) { + this.logger.debug("codex processor: SubagentStop with no scope/root span", { + queueId: this.queueId, + agentId, + }); + return; + } + if (scope.subagentEnded) return; + scope.subagentEnded = true; + const endTime = scope.lastTurnEndTime; + const endArgs = endTime !== undefined ? { endTime } : undefined; + this.endOpenLlmSpan(scope, endTime); + this.closeAllOpenTools(scope, endArgs); + for (const [, span] of scope.openTurns) { + try { + span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling subagent turn span", { + queueId: this.queueId, + agentId, + error: String(err), + }); + } + } + scope.openTurns.clear(); + try { + scope.subagentRootSpan.end(endArgs); + this.logger.info("codex processor: ended subagent root span", { + queueId: this.queueId, + agentId, + }); + } catch (err) { + this.logger.error("codex processor: failed to end subagent root span", { + queueId: this.queueId, + agentId, + error: String(err), + }); + } + } + + // Parse and process a batch of transcript lines into spans for the given + // scope, advancing nothing (the caller owns the offset). Used by catch-up and + // by the final flush read. + private consumeLines(scope: TranscriptScope, lines: string[]): void { for (const line of lines) { const record = parseTranscriptLine(line); if (record === null) continue; - this.processTranscriptEvent(record); + this.processTranscriptEvent(scope, record); } } @@ -362,32 +644,35 @@ export class CodexEventProcessor implements EventProcessor { return record.payload?.turn_id === turnId; } - // Turn one parsed transcript record into span operations. Unknown record types - // are ignored. Never throws; per-record failures are logged. - private processTranscriptEvent(record: TranscriptRecord): void { + // Turn one parsed transcript record into span operations for its scope. + // Unknown record types are ignored. Never throws; per-record failures logged. + private processTranscriptEvent(scope: TranscriptScope, record: TranscriptRecord): void { try { if (record.type === RECORD_SESSION_META) { - this.startRootSpan(record); + // The main session's session_meta opens the processor root span; a + // subagent's opens its (lazily created) subagent root span. + if (scope.kind === "main") this.startRootSpan(record); + else this.startSubagentRootSpan(scope, record); return; } if (record.type === RECORD_TURN_CONTEXT) { - this.noteModel(record); + this.noteModel(scope, record); return; } if (record.type === RECORD_EVENT_MSG) { const ptype = record.payload?.type; - if (ptype === EVT_TASK_STARTED) this.startTurnSpan(record); - else if (ptype === EVT_USER_MESSAGE) this.setTurnInput(record); - else if (ptype === EVT_TASK_COMPLETE) this.endTurnSpan(record); - else if (ptype === EVT_TOKEN_COUNT) this.closeLlmSpan(record); + if (ptype === EVT_TASK_STARTED) this.startTurnSpan(scope, record); + else if (ptype === EVT_USER_MESSAGE) this.setTurnInput(scope, record); + else if (ptype === EVT_TASK_COMPLETE) this.endTurnSpan(scope, record); + else if (ptype === EVT_TOKEN_COUNT) this.closeLlmSpan(scope, record); return; } if (record.type === RECORD_RESPONSE_ITEM) { const ptype = record.payload?.type ?? ""; - if (ptype === ITEM_MESSAGE) this.handleMessageItem(record); - else if (ptype === ITEM_REASONING) this.handleReasoningItem(record); - else if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(record); - else if (TOOL_OUTPUT_TYPES.has(ptype)) this.endToolSpan(record); + if (ptype === ITEM_MESSAGE) this.handleMessageItem(scope, record); + else if (ptype === ITEM_REASONING) this.handleReasoningItem(scope, record); + else if (TOOL_CALL_TYPES.has(ptype)) this.startToolSpan(scope, record); + else if (TOOL_OUTPUT_TYPES.has(ptype)) this.endToolSpan(scope, record); return; } } catch (err) { @@ -485,12 +770,12 @@ export class CodexEventProcessor implements EventProcessor { type: "task", ...(startTime !== undefined ? { startTime } : {}), event: { - input: { model: this.model, cwd, source: this.rootEnrichment.source }, + input: { model: this.mainScope?.model, cwd, source: this.rootEnrichment.source }, metadata: { // User-provided extras first, so the standard keys below win. ...this.reportingConfig?.additionalMetadata, session_id: sessionId, - model: this.model, + model: this.mainScope?.model, cwd, source: this.rootEnrichment.source, permission_mode: this.rootEnrichment.permissionMode, @@ -511,18 +796,19 @@ export class CodexEventProcessor implements EventProcessor { } } - // Learn the model from turn_context (session_meta has none). Backfill it onto - // the root input/metadata the first time we see it. - private noteModel(record: TranscriptRecord): void { + // Learn the scope's model from turn_context (session_meta has none). Backfill + // it onto the scope's parent span input/metadata the first time we see it. + private noteModel(scope: TranscriptScope, record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const model = typeof payload.model === "string" ? payload.model : undefined; - if (model === undefined || this.model !== undefined) return; - this.model = model; - if (this.rootSpan !== null) { + if (model === undefined || scope.model !== undefined) return; + scope.model = model; + const parent = scope.parentSpan(); + if (parent !== null) { try { - this.rootSpan.log({ input: { model }, metadata: { model } }); + parent.log({ input: { model }, metadata: { model } }); } catch (err) { - this.logger.error("codex processor: failed to backfill model on root", { + this.logger.error("codex processor: failed to backfill model", { queueId: this.queueId, error: String(err), }); @@ -530,10 +816,11 @@ export class CodexEventProcessor implements EventProcessor { } } - // Open a turn span on task_started, child of the root, keyed by turn_id. - private startTurnSpan(record: TranscriptRecord): void { - if (this.rootSpan === null) { - this.logger.warn("codex processor: task_started without a root span; ignoring", { + // Open a turn span on task_started, child of the scope's parent, by turn_id. + private startTurnSpan(scope: TranscriptScope, record: TranscriptRecord): void { + const parent = scope.parentSpan(); + if (parent === null) { + this.logger.warn("codex processor: task_started without a parent span; ignoring", { queueId: this.queueId, }); return; @@ -546,7 +833,7 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - if (this.openTurns.has(turnId)) { + if (scope.openTurns.has(turnId)) { this.logger.warn("codex processor: duplicate turn_id; keeping existing turn span", { queueId: this.queueId, turnId, @@ -555,13 +842,13 @@ export class CodexEventProcessor implements EventProcessor { } const startTime = isoToUnixSeconds(record.timestamp); try { - const turnSpan = this.rootSpan.startSpan({ + const turnSpan = parent.startSpan({ name: `turn: ${turnId}`, type: "task", ...(startTime !== undefined ? { startTime } : {}), - event: { metadata: { turn_id: turnId, model: this.model } }, + event: { metadata: { turn_id: turnId, model: scope.model } }, }); - this.openTurns.set(turnId, turnSpan); + scope.openTurns.set(turnId, turnSpan); this.logger.info("codex processor: opened turn span", { queueId: this.queueId, turnId, @@ -578,8 +865,8 @@ export class CodexEventProcessor implements EventProcessor { // Set the open turn's input from a user_message event (the prompt). The // user_message carries no turn_id, so it applies to the most recently opened - // still-open turn. - private setTurnInput(record: TranscriptRecord): void { + // still-open turn in this scope. + private setTurnInput(scope: TranscriptScope, record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const prompt = typeof payload.message === "string" ? payload.message : undefined; if (prompt === undefined) return; @@ -587,7 +874,7 @@ export class CodexEventProcessor implements EventProcessor { // prompt also arrives as a response_item user message (handleMessageItem), // which is the canonical source for history. This event only sets the turn's // input for display. - const turnSpan = this.latestOpenTurn(); + const turnSpan = this.latestOpenTurn(scope); if (turnSpan === undefined) { this.logger.debug("codex processor: user_message with no open turn span", { queueId: this.queueId, @@ -606,7 +893,7 @@ export class CodexEventProcessor implements EventProcessor { // Close the turn span on task_complete, recording the final assistant message // as output. Also closes any tool spans still open under that turn. - private endTurnSpan(record: TranscriptRecord): void { + private endTurnSpan(scope: TranscriptScope, record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const turnId = typeof payload.turn_id === "string" ? payload.turn_id : undefined; const output = @@ -617,7 +904,7 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - const turnSpan = this.openTurns.get(turnId); + const turnSpan = scope.openTurns.get(turnId); if (turnSpan === undefined) { this.logger.debug("codex processor: task_complete with no open turn span", { queueId: this.queueId, @@ -629,12 +916,12 @@ export class CodexEventProcessor implements EventProcessor { // Backstop: close any spans still open without their own closing record // (a trailing model call with no token_count, or a tool with no output) at // the turn's end time. - this.endOpenLlmSpan(endTime); - this.endOpenToolSpansForTurn(turnId, endTime); + this.endOpenLlmSpan(scope, endTime); + this.endOpenToolSpansForTurn(scope, turnId, endTime); try { turnSpan.log({ output }); turnSpan.end(endTime !== undefined ? { endTime } : undefined); - if (endTime !== undefined) this.lastTurnEndTime = endTime; + if (endTime !== undefined) scope.lastTurnEndTime = endTime; this.logger.info("codex processor: ended turn span", { queueId: this.queueId, turnId }); } catch (err) { this.logger.error("codex processor: failed to end turn span", { @@ -643,8 +930,8 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } finally { - this.openTurns.delete(turnId); - this.turnsAwaitingCompletion.delete(turnId); + scope.openTurns.delete(turnId); + scope.turnsAwaitingCompletion.delete(turnId); } } @@ -659,30 +946,30 @@ export class CodexEventProcessor implements EventProcessor { // output, and its input is the conversation reconstructed up to the open. // ========================================================================== - // Ensure an LLM span is open for the current model call, opening one (child of - // the active turn) on first use with input = the conversation so far. - private ensureLlmSpan(record: TranscriptRecord): void { - if (this.openLlm !== null) return; - const turnSpan = this.latestOpenTurn(); + // Ensure an LLM span is open for the scope's current model call, opening one + // (child of the active turn) on first use with input = the conversation so far. + private ensureLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void { + if (scope.openLlm !== null) return; + const turnSpan = this.latestOpenTurn(scope); if (turnSpan === undefined) { this.logger.debug("codex processor: model output with no open turn; skipping llm span", { queueId: this.queueId, }); return; } - const turnId = this.latestOpenTurnId(); + const turnId = this.latestOpenTurnId(scope); const startTime = isoToUnixSeconds(record.timestamp); // Snapshot the conversation as this call's input (the transcript doesn't log // the real request, so this is a best-effort reconstruction). - const input = this.conversationHistory.map((item) => ({ ...item })); + const input = scope.conversationHistory.map((item) => ({ ...item })); try { const span = turnSpan.startSpan({ - name: this.model ?? "llm", + name: scope.model ?? "llm", type: "llm", ...(startTime !== undefined ? { startTime } : {}), - event: { input, metadata: { model: this.model, turn_id: turnId } }, + event: { input, metadata: { model: scope.model, turn_id: turnId } }, }); - this.openLlm = { span, turnId, output: [] }; + scope.openLlm = { span, turnId, output: [] }; this.logger.info("codex processor: opened llm span", { queueId: this.queueId, turnId, @@ -700,7 +987,7 @@ export class CodexEventProcessor implements EventProcessor { // (opens/feeds the LLM span); user/developer messages are input context. All // are appended to conversation history as chat messages for reconstructing // later calls' inputs. - private handleMessageItem(record: TranscriptRecord): void { + private handleMessageItem(scope: TranscriptScope, record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const rawRole = typeof payload.role === "string" ? payload.role : "user"; const role: ChatMessage["role"] = @@ -710,30 +997,30 @@ export class CodexEventProcessor implements EventProcessor { const msg: ChatMessage = { role, content: text }; if (role === "assistant") { - this.ensureLlmSpan(record); - this.openLlm?.output.push(msg); + this.ensureLlmSpan(scope, record); + scope.openLlm?.output.push(msg); } - this.conversationHistory.push(msg); + scope.conversationHistory.push(msg); } // Handle a reasoning item: it belongs to the current model call but its raw // content is encrypted/opaque, so we only use it to ensure the LLM span is // open. (No readable text to surface here.) - private handleReasoningItem(record: TranscriptRecord): void { - this.ensureLlmSpan(record); + private handleReasoningItem(scope: TranscriptScope, record: TranscriptRecord): void { + this.ensureLlmSpan(scope, record); } // Close the current LLM call on a token_count (the segment boundary): end the // LLM span with token-usage metrics and its accumulated output. (Tool spans // are opened/closed by their own records, so nothing to flush here.) - private closeLlmSpan(record: TranscriptRecord): void { - if (this.openLlm === null) return; + private closeLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void { + if (scope.openLlm === null) return; const payload = (record.payload ?? {}) as Record; const info = (payload.info ?? {}) as Record; const usage = (info.last_token_usage ?? {}) as Record; const metrics = tokenMetrics(usage); const endTime = isoToUnixSeconds(record.timestamp); - const { span, output } = this.openLlm; + const { span, output } = scope.openLlm; try { span.log({ output: llmOutput(output), metrics }); span.end(endTime !== undefined ? { endTime } : undefined); @@ -744,14 +1031,14 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } finally { - this.openLlm = null; + scope.openLlm = null; } } // End any open LLM span without a closing token_count (e.g. on turn/root end). - private endOpenLlmSpan(endTime: number | undefined): void { - if (this.openLlm === null) return; - const { span, output } = this.openLlm; + private endOpenLlmSpan(scope: TranscriptScope, endTime: number | undefined): void { + if (scope.openLlm === null) return; + const { span, output } = scope.openLlm; try { if (output.length > 0) span.log({ output: llmOutput(output) }); span.end(endTime !== undefined ? { endTime } : undefined); @@ -761,7 +1048,7 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } finally { - this.openLlm = null; + scope.openLlm = null; } } @@ -770,12 +1057,20 @@ export class CodexEventProcessor implements EventProcessor { // closed by its own records (no cross-segment buffering), which keeps it robust // against the transcript being read in pieces. Tool spans are ordered by their // transcript start time, so they render between the LLM calls around them. - private startToolSpan(record: TranscriptRecord): void { + private startToolSpan(scope: TranscriptScope, record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; - const name = typeof payload.name === "string" ? payload.name : undefined; + // Most tool calls carry `name`; some (e.g. tool_search_call) don't, so fall + // back to the record subtype with the "_call" suffix stripped. + const ptype = typeof payload.type === "string" ? payload.type : undefined; + const name = + typeof payload.name === "string" + ? payload.name + : ptype !== undefined + ? ptype.replace(/_call$/, "") + : undefined; const meta = (payload.metadata ?? {}) as Record; - const turnId = typeof meta.turn_id === "string" ? meta.turn_id : this.latestOpenTurnId(); + const turnId = typeof meta.turn_id === "string" ? meta.turn_id : this.latestOpenTurnId(scope); // function_call uses `arguments` (a JSON string); custom_tool_call uses `input`. const input = payload.arguments ?? payload.input; @@ -794,8 +1089,8 @@ export class CodexEventProcessor implements EventProcessor { }, ], }; - this.openLlm?.output.push(toolCallMsg); - this.conversationHistory.push(toolCallMsg); + scope.openLlm?.output.push(toolCallMsg); + scope.conversationHistory.push(toolCallMsg); if (callId === undefined) { this.logger.warn("codex processor: tool call without call_id; skipping tool span", { @@ -811,7 +1106,7 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - const turnSpan = this.openTurns.get(turnId); + const turnSpan = scope.openTurns.get(turnId); if (turnSpan === undefined) { this.logger.warn("codex processor: tool call with no open turn span; skipping", { queueId: this.queueId, @@ -820,7 +1115,7 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - if (this.openTools.has(callId)) { + if (scope.openTools.has(callId)) { this.logger.warn("codex processor: duplicate call_id; keeping existing tool span", { queueId: this.queueId, callId, @@ -838,7 +1133,14 @@ export class CodexEventProcessor implements EventProcessor { metadata: { tool_name: name, call_id: callId, turn_id: turnId }, }, }); - this.openTools.set(callId, { span: toolSpan, turnId }); + scope.openTools.set(callId, { span: toolSpan, turnId }); + // Remember spawn_agent tool spans (main scope only) so a subagent's root + // span can be nested under the tool that launched it. The agent_id is not + // in the transcript; it arrives on the spawn_agent PostToolUse hook, which + // we map to this span by call_id. + if (scope.kind === "main" && name === SPAWN_AGENT_TOOL) { + this.spawnToolSpansByCallId.set(callId, toolSpan); + } this.logger.info("codex processor: opened tool span", { queueId: this.queueId, turnId, @@ -857,11 +1159,11 @@ export class CodexEventProcessor implements EventProcessor { // Close the tool span matching a *_output record's call_id, recording the // output, and add the result to conversation history as a `tool` message. - private endToolSpan(record: TranscriptRecord): void { + private endToolSpan(scope: TranscriptScope, record: TranscriptRecord): void { const payload = (record.payload ?? {}) as Record; const callId = typeof payload.call_id === "string" ? payload.call_id : undefined; const output = payload.output ?? payload.result; - this.conversationHistory.push({ + scope.conversationHistory.push({ role: "tool", content: typeof output === "string" ? output : JSON.stringify(output ?? null), tool_call_id: callId ?? "", @@ -872,7 +1174,7 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - const entry = this.openTools.get(callId); + const entry = scope.openTools.get(callId); if (entry === undefined) { this.logger.debug("codex processor: tool output with no open tool span", { queueId: this.queueId, @@ -892,14 +1194,18 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } finally { - this.openTools.delete(callId); + scope.openTools.delete(callId); } } // End any still-open tool spans owned by the given turn (e.g. a call whose // output never arrived) when the turn ends, at the turn's end time. - private endOpenToolSpansForTurn(turnId: string, endTime: number | undefined): void { - for (const [callId, entry] of this.openTools) { + private endOpenToolSpansForTurn( + scope: TranscriptScope, + turnId: string, + endTime: number | undefined, + ): void { + for (const [callId, entry] of scope.openTools) { if (entry.turnId !== turnId) continue; try { entry.span.end(endTime !== undefined ? { endTime } : undefined); @@ -911,72 +1217,86 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } finally { - this.openTools.delete(callId); + scope.openTools.delete(callId); + } + } + } + + // Close every still-open tool span in a scope (used when ending a subagent or + // the root). Clears the scope's openTools. + private closeAllOpenTools( + scope: TranscriptScope, + endArgs: { endTime: number } | undefined, + ): void { + for (const [callId, entry] of scope.openTools) { + try { + entry.span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling tool span", { + queueId: this.queueId, + callId, + error: String(err), + }); } } + scope.openTools.clear(); } - // The most recently opened turn that is still open (used to attach a - // user_message or an LLM span, which carry no turn_id). - private latestOpenTurn(): Span | undefined { + // The most recently opened turn that is still open in this scope (used to + // attach a user_message or an LLM span, which carry no turn_id). + private latestOpenTurn(scope: TranscriptScope): Span | undefined { let last: Span | undefined; - for (const span of this.openTurns.values()) last = span; + for (const span of scope.openTurns.values()) last = span; return last; } - // The turn_id of the most recently opened still-open turn. - private latestOpenTurnId(): string | undefined { + // The turn_id of the most recently opened still-open turn in this scope. + private latestOpenTurnId(scope: TranscriptScope): string | undefined { let last: string | undefined; - for (const turnId of this.openTurns.keys()) last = turnId; + for (const turnId of scope.openTurns.keys()) last = turnId; return last; } - // Number of turns whose Stop has fired but which are still open (awaiting - // their task_complete). flush() polls only while this is > 0. + // Number of turns (across all scopes) whose Stop has fired but which are still + // open (awaiting their task_complete). flush() polls only while this is > 0. private countPendingTurns(): number { let n = 0; - for (const turnId of this.turnsAwaitingCompletion) { - if (this.openTurns.has(turnId)) n += 1; + for (const scope of this.scopes.values()) { + for (const turnId of scope.turnsAwaitingCompletion) { + if (scope.openTurns.has(turnId)) n += 1; + } } return n; } - // End the root span on the first Stop hook. Subsequent Stops are ignored - // (the SDK records a span's end time once). `endTime` is the completing turn's - // end time so the root isn't stamped with a late wall-clock value; falls back - // to the SDK's now() if unknown. Any still-open turn/tool spans are closed at - // the same time so nothing dangles. + // End the root span on the first main-session Stop. Subsequent Stops are + // ignored (the SDK records a span's end time once). `endTime` is the completing + // turn's end time so the root isn't stamped with a late wall-clock value; falls + // back to the SDK's now() if unknown. Any still-open spans in the main scope + // are closed at the same time so nothing dangles. private endRootSpan(endTime: number | undefined): void { if (this.rootSpan === null || this.rootEnded) return; this.rootEnded = true; const endArgs = endTime !== undefined ? { endTime } : undefined; + const scope = this.mainScope; // Backstop: close any open LLM span, any still-open tool spans, then any - // still-open turn spans, so nothing dangles. - this.endOpenLlmSpan(endTime); - for (const [callId, entry] of this.openTools) { - try { - entry.span.end(endArgs); - } catch (err) { - this.logger.error("codex processor: failed to end dangling tool span on root end", { - queueId: this.queueId, - callId, - error: String(err), - }); - } - } - this.openTools.clear(); - for (const [turnId, span] of this.openTurns) { - try { - span.end(endArgs); - } catch (err) { - this.logger.error("codex processor: failed to end dangling turn span on root end", { - queueId: this.queueId, - turnId, - error: String(err), - }); + // still-open turn spans in the main scope, so nothing dangles. + if (scope !== null) { + this.endOpenLlmSpan(scope, endTime); + this.closeAllOpenTools(scope, endArgs); + for (const [turnId, span] of scope.openTurns) { + try { + span.end(endArgs); + } catch (err) { + this.logger.error("codex processor: failed to end dangling turn span on root end", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } } + scope.openTurns.clear(); } - this.openTurns.clear(); try { this.rootSpan.end(endArgs); this.logger.info("codex processor: ended root span", { queueId: this.queueId }); @@ -993,10 +1313,10 @@ export class CodexEventProcessor implements EventProcessor { // Final catch-up: a turn's task_complete can land slightly after its Stop // hook fires, so the Stop's bounded wait may miss it and leave the turn open. // flush() happens after the terminal Stop's /flush (and on idle/eviction). - // Poll the transcript until every open turn has closed (or a bounded timeout - // elapses), processing new records as they appear. This processes the - // transcript (it does not unilaterally end spans), staying consistent with - // "the transcript is the truth". + // Poll every scope's transcript until every open turn has closed (or a + // bounded timeout elapses), processing new records as they appear. This + // processes the transcript (it does not unilaterally end spans), staying + // consistent with "the transcript is the truth". await this.drainOpenTurns(); try { await this.rootSpan.flush(); @@ -1009,31 +1329,29 @@ export class CodexEventProcessor implements EventProcessor { } } - // Read and process new transcript records, retrying on a bounded interval, - // until no turn that has seen its Stop is still open (its task_complete can - // land seconds after the Stop hook) or the timeout elapses. Always does at - // least one read so a task_complete already on disk closes its turn - // immediately. Crucially, this does NOT wait for turns that are merely + // Read and process new transcript records for every scope, retrying on a + // bounded interval, until no turn that has seen its Stop is still open (its + // task_complete can land seconds after the Stop hook) or the timeout elapses. + // Always does at least one read so a task_complete already on disk closes its + // turn immediately. Crucially, this does NOT wait for turns that are merely // mid-progress (no Stop yet) — so an idle flush during an active turn returns - // promptly instead of blocking. On timeout, leaves whatever is still open for - // endRootSpan's backstop to close. + // promptly. On timeout, leaves whatever is still open for the backstops. private async drainOpenTurns(): Promise { - if (this.transcriptPath === null) return; + if (this.scopes.size === 0) return; const deadline = Date.now() + FLUSH_TIMEOUT_MS; for (;;) { - try { - const { lines, offset } = this.transcriptReader.readFrom( - this.transcriptPath, - this.transcriptOffset, - ); - this.transcriptOffset = offset; - this.consumeLines(lines); - } catch (err) { - this.logger.error("codex processor: final catch-up read failed", { - queueId: this.queueId, - error: String(err), - }); - return; + for (const scope of this.scopes.values()) { + try { + const { lines, offset } = this.transcriptReader.readFrom(scope.path, scope.offset); + scope.offset = offset; + this.consumeLines(scope, lines); + } catch (err) { + this.logger.error("codex processor: final catch-up read failed", { + queueId: this.queueId, + path: scope.path, + error: String(err), + }); + } } // Only turns whose Stop has fired but whose task_complete hasn't arrived // keep us waiting. (endTurnSpan removes them from both sets as they close.) diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index 6ec8163..0888d0f 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -59,6 +59,22 @@ export function stop(data: Record = {}): EnqueueEvent { return codexHook("Stop", { session_id: "session-1", ...data }); } +export function subagentStart(data: Record = {}): EnqueueEvent { + return codexHook("SubagentStart", { session_id: "session-1", ...data }); +} + +export function subagentStop(data: Record = {}): EnqueueEvent { + return codexHook("SubagentStop", { session_id: "session-1", ...data }); +} + +export function preToolUse(data: Record = {}): EnqueueEvent { + return codexHook("PreToolUse", { session_id: "session-1", ...data }); +} + +export function postToolUse(data: Record = {}): EnqueueEvent { + return codexHook("PostToolUse", { session_id: "session-1", ...data }); +} + /** A config event enabling tracing; pass extra reporting config via `data`. */ export function configEvent(data: Record = {}): EnqueueEvent { return { @@ -258,6 +274,48 @@ export class FakeTranscriptReader implements TranscriptReader { } } +/** + * A transcript reader backed by multiple in-memory files, keyed by path. Used by + * subagent tests, where the main session and each subagent write to separate + * transcript files but are read by one processor. `append(path, line)` writes to + * a specific file; reads/waits operate per path with independent offsets. + */ +export class MultiFileTranscriptReader implements TranscriptReader { + private readonly buffers = new Map(); + + append(path: string, line: string): void { + this.buffers.set(path, `${this.buffers.get(path) ?? ""}${line}\n`); + } + + readFrom(path: string, offset: number): TranscriptReadResult { + const buffer = this.buffers.get(path) ?? ""; + const from = offset > buffer.length ? 0 : offset; + const slice = buffer.slice(from); + const lines: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + lines.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + const sentinelFound = lines.some(predicate); + return { lines, offset: next, sentinelFound }; + } +} + // ============================================================================ // assertProducesTrace: run a mixed list of hook events and transcript writes // through a CodexEventProcessor and assert the resulting trace. From fe15e743746c19503e0a33c8cfa8b032437ce83b Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 15:34:46 -0600 Subject: [PATCH 08/17] compaction and permission requests --- .../src/agents/codex/event-processor.test.ts | 357 ++++++++++++++++++ .../src/agents/codex/event-processor.ts | 257 ++++++++++++- .../src/agents/codex/test-helpers.ts | 16 + plugins/trace-codex/src/test-helpers.ts | 37 +- 4 files changed, 651 insertions(+), 16 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 15a2b53..3067bcf 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -7,6 +7,7 @@ import { CodexEventProcessor } from "./event-processor.ts"; import { assertProducesTrace, assistantMessage, + compacted, configEvent, customToolCall, customToolCallOutput, @@ -14,7 +15,9 @@ import { functionCall, functionCallOutput, MultiFileTranscriptReader, + postCompact, postToolUse, + preCompact, preToolUse, reasoning, sessionMeta, @@ -503,6 +506,360 @@ describe("CodexEventProcessor: tool spans", () => { }); }); +describe("CodexEventProcessor: permissions", () => { + test("an escalated tool call is annotated with permission metadata and a tag", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "c1", + arguments: JSON.stringify({ + cmd: "curl example.com", + sandbox_permissions: "require_escalated", + justification: "Need network access", + prefix_rule: ["curl"], + }), + }), + functionCallOutput({ call_id: "c1", output: "ok" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { + permission: { + sandbox_permissions: "require_escalated", + justification: "Need network access", + prefix_rule: ["curl"], + }, + }, + tags: ["permission-request"], + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // An escalation is a failed sandboxed attempt followed by an escalated retry. + // Both are separate tool spans (distinct call_ids); only the retry carries the + // permission annotation. + test("a sandboxed attempt then escalated retry: only the retry is annotated", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "attempt", + arguments: JSON.stringify({ cmd: "curl example.com" }), + }), + functionCallOutput({ call_id: "attempt", output: "Error: network blocked" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "retry", + arguments: JSON.stringify({ + cmd: "curl example.com", + sandbox_permissions: "require_escalated", + justification: "Need network access", + }), + }), + functionCallOutput({ call_id: "retry", output: "ok" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + // The failed sandboxed attempt: no permission annotation. + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { call_id: "attempt", permission: undefined }, + output: "Error: network blocked", + ended: true, + }, + { + // The escalated retry: annotated. + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { + call_id: "retry", + permission: { + sandbox_permissions: "require_escalated", + justification: "Need network access", + }, + }, + tags: ["permission-request"], + output: "ok", + ended: true, + }, + ], + }, + ], + }, + ); + }); + + test("a non-escalated tool call has no permission metadata or tag", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + taskStarted({ turn_id: "t1" }), + functionCall({ + turn_id: "t1", + name: "exec_command", + call_id: "c1", + arguments: JSON.stringify({ cmd: "ls" }), + }), + functionCallOutput({ call_id: "c1", output: "file.txt" }), + taskComplete({ turn_id: "t1", last_agent_message: "done" }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "exec_command", type: "tool" }, + metadata: { permission: undefined }, + ended: true, + }, + ], + }, + ], + }, + ); + }); +}); + +describe("CodexEventProcessor: compaction", () => { + // A compaction is its own turn containing a `compacted` record. That turn span + // is relabeled "compaction", tagged, annotated, and given an llm child span + // showing what the compaction did (input = prior history; output = the + // replacement history). The trigger comes from the compact hooks, and the turn + // is terminated by PostCompact (a compaction turn gets no Stop of its own). + test("a compaction turn becomes a compaction span with an llm child", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "tc" }), + preCompact({ turn_id: "tc", trigger: "manual" }), + compacted({ + message: "", + replacement_history: [ + { type: "message", role: "user", content: [{ type: "input_text", text: "hi" }] }, + { type: "compaction", encrypted_content: "opaque" }, + ], + window_id: 1, + }), + tokenCount({ total_tokens: 6451 }), + taskComplete({ turn_id: "tc", last_agent_message: null }), + postCompact({ turn_id: "tc", trigger: "manual" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "compaction", type: "task" }, + metadata: { + compaction: { trigger: "manual", replaced_message_count: 2, window_id: 1 }, + }, + tags: ["compaction"], + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + metadata: { compaction: true }, + // Output is shaped as the kept context + a clear summary marker + // (the real summary is encrypted), not the raw replacement list. + output: { + summary: "[summary unavailable — encrypted by Codex]", + kept_messages: [ + { + type: "message", + role: "user", + content: [{ type: "input_text", text: "hi" }], + }, + ], + }, + metrics: { tokens: 6451 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // The trigger must be captured whether the hook arrives before or after the + // transcript's compacted record. Here the hook comes AFTER (e.g. a resumed + // session whose compacted record is read during SessionStart's catch-up); the + // trigger is back-filled onto the already-built span. + test("trigger is back-filled when the compact hook arrives after the record", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "tc" }), + compacted({ replacement_history: [{ type: "compaction" }], window_id: 2 }), + tokenCount({ total_tokens: 10 }), + taskComplete({ turn_id: "tc", last_agent_message: null }), + // Stop processes the compacted record (trigger not yet known)... + stop({ turn_id: "tc" }), + // ...then the PostCompact hook reveals the trigger, back-filled onto the span. + postCompact({ turn_id: "tc", trigger: "auto" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "compaction", type: "task" }, + metadata: { + compaction: { trigger: "auto", replaced_message_count: 1, window_id: 2 }, + }, + tags: ["compaction"], + }, + ], + }, + ); + }); + + // Regression for the live bug: a compaction turn gets no Stop, and PostCompact + // fires BEFORE Codex writes the turn's task_complete. PostCompact must do the + // same bounded wait Stop does, so the compaction span closes within the hook + // instead of lingering open until idle eviction (where it may never flush). + test("PostCompact waits for a task_complete that lands after it, closing the span", async () => { + class DelayedRevealReader implements TranscriptReader { + private lines: string[] = []; + private withheld: string | null = null; + private readsUntilReveal = 0; + + append(line: string): void { + this.lines.push(line); + } + + withhold(line: string, afterReads: number): void { + this.withheld = line; + this.readsUntilReveal = afterReads; + } + + readFrom(_path: string, offset: number): TranscriptReadResult { + if (this.withheld !== null) { + if (this.readsUntilReveal <= 0) { + this.lines.push(this.withheld); + this.withheld = null; + } else { + this.readsUntilReveal -= 1; + } + } + const buf = this.lines.length > 0 ? `${this.lines.join("\n")}\n` : ""; + const from = offset > buf.length ? 0 : offset; + const slice = buf.slice(from); + const out: string[] = []; + let consumed = 0; + let lineStart = 0; + for (let i = 0; i < slice.length; i++) { + if (slice[i] === "\n") { + out.push(slice.slice(lineStart, i)); + lineStart = i + 1; + consumed = lineStart; + } + } + return { lines: out, offset: from + consumed }; + } + + async waitFor( + path: string, + offset: number, + predicate: (line: string) => boolean, + _opts: WaitForOptions, + ): Promise { + const { lines, offset: next } = this.readFrom(path, offset); + return { lines, offset: next, sentinelFound: lines.some(predicate) }; + } + } + + const reader = new DelayedRevealReader(); + const write = (record: Record) => + reader.append(JSON.stringify({ timestamp: "2026-01-01T00:00:10Z", ...record })); + write({ type: "session_meta", payload: { id: "s", cwd: "/work" } }); + write({ type: "event_msg", payload: { type: "task_started", turn_id: "tc" } }); + write({ + timestamp: "2026-01-01T00:00:11Z", + type: "compacted", + payload: { replacement_history: [{ type: "compaction" }], window_id: 1 }, + }); + write({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + // task_complete is withheld until after PostCompact's initial read, so only + // its bounded waitFor reveals it. + reader.withhold( + JSON.stringify({ + timestamp: "2026-01-01T00:00:15Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "tc", last_agent_message: null }, + }), + 1, + ); + + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + await processor.process(sessionStart({ session_id: "s" })); + await processor.process(preCompact({ session_id: "s", turn_id: "tc", trigger: "manual" })); + // PostCompact's initial read won't see task_complete; its bounded wait must. + await processor.process(postCompact({ session_id: "s", turn_id: "tc", trigger: "manual" })); + + const tree = spansToTree(await trace.drain()); + const compaction = tree?.children.find((c) => c.name === "compaction"); + expect(compaction).toBeDefined(); + // The span closed within the hook (end time from the late task_complete). + expect(compaction?.metrics?.end).toBe(1767225615); // 2026-01-01T00:00:15Z + } finally { + trace.cleanup(); + } + }); +}); + describe("CodexEventProcessor: llm spans", () => { test("a model call becomes an llm span (output + token metrics) under its turn", async () => { await assertProducesTrace( diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 76f42f8..a2c2797 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -49,6 +49,7 @@ import { ITEM_TOOL_SEARCH_CALL, ITEM_TOOL_SEARCH_OUTPUT, parseTranscriptLine, + RECORD_COMPACTED, RECORD_EVENT_MSG, RECORD_RESPONSE_ITEM, RECORD_SESSION_META, @@ -62,7 +63,15 @@ const STOP = "Stop"; const SUBAGENT_START = "SubagentStart"; const SUBAGENT_STOP = "SubagentStop"; const POST_TOOL_USE = "PostToolUse"; +const PRE_COMPACT = "PreCompact"; +const POST_COMPACT = "PostCompact"; const SPAWN_AGENT_TOOL = "spawn_agent"; +// Tag applied to a tool span whose call requested escalated permissions, so +// permission-gated actions are easy to find/filter in Braintrust. +const PERMISSION_TAG = "permission-request"; +// Tag applied to a turn span that performed a context compaction, so compactions +// are easy to find/filter in Braintrust. +const COMPACTION_TAG = "compaction"; // Bound on how long a Stop hook waits for the turn's task_complete to appear in // the transcript before giving up and emitting whatever exists. Codex can write @@ -82,6 +91,17 @@ const FLUSH_INTERVAL_MS = 50; const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); +/** + * Whether a hook signals the end of a turn (so we should wait, bounded, for the + * turn's task_complete to be written). Stop ends a normal turn; SubagentStop a + * subagent turn; PostCompact a compaction turn — which, unlike the others, never + * gets a Stop, so this is what lets its span close promptly instead of lingering + * open until idle eviction. + */ +function isTurnTerminal(eventName: string): boolean { + return eventName === STOP || eventName === SUBAGENT_STOP || eventName === POST_COMPACT; +} + /** Tool-call response_item subtypes that OPEN a tool span. */ const TOOL_CALL_TYPES = new Set([ITEM_FUNCTION_CALL, ITEM_CUSTOM_TOOL_CALL, ITEM_TOOL_SEARCH_CALL]); /** Tool-call response_item subtypes that CLOSE a tool span (by call_id). */ @@ -139,6 +159,12 @@ interface OpenLlm { turnId: string | undefined; /** Assistant messages produced by this call (text and/or tool calls). */ output: ChatMessage[]; + /** + * When true, the span's output was already set at creation (e.g. a compaction + * call, whose output is the replacement history), so closing it must not + * overwrite it with the accumulated `output` messages. + */ + outputPreset?: boolean; } /** @@ -229,6 +255,75 @@ function messageText(content: unknown): string | undefined { return parts.length > 0 ? parts.join("") : undefined; } +/** + * Shape a compaction's `replacement_history` into a readable "after compaction" + * output. The replacement history is the new context Codex keeps going forward: + * a few recent messages kept verbatim, plus a `compaction` entry holding the + * summary as `encrypted_content` (opaque to us). We surface the readable kept + * messages and replace the encrypted summary with a clear marker, so the span + * doesn't look like the model merely echoed the last user message. + */ +function compactionOutput(replacement: unknown[] | undefined): Record { + if (replacement === undefined) { + return { summary: "[unavailable]", kept_messages: [] }; + } + const kept: unknown[] = []; + let summaryEncrypted = false; + for (const entry of replacement) { + if (entry !== null && typeof entry === "object") { + const e = entry as Record; + if (e.type === "compaction") { + if (typeof e.encrypted_content === "string") summaryEncrypted = true; + continue; + } + } + kept.push(entry); + } + return { + summary: summaryEncrypted ? "[summary unavailable — encrypted by Codex]" : "[no summary]", + kept_messages: kept, + }; +} + +/** A tool call's permission/escalation request, parsed from its arguments. */ +interface PermissionInfo { + /** The requested sandbox permission, e.g. "require_escalated". */ + sandbox_permissions: string; + /** The model's justification shown to the user (may be absent). */ + justification?: string; + /** The command prefix the escalation applies to (may be absent). */ + prefix_rule?: unknown; +} + +/** + * Parse a tool call's permission/escalation request from its arguments. Codex + * records an escalated retry's request inline in the function_call arguments + * (`sandbox_permissions`, `justification`, `prefix_rule`); when present, we + * surface it on the tool span. `args` is the raw arguments (a JSON string for + * function_call, or an object for custom_tool_call). Returns undefined when + * there's no escalation request. + */ +function permissionInfo(args: unknown): PermissionInfo | undefined { + let obj: Record | undefined; + if (typeof args === "string") { + try { + const parsed = JSON.parse(args) as unknown; + if (parsed !== null && typeof parsed === "object") obj = parsed as Record; + } catch { + return undefined; + } + } else if (args !== null && typeof args === "object") { + obj = args as Record; + } + if (obj === undefined) return undefined; + const sandbox = obj.sandbox_permissions; + if (typeof sandbox !== "string" || sandbox.length === 0) return undefined; + const info: PermissionInfo = { sandbox_permissions: sandbox }; + if (typeof obj.justification === "string") info.justification = obj.justification; + if (obj.prefix_rule !== undefined) info.prefix_rule = obj.prefix_rule; + return info; +} + export class CodexEventProcessor implements EventProcessor { private readonly logger: Logger; private readonly queueId: string | null; @@ -260,6 +355,16 @@ export class CodexEventProcessor implements EventProcessor { // parent a subagent's root span under the spawning tool. Resolved from the // spawn_agent PostToolUse (agent_id + call_id) via spawnToolSpansByCallId. private readonly spawnToolSpansByAgentId = new Map(); + // turn_id -> compaction trigger ("manual"/"auto"), recorded from the + // PreCompact/PostCompact hooks (the transcript's compacted record lacks the + // trigger). A side map so the hook and the transcript's compacted record can + // arrive in any order: whichever comes first stores its half here / below, and + // the second one reconciles. (On a resumed session the transcript's compacted + // record is read during SessionStart's catch-up, before the hook arrives.) + private readonly compactionTriggerByTurn = new Map(); + // turn_id -> the compaction span built from the transcript, kept so a later + // PreCompact/PostCompact hook can back-fill the trigger onto it. + private readonly compactionSpansByTurn = new Map(); constructor( queueId: string | null, @@ -312,6 +417,21 @@ export class CodexEventProcessor implements EventProcessor { this.recordRootEnrichment(event); } + // Compaction trigger is hook-only (not in the transcript). Record it keyed by + // turn_id, and if the compaction span was already built from the transcript + // (e.g. on a resumed session its compacted record is read during + // SessionStart's catch-up, before this hook arrives), back-fill the trigger + // onto it now. Order-independent: handleCompacted reads the map; this patches + // the span — whichever happens second reconciles. + if (event.eventName === PRE_COMPACT || event.eventName === POST_COMPACT) { + const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; + const trigger = typeof data.trigger === "string" ? data.trigger : undefined; + if (turnId !== undefined && trigger !== undefined) { + this.compactionTriggerByTurn.set(turnId, trigger); + this.backfillCompactionTrigger(turnId, trigger); + } + } + // A subagent is starting: open its root span (under the spawn_agent tool that // created it) and register a scope for its transcript so its hooks build a // full turn/llm/tool hierarchy inside that span. @@ -323,9 +443,11 @@ export class CodexEventProcessor implements EventProcessor { // its transcript path, so each file keeps its own offset and open spans. const scope = this.scopeForEvent(event); - // A Stop signals its turn is finishing; mark it so flush knows to wait for - // this turn's task_complete (which Codex may write after the Stop hook). - if ((event.eventName === STOP || event.eventName === SUBAGENT_STOP) && scope !== null) { + // A turn-terminal signal: the turn is finishing, so mark it (flush, and the + // catch-up below, wait for this turn's task_complete, which Codex may write + // after the hook). Stop ends a normal turn; SubagentStop ends a subagent + // turn; PostCompact ends a compaction turn (which gets no Stop of its own). + if (isTurnTerminal(event.eventName) && scope !== null) { const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; if (turnId !== undefined) scope.turnsAwaitingCompletion.add(turnId); } @@ -366,11 +488,13 @@ export class CodexEventProcessor implements EventProcessor { // Read transcript lines appended to this scope's file since its last read and // turn each into spans. Never throws. // - // On a Stop/SubagentStop, first consume whatever's already on disk. If that - // closes the turn, we're done. Otherwise Codex hasn't written this turn's - // task_complete yet (it can lag the Stop hook by seconds), so wait (bounded) - // for it — but only then, so a Stop whose turn is already complete returns - // immediately. + // On a turn-terminal hook (Stop/SubagentStop/PostCompact), first consume + // whatever's already on disk. If that closes the turn, we're done. Otherwise + // Codex hasn't written this turn's task_complete yet (it can lag the hook by + // seconds), so wait (bounded) for it — but only then, so a hook whose turn is + // already complete returns immediately. PostCompact matters specifically + // because a compaction turn gets no Stop, so without this its span would stay + // open until idle eviction (and may never be flushed). private async catchUpScope(scope: TranscriptScope, event: EnqueueEvent): Promise { const data = (event.eventData ?? {}) as Record; @@ -379,8 +503,7 @@ export class CodexEventProcessor implements EventProcessor { scope.offset = initial.offset; this.consumeLines(scope, initial.lines); - const isStop = event.eventName === STOP || event.eventName === SUBAGENT_STOP; - if (!isStop) return; + if (!isTurnTerminal(event.eventName)) return; const turnId = typeof data.turn_id === "string" ? data.turn_id : undefined; // If the turn already closed during the initial read, no wait is needed. @@ -396,8 +519,9 @@ export class CodexEventProcessor implements EventProcessor { scope.offset = result.offset; this.consumeLines(scope, result.lines); if (!result.sentinelFound) { - this.logger.warn("codex processor: Stop sentinel not found; emitting partial turn", { + this.logger.warn("codex processor: task_complete sentinel not found; partial turn", { queueId: this.queueId, + eventName: event.eventName, turnId, }); } @@ -659,6 +783,10 @@ export class CodexEventProcessor implements EventProcessor { this.noteModel(scope, record); return; } + if (record.type === RECORD_COMPACTED) { + this.handleCompacted(scope, record); + return; + } if (record.type === RECORD_EVENT_MSG) { const ptype = record.payload?.type; if (ptype === EVT_TASK_STARTED) this.startTurnSpan(scope, record); @@ -863,6 +991,103 @@ export class CodexEventProcessor implements EventProcessor { } } + // Handle a `compacted` record: the current turn is a context compaction (Codex + // runs it as its own turn). Relabel that turn span as a "compaction" span with + // metadata (trigger from the PreCompact/PostCompact hook, the count of messages + // the compaction collapsed to, and the window_id) and a tag, then open an llm + // span representing the compaction model call. The following token_count closes + // that llm span with its token usage, so the trace shows what the compaction + // did: the conversation before it (input) and the kept context after (output). + private handleCompacted(scope: TranscriptScope, record: TranscriptRecord): void { + const turnId = this.latestOpenTurnId(scope); + const turnSpan = this.latestOpenTurn(scope); + if (turnId === undefined || turnSpan === undefined) { + this.logger.debug("codex processor: compacted record with no open turn span", { + queueId: this.queueId, + }); + return; + } + const payload = (record.payload ?? {}) as Record; + const replacement = Array.isArray(payload.replacement_history) + ? (payload.replacement_history as unknown[]) + : undefined; + const trigger = this.compactionTriggerByTurn.get(turnId); + + // Relabel the turn span as a compaction span and annotate it. Keep the span + // so a PreCompact/PostCompact hook arriving later can back-fill the trigger. + this.compactionSpansByTurn.set(turnId, turnSpan); + try { + turnSpan.setAttributes({ name: "compaction", type: "task" }); + turnSpan.log({ + metadata: { + compaction: { + trigger, + replaced_message_count: replacement?.length, + window_id: payload.window_id, + }, + }, + tags: [COMPACTION_TAG], + }); + } catch (err) { + this.logger.error("codex processor: failed to annotate compaction span", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + + // Open an llm span for the compaction model call. To make it self-evident + // (and avoid the "model just echoed the user" look), shape input/output as + // before/after of the context window: + // input = the conversation history before compaction (+ its count) + // output = the kept context after compaction (readable messages) plus a + // clear marker for the summary, which Codex stores encrypted and + // does not expose to us. + // Closed by the turn's token_count (metrics) like any other model call. + const startTime = isoToUnixSeconds(record.timestamp); + const before = scope.conversationHistory.map((item) => ({ ...item })); + const output = compactionOutput(replacement); + try { + const span = turnSpan.startSpan({ + name: scope.model ?? "compaction", + type: "llm", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input: { messages_before_compaction: before.length, history: before }, + output, + metadata: { model: scope.model, turn_id: turnId, compaction: true }, + }, + }); + // Reuse the open-llm slot so the turn's token_count closes it with usage. + // outputPreset: the replacement history is already logged as the output, so + // the closing token_count must not overwrite it. + scope.openLlm = { span, turnId, output: [], outputPreset: true }; + } catch (err) { + this.logger.error("codex processor: failed to open compaction llm span", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + } + + // Patch the trigger onto a compaction span when its PreCompact/PostCompact hook + // arrives after the span was already built from the transcript. No-op if the + // span isn't known yet (handleCompacted will read the trigger from the map). + private backfillCompactionTrigger(turnId: string, trigger: string): void { + const span = this.compactionSpansByTurn.get(turnId); + if (span === undefined) return; + try { + span.log({ metadata: { compaction: { trigger } } }); + } catch (err) { + this.logger.error("codex processor: failed to back-fill compaction trigger", { + queueId: this.queueId, + turnId, + error: String(err), + }); + } + } + // Set the open turn's input from a user_message event (the prompt). The // user_message carries no turn_id, so it applies to the most recently opened // still-open turn in this scope. @@ -1020,9 +1245,9 @@ export class CodexEventProcessor implements EventProcessor { const usage = (info.last_token_usage ?? {}) as Record; const metrics = tokenMetrics(usage); const endTime = isoToUnixSeconds(record.timestamp); - const { span, output } = scope.openLlm; + const { span, output, outputPreset } = scope.openLlm; try { - span.log({ output: llmOutput(output), metrics }); + span.log(outputPreset ? { metrics } : { output: llmOutput(output), metrics }); span.end(endTime !== undefined ? { endTime } : undefined); this.logger.info("codex processor: ended llm span", { queueId: this.queueId }); } catch (err) { @@ -1122,6 +1347,9 @@ export class CodexEventProcessor implements EventProcessor { }); return; } + // If this call requested escalated permissions (Codex records the request + // inline in the arguments), surface it on the span as metadata and a tag. + const permission = permissionInfo(input); const startTime = isoToUnixSeconds(record.timestamp); try { const toolSpan = turnSpan.startSpan({ @@ -1130,7 +1358,8 @@ export class CodexEventProcessor implements EventProcessor { ...(startTime !== undefined ? { startTime } : {}), event: { input, - metadata: { tool_name: name, call_id: callId, turn_id: turnId }, + metadata: { tool_name: name, call_id: callId, turn_id: turnId, permission }, + ...(permission !== undefined ? { tags: [PERMISSION_TAG] } : {}), }, }); scope.openTools.set(callId, { span: toolSpan, turnId }); diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index 0888d0f..527d3a0 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -71,6 +71,14 @@ export function preToolUse(data: Record = {}): EnqueueEvent { return codexHook("PreToolUse", { session_id: "session-1", ...data }); } +export function preCompact(data: Record = {}): EnqueueEvent { + return codexHook("PreCompact", { session_id: "session-1", ...data }); +} + +export function postCompact(data: Record = {}): EnqueueEvent { + return codexHook("PostCompact", { session_id: "session-1", ...data }); +} + export function postToolUse(data: Record = {}): EnqueueEvent { return codexHook("PostToolUse", { session_id: "session-1", ...data }); } @@ -218,6 +226,14 @@ export function tokenCount(usage: Record = {}): TranscriptEntry }); } +/** + * compacted record: marks the current turn as a context compaction. + * `replacement_history` is the summarized history that replaces prior context. + */ +export function compacted(payload: Record = {}): TranscriptEntry { + return transcript({ type: "compacted", payload }); +} + /** One entry in a mixed trace list: a hook event or a transcript write. */ export type TraceEntry = EnqueueEvent | TranscriptEntry; diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts index 8bd8ac0..caea91b 100644 --- a/plugins/trace-codex/src/test-helpers.ts +++ b/plugins/trace-codex/src/test-helpers.ts @@ -85,6 +85,7 @@ export interface CapturedSpan { input?: unknown; output?: unknown; metadata?: Record; + tags?: string[]; metrics?: { start?: number; end?: number } & Record; } @@ -131,6 +132,7 @@ export interface SpanTree { input?: unknown; output?: unknown; metadata?: Record; + tags?: string[]; metrics?: { start?: number; end?: number } & Record; children: SpanTree[]; } @@ -171,6 +173,7 @@ export function spansToTree(spans: CapturedSpan[]): SpanTree | null { input: span.input, output: span.output, metadata: span.metadata, + tags: span.tags, metrics: span.metrics, children, }; @@ -188,6 +191,8 @@ export interface ExpectedSpan { input?: unknown; output?: unknown; metadata?: Record; + /** If set, assert each listed tag is present on the span. */ + tags?: string[]; /** If set, assert whether the span has an end time (true) or not (false). */ ended?: boolean; /** If set, assert exact start/end metric values (Unix seconds). */ @@ -200,6 +205,23 @@ function nameMatches(actual: string | undefined, expected: string | RegExp): boo return expected instanceof RegExp ? expected.test(actual ?? "") : actual === expected; } +/** JSON.stringify with object keys sorted recursively, for order-insensitive + * deep equality of plain data (objects/arrays/primitives). */ +function canonicalJson(value: unknown): string { + const sort = (v: unknown): unknown => { + if (Array.isArray(v)) return v.map(sort); + if (v !== null && typeof v === "object") { + const out: Record = {}; + for (const key of Object.keys(v as Record).sort()) { + out[key] = sort((v as Record)[key]); + } + return out; + } + return v; + }; + return JSON.stringify(sort(value)); +} + export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: string): string[] { const diffs: string[] = []; if (actual === null) { @@ -235,11 +257,22 @@ export function diffSpan(actual: SpanTree | null, expected: ExpectedSpan, path: } if (expected.metadata !== undefined) { for (const [key, value] of Object.entries(expected.metadata)) { - const a = JSON.stringify(actual.metadata?.[key]); - const e = JSON.stringify(value); + // Compare with sorted keys so nested-object key order doesn't matter (the + // SDK may reorder keys, e.g. when metadata is merged across log() calls). + const a = canonicalJson(actual.metadata?.[key]); + const e = canonicalJson(value); if (a !== e) diffs.push(`${path}.metadata.${key}: expected ${e}, got ${a}`); } } + if (expected.tags !== undefined) { + for (const tag of expected.tags) { + if (!actual.tags?.includes(tag)) { + diffs.push( + `${path}.tags: expected to include "${tag}", got ${JSON.stringify(actual.tags)}`, + ); + } + } + } if (expected.ended !== undefined) { const isEnded = actual.metrics?.end !== undefined; if (isEnded !== expected.ended) { From 79777e7ce2bcd63404abcad119312c74e82c35c4 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 16:11:46 -0600 Subject: [PATCH 09/17] add transcript path to span attributes --- .../src/agents/codex/event-processor.test.ts | 131 +++++++++++++++++- .../src/agents/codex/event-processor.ts | 18 ++- .../src/agents/codex/test-helpers.ts | 20 +++ 3 files changed, 167 insertions(+), 2 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 3067bcf..df7c001 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -25,6 +25,7 @@ import { stop, subagentStart, subagentStop, + TEST_TRANSCRIPT_PATH, taskComplete, taskStarted, tokenCount, @@ -55,7 +56,12 @@ describe("CodexEventProcessor: root span", () => { ], { span_attributes: { name: "codex: myapp", type: "task" }, - metadata: { session_id: "session-1", model: "gpt-5.5", source: "startup" }, + metadata: { + session_id: "session-1", + model: "gpt-5.5", + source: "startup", + transcript_path: TEST_TRANSCRIPT_PATH, + }, children: [], }, ); @@ -1418,6 +1424,8 @@ describe("CodexEventProcessor: subagents", () => { const tree = spansToTree(await trace.drain()); expect(tree?.name).toBe("codex: app"); + // The root carries the main session's transcript path. + expect(tree?.metadata?.transcript_path).toBe(MAIN); const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); expect(mainTurn).toBeDefined(); @@ -1428,6 +1436,8 @@ describe("CodexEventProcessor: subagents", () => { const subRoot = spawn?.children.find((c) => c.name === "subagent: agent-1"); expect(subRoot).toBeDefined(); expect(subRoot?.type).toBe("task"); + // The subagent root carries its own transcript path. + expect(subRoot?.metadata?.transcript_path).toBe(SUB); const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); expect(subTurn).toBeDefined(); @@ -1440,4 +1450,123 @@ describe("CodexEventProcessor: subagents", () => { trace.cleanup(); } }); + + // Regression for the live bug: after SubagentStop finalizes a subagent, a later + // flush (which drains every scope) must NOT re-read the subagent transcript and + // re-open its already-closed spans — doing so left the subagent root/turn/llm + // hanging (no end) because the re-opened, un-ended spans overwrote the closed + // ones. A finalized subagent scope must be inert. + test("a finalized subagent's spans stay closed across later flushes", async () => { + const reader = new MultiFileTranscriptReader(); + const w = (path: string, record: Record) => + reader.append(path, JSON.stringify(transcript(record).record)); + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + + // Main session spawns a subagent and stays open (no main Stop yet). + w(MAIN, { type: "session_meta", payload: { id: "s", cwd: "/work/app" } }); + w(MAIN, { type: "event_msg", payload: { type: "task_started", turn_id: "main-1" } }); + w(MAIN, { + type: "response_item", + payload: { + type: "function_call", + name: "spawn_agent", + call_id: "call_spawn", + arguments: "{}", + metadata: { turn_id: "main-1" }, + }, + }); + w(MAIN, { + type: "response_item", + payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" }, + }); + await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN })); + await processor.process( + postToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + tool_response: JSON.stringify({ agent_id: "agent-1" }), + }), + ); + + // Subagent transcript: a turn with a model call, then task_complete, then a + // TRAILING record after task_complete. SubagentStop's bounded wait stops at + // the task_complete sentinel, leaving the trailing record unconsumed; a + // later flush would otherwise re-read it and re-open spans on the finalized + // scope (the bug). With the fix, the finalized scope is inert. + w(SUB, { type: "session_meta", payload: { id: "agent-1" } }); + w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-1" } }); + w(SUB, { type: "turn_context", payload: { model: "gpt-5.5" } }); + w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(SUB, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + w(SUB, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" }, + }); + await processor.process( + subagentStart({ + session_id: "s", + transcript_path: SUB, + agent_id: "agent-1", + agent_type: "default", + }), + ); + await processor.process( + subagentStop({ + session_id: "s", + transcript_path: MAIN, + agent_transcript_path: SUB, + agent_id: "agent-1", + }), + ); + + // After finalize, MORE records land in the subagent transcript (Codex can + // write trailing records, and the rollout file persists). A later flush + // drains every scope and re-reads this file. If the finalized scope weren't + // inert, these would re-open a turn + llm on it, leaving spans hanging. + w(SUB, { type: "event_msg", payload: { type: "task_started", turn_id: "sub-2" } }); + w(SUB, { type: "response_item", payload: { type: "reasoning", summary: [] } }); + w(SUB, { + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: {} } }, + }); + await processor.flush(); + + // Main session finishes. + w(MAIN, { + type: "event_msg", + payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "done" }, + }); + await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" })); + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + const spawn = tree?.children + .find((c) => c.name === "turn: main-1") + ?.children.find((c) => c.name === "spawn_agent"); + const subRoot = spawn?.children.find((c) => c.name === "subagent: agent-1"); + const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); + + // The subagent root and turn are closed (not hanging), and the post-finalize + // records did NOT re-open anything: no second turn, no re-opened llm. + expect(subRoot?.metrics?.end).toBeDefined(); + expect(subTurn?.metrics?.end).toBeDefined(); + expect(subRoot?.children.some((c) => c.name === "turn: sub-2")).toBe(false); + expect(subRoot?.children.filter((c) => c.name?.startsWith("turn:")).length).toBe(1); + } finally { + trace.cleanup(); + } + }); }); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index a2c2797..2afac7d 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -681,7 +681,14 @@ export class CodexEventProcessor implements EventProcessor { name: `subagent: ${pending.agentId}`, type: "task", ...(startTime !== undefined ? { startTime } : {}), - event: { metadata: { agent_id: pending.agentId, agent_type: pending.agentType } }, + event: { + metadata: { + agent_id: pending.agentId, + agent_type: pending.agentType, + // The subagent's own rollout transcript on disk. + transcript_path: scope.path, + }, + }, }); this.logger.info("codex processor: opened subagent root span", { queueId: this.queueId, @@ -749,6 +756,9 @@ export class CodexEventProcessor implements EventProcessor { // scope, advancing nothing (the caller owns the offset). Used by catch-up and // by the final flush read. private consumeLines(scope: TranscriptScope, lines: string[]): void { + // A finalized subagent scope is inert: never re-open/mutate its spans (a + // re-read after SubagentStop would otherwise re-open already-closed spans). + if (scope.subagentEnded) return; for (const line of lines) { const record = parseTranscriptLine(line); if (record === null) continue; @@ -909,6 +919,8 @@ export class CodexEventProcessor implements EventProcessor { permission_mode: this.rootEnrichment.permissionMode, cli_version: typeof payload.cli_version === "string" ? payload.cli_version : undefined, project: this.reportingConfig?.project, + // The session's rollout transcript on disk (the source for this trace). + transcript_path: this.mainScope?.path, }, }, }); @@ -1570,6 +1582,10 @@ export class CodexEventProcessor implements EventProcessor { const deadline = Date.now() + FLUSH_TIMEOUT_MS; for (;;) { for (const scope of this.scopes.values()) { + // A finalized subagent scope is inert: re-reading it would re-open spans + // that were already closed (and the SDK would overwrite the closed spans + // with un-ended ones). Skip it. + if (scope.subagentEnded) continue; try { const { lines, offset } = this.transcriptReader.readFrom(scope.path, scope.offset); scope.offset = offset; diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index 527d3a0..438c5b4 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -298,12 +298,32 @@ export class FakeTranscriptReader implements TranscriptReader { */ export class MultiFileTranscriptReader implements TranscriptReader { private readonly buffers = new Map(); + /** Per-path withheld trailing line that only appears after N more reads. */ + private readonly withheldByPath = new Map(); append(path: string, line: string): void { this.buffers.set(path, `${this.buffers.get(path) ?? ""}${line}\n`); } + /** + * Append a line to `path` that only becomes visible after `afterReads` more + * reads of that path. Models a record Codex writes slightly later, so it isn't + * seen by the catch-up that's running now but is by a subsequent one. + */ + withhold(path: string, line: string, afterReads: number): void { + this.withheldByPath.set(path, { line, afterReads }); + } + readFrom(path: string, offset: number): TranscriptReadResult { + const pending = this.withheldByPath.get(path); + if (pending !== undefined) { + if (pending.afterReads <= 0) { + this.append(path, pending.line); + this.withheldByPath.delete(path); + } else { + pending.afterReads -= 1; + } + } const buffer = this.buffers.get(path) ?? ""; const from = offset > buffer.length ? 0 : offset; const slice = buffer.slice(from); From 5f219ae296abb8eb488c87f6f695eddb6f88651d Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 16:16:29 -0600 Subject: [PATCH 10/17] make subagents siblings to spawn tool call --- .../src/agents/codex/event-processor.test.ts | 12 ++--- .../src/agents/codex/event-processor.ts | 54 ++++++++++--------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index df7c001..6add901 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -1430,10 +1430,11 @@ describe("CodexEventProcessor: subagents", () => { expect(mainTurn).toBeDefined(); // The spawn_agent tool span is under the main turn, and the subagent root - // is nested under it (not a sibling of the main turn, not duplicated). + // is a SIBLING of it (both under the main turn) — not nested in the tool. const spawn = mainTurn?.children.find((c) => c.name === "spawn_agent"); expect(spawn?.type).toBe("tool"); - const subRoot = spawn?.children.find((c) => c.name === "subagent: agent-1"); + expect(spawn?.children.some((c) => c.name === "subagent: agent-1")).toBe(false); + const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1"); expect(subRoot).toBeDefined(); expect(subRoot?.type).toBe("task"); // The subagent root carries its own transcript path. @@ -1553,10 +1554,9 @@ describe("CodexEventProcessor: subagents", () => { await processor.flush(); const tree = spansToTree(await trace.drain()); - const spawn = tree?.children - .find((c) => c.name === "turn: main-1") - ?.children.find((c) => c.name === "spawn_agent"); - const subRoot = spawn?.children.find((c) => c.name === "subagent: agent-1"); + // The subagent root is a sibling of the spawn_agent tool (under main-1). + const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); + const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1"); const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); // The subagent root and turn are closed (not hanging), and the post-finalize diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 2afac7d..8dcfd84 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -346,15 +346,17 @@ export class CodexEventProcessor implements EventProcessor { // The main session's scope (its turns hang under the root span). Created on // the first hook carrying the main transcript path. private mainScope: TranscriptScope | null = null; - // call_id -> the spawn_agent tool span, recorded when a spawn_agent tool span - // opens in the main transcript. The spawn_agent PostToolUse hook carries the - // resulting agent_id and the call_id (tool_use_id), letting us map agent_id to - // its spawn tool span (below) even after the tool span has closed. - private readonly spawnToolSpansByCallId = new Map(); - // agent_id -> the spawn_agent tool span that created that subagent. Used to - // parent a subagent's root span under the spawning tool. Resolved from the - // spawn_agent PostToolUse (agent_id + call_id) via spawnToolSpansByCallId. - private readonly spawnToolSpansByAgentId = new Map(); + // call_id -> the turn span that contains a spawn_agent tool call, recorded when + // that tool span opens in the main transcript. The spawn_agent PostToolUse hook + // carries the resulting agent_id and the call_id (tool_use_id), letting us map + // agent_id to that turn span (below). We keep the TURN (not the tool span) so a + // subagent's root can be parented as a SIBLING of the spawn_agent tool — both + // under the same turn — rather than nested inside the tool. + private readonly spawnTurnSpansByCallId = new Map(); + // agent_id -> the turn span under which that subagent should be parented (the + // turn that ran the spawn_agent tool). Resolved from the spawn_agent PostToolUse + // (agent_id + call_id) via spawnTurnSpansByCallId. + private readonly spawnTurnSpansByAgentId = new Map(); // turn_id -> compaction trigger ("manual"/"auto"), recorded from the // PreCompact/PostCompact hooks (the transcript's compacted record lacks the // trigger). A side map so the hook and the transcript's compacted record can @@ -621,24 +623,25 @@ export class CodexEventProcessor implements EventProcessor { if (typeof ar === "string") agentId = ar; } if (agentId === undefined) return; - const span = this.spawnToolSpansByCallId.get(callId); - if (span === undefined) { - this.logger.debug("codex processor: spawn_agent PostToolUse with no known tool span", { + const turnSpan = this.spawnTurnSpansByCallId.get(callId); + if (turnSpan === undefined) { + this.logger.debug("codex processor: spawn_agent PostToolUse with no known turn span", { queueId: this.queueId, callId, agentId, }); return; } - this.spawnToolSpansByAgentId.set(agentId, span); + this.spawnTurnSpansByAgentId.set(agentId, turnSpan); } // A subagent is starting. Register a scope reading the subagent's own // transcript so its turns/llm/tools build a full hierarchy. The subagent root // span is created lazily from the subagent's session_meta (so it gets a real - // start time); we stash the parent here — the spawn_agent tool span that - // created it (matched by agent_id from the spawn_agent PostToolUse), or the - // main root span as a fallback if that mapping isn't known. + // start time); we stash the parent here — the turn span that ran the + // spawn_agent tool (matched by agent_id from the spawn_agent PostToolUse), so + // the subagent root is a SIBLING of that spawn_agent tool span. Falls back to + // the main root span if that mapping isn't known. private handleSubagentStart(event: EnqueueEvent): void { const data = (event.eventData ?? {}) as Record; const agentId = typeof data.agent_id === "string" ? data.agent_id : undefined; @@ -651,7 +654,7 @@ export class CodexEventProcessor implements EventProcessor { } if (this.scopes.has(path)) return; // already registered - const parent = this.spawnToolSpansByAgentId.get(agentId) ?? this.rootSpan; + const parent = this.spawnTurnSpansByAgentId.get(agentId) ?? this.rootSpan; if (parent === null) { this.logger.warn("codex processor: SubagentStart with no parent span; skipping", { queueId: this.queueId, @@ -669,9 +672,10 @@ export class CodexEventProcessor implements EventProcessor { }); } - // Create a subagent's root span from its session_meta record (child of the - // spawn_agent tool span, with the record's timestamp as start time). Called - // once per subagent scope, when its session_meta is first consumed. + // Create a subagent's root span from its session_meta record (sibling of the + // spawn_agent tool span — both under the spawning turn — with the record's + // timestamp as start time). Called once per subagent scope, when its + // session_meta is first consumed. private startSubagentRootSpan(scope: TranscriptScope, record: TranscriptRecord): void { const pending = scope.pendingSubagent; if (pending === undefined || scope.subagentRootSpan !== null) return; @@ -1375,12 +1379,12 @@ export class CodexEventProcessor implements EventProcessor { }, }); scope.openTools.set(callId, { span: toolSpan, turnId }); - // Remember spawn_agent tool spans (main scope only) so a subagent's root - // span can be nested under the tool that launched it. The agent_id is not - // in the transcript; it arrives on the spawn_agent PostToolUse hook, which - // we map to this span by call_id. + // Remember the TURN span that ran a spawn_agent tool (main scope only) so a + // subagent's root span can be parented as a sibling of the tool (both under + // this turn). The agent_id isn't in the transcript; it arrives on the + // spawn_agent PostToolUse hook, which we map to this turn by call_id. if (scope.kind === "main" && name === SPAWN_AGENT_TOOL) { - this.spawnToolSpansByCallId.set(callId, toolSpan); + this.spawnTurnSpansByCallId.set(callId, turnSpan); } this.logger.info("codex processor: opened tool span", { queueId: this.queueId, From 1c0cd899ef476769b375b97105b8381ea31a3664 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 16:52:25 -0600 Subject: [PATCH 11/17] agent reasoning --- .../src/agents/codex/event-processor.test.ts | 79 +++++++++++++++++++ .../src/agents/codex/event-processor.ts | 62 +++++++++++++-- .../src/agents/codex/test-helpers.ts | 13 ++- 3 files changed, 144 insertions(+), 10 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 6add901..3214822 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -903,6 +903,85 @@ describe("CodexEventProcessor: llm spans", () => { ); }); + // A reasoning item with a readable summary is surfaced as a `reasoning` entry + // in the llm output (Braintrust's OpenAI Responses shape), interleaved before + // the assistant message. + test("a reasoning summary is captured as a reasoning item in the llm output", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessageItem("solve it"), + reasoning(["**Inspecting the problem**", "**Deriving the formula**"]), + assistantMessage("The answer is 42."), + tokenCount({ input_tokens: 10, output_tokens: 5, total_tokens: 15 }), + taskComplete({ turn_id: "t1", last_agent_message: "The answer is 42." }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: [ + { + type: "reasoning", + summary: [ + { type: "summary_text", text: "**Inspecting the problem**" }, + { type: "summary_text", text: "**Deriving the formula**" }, + ], + }, + { role: "assistant", content: "The answer is 42." }, + ], + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // A reasoning item with only encrypted content (no readable summary) opens the + // llm span but adds nothing to the output. + test("an encrypted-only reasoning item adds no reasoning to the output", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessageItem("hi"), + reasoning(), // no summary + assistantMessage("Hello."), + tokenCount({ total_tokens: 5 }), + taskComplete({ turn_id: "t1", last_agent_message: "Hello." }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + // Single assistant message only — no reasoning entry. + output: { role: "assistant", content: "Hello." }, + }, + ], + }, + ], + }, + ); + }); + test("llm input is reconstructed from prior conversation messages", async () => { await assertProducesTrace( [ diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 8dcfd84..d60548f 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -153,12 +153,27 @@ interface ChatMessage { tool_call_id?: string; } +/** + * A reasoning ("thinking") item, in the OpenAI Responses shape Braintrust + * renders. `summary` is the readable reasoning text the model exposes (Codex + * records it on `reasoning` transcript items as `summary_text` entries; the full + * chain-of-thought itself is encrypted and not available). Interleaved with chat + * messages in an LLM call's output and in the conversation history. + */ +interface ReasoningItem { + type: "reasoning"; + summary: Array<{ type: "summary_text"; text: string }>; +} + +/** An item in an LLM call's output or the conversation history. */ +type ConversationItem = ChatMessage | ReasoningItem; + /** State for the LLM span currently in progress (one model call). */ interface OpenLlm { span: Span; turnId: string | undefined; - /** Assistant messages produced by this call (text and/or tool calls). */ - output: ChatMessage[]; + /** Items produced by this call: reasoning, assistant text, and/or tool calls. */ + output: ConversationItem[]; /** * When true, the span's output was already set at creation (e.g. a compaction * call, whose output is the replacement history), so closing it must not @@ -192,8 +207,9 @@ interface TranscriptScope { readonly openTurns: Map; /** turn_ids whose Stop has fired but whose task_complete hasn't been seen. */ readonly turnsAwaitingCompletion: Set; - /** OpenAI-style chat history, used to reconstruct each LLM call's input. */ - readonly conversationHistory: ChatMessage[]; + /** OpenAI-style chat history (messages + reasoning), used to reconstruct each + * LLM call's input. */ + readonly conversationHistory: ConversationItem[]; /** The currently-open LLM span in this scope, if a model call is in progress. */ openLlm: OpenLlm | null; /** Tool spans currently open in this scope, keyed by call_id. */ @@ -238,11 +254,33 @@ function tokenMetrics(usage: Record): Record { * assistant message or an array of messages as a chat completion; a single * message is returned directly, multiple are returned as an array. */ -function llmOutput(messages: ChatMessage[]): unknown { +function llmOutput(messages: ConversationItem[]): unknown { if (messages.length === 1) return messages[0]; return messages; } +/** + * Extract the readable reasoning summary from a `reasoning` item's `summary` + * field: a list of `{type: "summary_text", text}` entries. Returns the + * non-empty entries (matching Braintrust's reasoning shape), or undefined when + * there's nothing readable (the full reasoning is encrypted and not exposed). + */ +function reasoningSummary( + summary: unknown, +): Array<{ type: "summary_text"; text: string }> | undefined { + if (!Array.isArray(summary)) return undefined; + const parts: Array<{ type: "summary_text"; text: string }> = []; + for (const entry of summary) { + if (entry && typeof entry === "object") { + const text = (entry as { text?: unknown }).text; + if (typeof text === "string" && text.length > 0) { + parts.push({ type: "summary_text", text }); + } + } + } + return parts.length > 0 ? parts : undefined; +} + /** Extract the concatenated text from a response_item message's content array. */ function messageText(content: unknown): string | undefined { if (!Array.isArray(content)) return undefined; @@ -1244,11 +1282,19 @@ export class CodexEventProcessor implements EventProcessor { scope.conversationHistory.push(msg); } - // Handle a reasoning item: it belongs to the current model call but its raw - // content is encrypted/opaque, so we only use it to ensure the LLM span is - // open. (No readable text to surface here.) + // Handle a reasoning item: it belongs to the current model call. Open the LLM + // span and, when the model exposed a readable reasoning summary, surface it as + // a `reasoning` item in the call's output and in the conversation history (so + // subsequent calls' input shows the prior thinking too). The full reasoning is + // encrypted, so items with no readable summary only open the span. private handleReasoningItem(scope: TranscriptScope, record: TranscriptRecord): void { this.ensureLlmSpan(scope, record); + const payload = (record.payload ?? {}) as Record; + const summary = reasoningSummary(payload.summary); + if (summary === undefined) return; + const item: ReasoningItem = { type: "reasoning", summary }; + scope.openLlm?.output.push(item); + scope.conversationHistory.push(item); } // Close the current LLM call on a token_count (the segment boundary): end the diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index 438c5b4..e85d729 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -207,11 +207,20 @@ export function userMessageItem(text: string): TranscriptEntry { return message("user", text); } -/** reasoning response_item; pass summary strings to surface reasoning text. */ +/** + * reasoning response_item. Pass summary strings to surface readable reasoning + * ("thinking"); they're wrapped in Codex's real `{type:"summary_text", text}` + * shape. With no strings, only `encrypted_content` is present (the common case), + * so nothing readable is surfaced. + */ export function reasoning(summary: string[] = []): TranscriptEntry { return transcript({ type: "response_item", - payload: { type: "reasoning", summary, encrypted_content: "opaque" }, + payload: { + type: "reasoning", + summary: summary.map((text) => ({ type: "summary_text", text })), + encrypted_content: "opaque", + }, }); } From 3c4502f11fde762c76fa153e7ce32902b4880e25 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 17:43:38 -0600 Subject: [PATCH 12/17] session resume --- plugins/trace-codex/AGENTS.md | 69 ++- plugins/trace-codex/README.md | 13 + .../src/agents/codex/event-processor.test.ts | 4 + .../src/agents/codex/event-processor.ts | 471 +++++++++++++++--- .../trace-codex/src/agents/codex/register.ts | 30 +- .../src/agents/codex/resume.test.ts | 378 ++++++++++++++ .../src/agents/codex/snapshot-store.test.ts | 105 ++++ .../src/agents/codex/snapshot-store.ts | 161 ++++++ .../src/agents/codex/state-snapshot.test.ts | 64 +++ .../src/agents/codex/state-snapshot.ts | 144 ++++++ plugins/trace-codex/src/braintrust/logger.ts | 50 ++ plugins/trace-codex/src/test-helpers.ts | 68 ++- 12 files changed, 1477 insertions(+), 80 deletions(-) create mode 100644 plugins/trace-codex/src/agents/codex/resume.test.ts create mode 100644 plugins/trace-codex/src/agents/codex/snapshot-store.test.ts create mode 100644 plugins/trace-codex/src/agents/codex/snapshot-store.ts create mode 100644 plugins/trace-codex/src/agents/codex/state-snapshot.test.ts create mode 100644 plugins/trace-codex/src/agents/codex/state-snapshot.ts diff --git a/plugins/trace-codex/AGENTS.md b/plugins/trace-codex/AGENTS.md index e1d3a84..332223d 100644 --- a/plugins/trace-codex/AGENTS.md +++ b/plugins/trace-codex/AGENTS.md @@ -95,21 +95,72 @@ fire-and-forget: the client POSTs to `/enqueue` and returns immediately, while the background server does the slow work (Braintrust SDK calls, flushes) off the critical path. -We **deliberately block in a few places**, and only where correctness requires -it: - -- On a **terminal event** (`Stop`), the client calls `/flush` and waits for the - server to confirm buffered spans reached Braintrust. Without this, a - short-lived host (e.g. a CI job that ends right after the last turn) would tear - the process tree down before the final spans are delivered. This is the one - routine blocking point, and it's bounded by a timeout (`/flush` gives up rather - than hanging the turn). +The terminal event is the one place where blocking is even an option, and it's +**opt-in**: + +- On a **terminal event** (`Stop`), the client always calls `/flush`, but by + default it fires the flush and returns immediately so the turn isn't stalled. + Setting `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` makes the client instead wait for the + server to confirm buffered spans reached Braintrust. The blocking mode exists + for short-lived hosts (e.g. a CI job that ends right after the last turn) that + would otherwise tear the process tree down before the final spans are + delivered. When enabled, the wait is bounded by a timeout (`/flush` gives up + rather than hanging the turn). In normal interactive use the server stays alive + and flushes off the critical path, so no spans are lost without blocking. When adding behavior, prefer enqueue-and-return. Reach for a blocking request/await only when data would otherwise be lost, keep it off the per-hook hot path where possible, and always bound it so a slow or stuck backend can't stall the agent. +### 4. Sessions can outlive the server (resume) + +The server is short-lived (idle shutdown after ~5 min, or the user closes and +later resumes). A Codex session, though, can span that gap. To avoid dropping +the tail of a trace — or worse, re-emitting it as duplicate/orphaned spans — the +Codex processor **persists its resumable state** and rehydrates it when a new +processor is created for the same session id. + +How it works: + +- The transcript (rollout JSONL) on disk is the durable source of truth for span + *content*. What a restart loses is the processor's in-memory bookkeeping: + transcript byte offsets, which spans are still open (and their identities), + the reconstructed conversation history, and the subagent/compaction side-maps. +- On every `flush()` (idle drain, eviction, terminal Stop, shutdown), the + processor writes a JSON snapshot of that bookkeeping to + `PLUGIN_DATA/state/.json` (`src/agents/codex/snapshot-store.ts`). + Span handles are stored as identities (`span_id`/`root_span_id`/`parents` plus + name/type), captured from the SDK's synchronous span getters. +- On its first event, a processor attempts a one-time restore for its session + id. If a compatible snapshot exists, it recreates each span handle bound to the + original id via `SpanFactory.rehydrateSpan` (Braintrust merges rows by + `span_id`, so further `log()`/`end()` calls continue the same trace). Restore + runs **before** the tracing-enabled master switch, because the snapshot also + carries the session's reporting config (a bare mid-session restart may have no + leading config event). +- The snapshot is **not** deleted when the root span ends. Codex's `Stop` hook is + per-**turn**, not per-session (there is no session-end hook), and we end the + root on the first `Stop` — but later turns still attach as children of that same + root, so a restart *between* turns must be able to resume. The snapshot + therefore persists past root-end; stale ones (sessions that never resume) are + reclaimed by a startup GC sweep that removes snapshots older than a TTL. + +Invariants to preserve when changing this: + +- **Persistence is Codex-owned**, not part of the generic event server: *where* a + plugin may persist state is the host agent's call (Codex gives us + `PLUGIN_DATA`). Keep the store and snapshot shape under `src/agents/codex/`. + The only generic addition is `SpanFactory.rehydrateSpan` (pure SDK behavior). +- **Never persist secrets.** The `apiKey` is stripped from the snapshot's + reporting config and re-resolved from env / the config event on resume (mirrors + the event recorder's redaction). +- **Never throw.** The store swallows and logs all I/O errors; a persistence or + restore failure must not break a turn — it just falls back to starting fresh. +- **Version-gate snapshots.** Each carries `pluginVersion` + a schema version; + a mismatch discards the snapshot. Bump `SNAPSHOT_SCHEMA_VERSION` in + `state-snapshot.ts` whenever the shape changes incompatibly. + ## Making changes - **Generic vs. agent code**: put agent-specific logic in `src/agents//`; diff --git a/plugins/trace-codex/README.md b/plugins/trace-codex/README.md index c7aa39b..2049697 100644 --- a/plugins/trace-codex/README.md +++ b/plugins/trace-codex/README.md @@ -33,6 +33,19 @@ Every setting can be provided as a `config.json` key or as an environment variab | `blockOnStop` | `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` | `false` | When `true`, the hook blocks on each turn's `Stop` until the server confirms all spans are flushed. Use in programmatic/CI runs to guarantee traces are delivered before Codex exits. | | `recordFile` | `BRAINTRUST_EVENT_SERVER_RECORD_FILE` | _(unset)_ | If set, record every event to this NDJSON file (for `replay`). | +### Resuming sessions + +Note that when resuming a session, the original session's options will remain in effect. + +For example: + +```sh +TRACE_TO_BRAINTRUST=true codex # first session enables braintrust +TRACE_TO_BRAINTRUST=false codex resume abcde # the resumed session will still be traced because the original session was +``` + +The trace itself also survives the background server stopping. The server shuts down after the idle window (or on an explicit shutdown), but a session can outlive it — you might leave it open past the timeout, send another message after the server stopped, or `codex resume` later. The plugin snapshots each session's in-progress trace state under `$PLUGIN_DATA/state/` and restores it when the session continues, so later turns keep landing in the same trace instead of starting a new one. Stale snapshots age out automatically, and secrets (your API key) are never written to them. + ### Advanced Options Advanced plugin settings for debugging or developing the plugin diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 3214822..88f8197 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -1118,6 +1118,10 @@ describe("CodexEventProcessor: config / master switch", () => { ({ id: "s", end: () => 0, log: () => {} }) as unknown as ReturnType< SpanFactory["startSpan"] >, + rehydrateSpan: () => + ({ id: "s", end: () => 0, log: () => {} }) as unknown as ReturnType< + SpanFactory["startSpan"] + >, flush: async () => {}, }; const provider = (config?: ReportingConfig) => { diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index d60548f..5f21d41 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -30,11 +30,23 @@ import { type Span, type SpanFactory, type SpanFactoryProvider, + type SpanRef, + spanRef, } from "../../braintrust/logger.ts"; import type { Logger } from "../../log.ts"; import type { EventProcessor } from "../../processor/event-processor.ts"; import type { EnqueueEvent } from "../../server/routes.ts"; +import { PLUGIN_VERSION } from "../../version.ts"; import { CODEX_CONFIG_EVENT } from "./event-builder.ts"; +import type { SnapshotStore } from "./snapshot-store.ts"; +import { + type CodexSnapshot, + type ConversationItemSnapshot, + isCompatibleSnapshot, + redactReportingConfig, + type ScopeSnapshot, + SNAPSHOT_SCHEMA_VERSION, +} from "./state-snapshot.ts"; import { EVT_TASK_COMPLETE, EVT_TASK_STARTED, @@ -362,6 +374,17 @@ function permissionInfo(args: unknown): PermissionInfo | undefined { return info; } +// ConversationItems (chat messages / reasoning) are already plain JSON, so they +// round-trip through a snapshot unchanged. These two helpers just bridge the +// nominal type boundary between the live union and the snapshot's record array +// (via unknown) without copying. +function toItemSnapshots(items: ConversationItem[]): ConversationItemSnapshot[] { + return items as unknown as ConversationItemSnapshot[]; +} +function fromItemSnapshots(items: ConversationItemSnapshot[]): ConversationItem[] { + return items as unknown as ConversationItem[]; +} + export class CodexEventProcessor implements EventProcessor { private readonly logger: Logger; private readonly queueId: string | null; @@ -405,17 +428,33 @@ export class CodexEventProcessor implements EventProcessor { // turn_id -> the compaction span built from the transcript, kept so a later // PreCompact/PostCompact hook can back-fill the trigger onto it. private readonly compactionSpansByTurn = new Map(); + // span_id -> the name/type the span was created (or renamed) with. The SDK + // doesn't expose these as getters, but a rehydrated span must re-assert them + // (else the SDK infers a wrong name from the call stack on resume), so we + // remember them here and feed them into each SpanRef at serialize time. + private readonly spanAttrs = new Map(); + // Optional per-session state store. When present, the processor persists a + // snapshot of its resumable state on flush and, on its first event, rehydrates + // any snapshot left by a previous server run for this session (so a session + // that outlived the server — idle shutdown or explicit close+resume — keeps + // building the same trace instead of starting over). Null disables resume. + private readonly snapshotStore: SnapshotStore | null; + // Whether we've already attempted a one-time restore from the store. Restore + // runs lazily on the first non-config event, once the span factory exists. + private restoreAttempted = false; constructor( queueId: string | null, logger: Logger, spanFactoryProvider: SpanFactoryProvider = defaultSpanFactoryProvider, transcriptReader: TranscriptReader = defaultTranscriptReader, + snapshotStore: SnapshotStore | null = null, ) { this.queueId = queueId; this.logger = logger; this.spanFactoryProvider = spanFactoryProvider; this.transcriptReader = transcriptReader; + this.snapshotStore = snapshotStore; } /** The session's SpanFactory, built from the config event on first use. */ @@ -441,6 +480,17 @@ export class CodexEventProcessor implements EventProcessor { this.configure(event); return; } + + // First real event for this session: if a previous server run left a + // snapshot (idle shutdown mid-session, or an explicit close+resume), rebuild + // our state from it before processing. Runs once, regardless of which hook + // arrives first — the very next hook after a restart may not be SessionStart, + // and may not be preceded by a config event. Restore brings back the + // session's reporting config, so it MUST run before the master-switch check + // below (otherwise a resumed session with no leading config event would look + // untraced and we'd drop its events). + this.ensureRestored(); + // Master switch: when tracing is disabled, drop everything (no SDK calls). if (!this.tracingEnabled) { this.logger.debug("codex processor: tracing disabled; dropping event", { @@ -719,19 +769,24 @@ export class CodexEventProcessor implements EventProcessor { if (pending === undefined || scope.subagentRootSpan !== null) return; const startTime = isoToUnixSeconds(record.timestamp); try { - scope.subagentRootSpan = pending.parent.startSpan({ - name: `subagent: ${pending.agentId}`, - type: "task", - ...(startTime !== undefined ? { startTime } : {}), - event: { - metadata: { - agent_id: pending.agentId, - agent_type: pending.agentType, - // The subagent's own rollout transcript on disk. - transcript_path: scope.path, + const subagentName = `subagent: ${pending.agentId}`; + scope.subagentRootSpan = this.trackSpan( + pending.parent.startSpan({ + name: subagentName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { + metadata: { + agent_id: pending.agentId, + agent_type: pending.agentType, + // The subagent's own rollout transcript on disk. + transcript_path: scope.path, + }, }, - }, - }); + }), + subagentName, + "task", + ); this.logger.info("codex processor: opened subagent root span", { queueId: this.queueId, agentId: pending.agentId, @@ -914,7 +969,10 @@ export class CodexEventProcessor implements EventProcessor { if (this.rootSpan !== null) { try { this.rootSpan.log({ - metadata: { source: this.rootEnrichment.source, permission_mode: permissionMode }, + metadata: { + source: this.rootEnrichment.source, + permission_mode: this.rootEnrichment.permissionMode, + }, }); } catch (err) { this.logger.error("codex processor: failed to enrich root span", { @@ -945,27 +1003,32 @@ export class CodexEventProcessor implements EventProcessor { const spanName = projectDir ? `codex: ${projectDir}` : "codex session"; try { - this.rootSpan = this.spanFactory.startSpan({ - name: spanName, - type: "task", - ...(startTime !== undefined ? { startTime } : {}), - event: { - input: { model: this.mainScope?.model, cwd, source: this.rootEnrichment.source }, - metadata: { - // User-provided extras first, so the standard keys below win. - ...this.reportingConfig?.additionalMetadata, - session_id: sessionId, - model: this.mainScope?.model, - cwd, - source: this.rootEnrichment.source, - permission_mode: this.rootEnrichment.permissionMode, - cli_version: typeof payload.cli_version === "string" ? payload.cli_version : undefined, - project: this.reportingConfig?.project, - // The session's rollout transcript on disk (the source for this trace). - transcript_path: this.mainScope?.path, + this.rootSpan = this.trackSpan( + this.spanFactory.startSpan({ + name: spanName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input: { model: this.mainScope?.model, cwd, source: this.rootEnrichment.source }, + metadata: { + // User-provided extras first, so the standard keys below win. + ...this.reportingConfig?.additionalMetadata, + session_id: sessionId, + model: this.mainScope?.model, + cwd, + source: this.rootEnrichment.source, + permission_mode: this.rootEnrichment.permissionMode, + cli_version: + typeof payload.cli_version === "string" ? payload.cli_version : undefined, + project: this.reportingConfig?.project, + // The session's rollout transcript on disk (the source for this trace). + transcript_path: this.mainScope?.path, + }, }, - }, - }); + }), + spanName, + "task", + ); this.logger.info("codex processor: opened root span", { queueId: this.queueId, spanId: this.rootSpan.id, @@ -1023,13 +1086,18 @@ export class CodexEventProcessor implements EventProcessor { return; } const startTime = isoToUnixSeconds(record.timestamp); + const turnName = `turn: ${turnId}`; try { - const turnSpan = parent.startSpan({ - name: `turn: ${turnId}`, - type: "task", - ...(startTime !== undefined ? { startTime } : {}), - event: { metadata: { turn_id: turnId, model: scope.model } }, - }); + const turnSpan = this.trackSpan( + parent.startSpan({ + name: turnName, + type: "task", + ...(startTime !== undefined ? { startTime } : {}), + event: { metadata: { turn_id: turnId, model: scope.model } }, + }), + turnName, + "task", + ); scope.openTurns.set(turnId, turnSpan); this.logger.info("codex processor: opened turn span", { queueId: this.queueId, @@ -1072,6 +1140,9 @@ export class CodexEventProcessor implements EventProcessor { this.compactionSpansByTurn.set(turnId, turnSpan); try { turnSpan.setAttributes({ name: "compaction", type: "task" }); + // Remember the rename so a resumed snapshot re-asserts "compaction", not + // the original "turn: ..." name. + this.spanAttrs.set(turnSpan.spanId, { name: "compaction", type: "task" }); turnSpan.log({ metadata: { compaction: { @@ -1101,17 +1172,22 @@ export class CodexEventProcessor implements EventProcessor { const startTime = isoToUnixSeconds(record.timestamp); const before = scope.conversationHistory.map((item) => ({ ...item })); const output = compactionOutput(replacement); + const compactionLlmName = scope.model ?? "compaction"; try { - const span = turnSpan.startSpan({ - name: scope.model ?? "compaction", - type: "llm", - ...(startTime !== undefined ? { startTime } : {}), - event: { - input: { messages_before_compaction: before.length, history: before }, - output, - metadata: { model: scope.model, turn_id: turnId, compaction: true }, - }, - }); + const span = this.trackSpan( + turnSpan.startSpan({ + name: compactionLlmName, + type: "llm", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input: { messages_before_compaction: before.length, history: before }, + output, + metadata: { model: scope.model, turn_id: turnId, compaction: true }, + }, + }), + compactionLlmName, + "llm", + ); // Reuse the open-llm slot so the turn's token_count closes it with usage. // outputPreset: the replacement history is already logged as the output, so // the closing token_count must not overwrite it. @@ -1241,13 +1317,18 @@ export class CodexEventProcessor implements EventProcessor { // Snapshot the conversation as this call's input (the transcript doesn't log // the real request, so this is a best-effort reconstruction). const input = scope.conversationHistory.map((item) => ({ ...item })); + const llmName = scope.model ?? "llm"; try { - const span = turnSpan.startSpan({ - name: scope.model ?? "llm", - type: "llm", - ...(startTime !== undefined ? { startTime } : {}), - event: { input, metadata: { model: scope.model, turn_id: turnId } }, - }); + const span = this.trackSpan( + turnSpan.startSpan({ + name: llmName, + type: "llm", + ...(startTime !== undefined ? { startTime } : {}), + event: { input, metadata: { model: scope.model, turn_id: turnId } }, + }), + llmName, + "llm", + ); scope.openLlm = { span, turnId, output: [] }; this.logger.info("codex processor: opened llm span", { queueId: this.queueId, @@ -1413,17 +1494,22 @@ export class CodexEventProcessor implements EventProcessor { // inline in the arguments), surface it on the span as metadata and a tag. const permission = permissionInfo(input); const startTime = isoToUnixSeconds(record.timestamp); + const toolName = name ?? "tool"; try { - const toolSpan = turnSpan.startSpan({ - name: name ?? "tool", - type: "tool", - ...(startTime !== undefined ? { startTime } : {}), - event: { - input, - metadata: { tool_name: name, call_id: callId, turn_id: turnId, permission }, - ...(permission !== undefined ? { tags: [PERMISSION_TAG] } : {}), - }, - }); + const toolSpan = this.trackSpan( + turnSpan.startSpan({ + name: toolName, + type: "tool", + ...(startTime !== undefined ? { startTime } : {}), + event: { + input, + metadata: { tool_name: name, call_id: callId, turn_id: turnId, permission }, + ...(permission !== undefined ? { tags: [PERMISSION_TAG] } : {}), + }, + }), + toolName, + "tool", + ); scope.openTools.set(callId, { span: toolSpan, turnId }); // Remember the TURN span that ran a spawn_agent tool (main scope only) so a // subagent's root span can be parented as a sibling of the tool (both under @@ -1597,6 +1683,11 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } + // NOTE: we deliberately do NOT delete the snapshot here. Ending the root span + // means "this turn's Stop fired", not "the session is over" — Codex has no + // session-end hook, and later turns attach as children of this same root. The + // snapshot must persist so a restart between turns can resume the session; + // stale ones are reclaimed by the store's age-based GC. } async flush(): Promise { @@ -1618,6 +1709,256 @@ export class CodexEventProcessor implements EventProcessor { error: String(err), }); } + // Persist state so a future server run can resume this session's trace. Done + // after the catch-up + flush above so the snapshot reflects the latest + // offsets and closed spans. + this.persistState(); + } + + // ========================================================================== + // State persistence (resume support) + // ========================================================================== + + // Persist this session's snapshot so a future server run can resume the trace. + // Best-effort: the store never throws. No-op without a store/session id, or + // before any root span exists (nothing worth resuming yet). + // + // We persist even once the root span has ended. Codex's "Stop" is per-TURN, + // not per-session (there is no session-end hook), and we end the root on the + // first Stop — but the session lives on and more turns can follow, each a + // child of that same (ended) root. So the snapshot must survive past root-end, + // or a server restart between turns would lose the thread and start a brand-new + // trace. Stale snapshots from sessions that truly never resume are reclaimed by + // the store's age-based GC, not by deleting here. + private persistState(): void { + if (this.snapshotStore === null || this.queueId === null) return; + if (this.rootSpan === null) return; // nothing worth resuming yet + try { + this.snapshotStore.write(this.queueId, this.serialize()); + } catch (err) { + this.logger.error("codex processor: failed to serialize state", { + queueId: this.queueId, + error: String(err), + }); + } + } + + // One-time restore from a snapshot left by a previous server run. Runs lazily + // on the first event (after reporting is configured, so the span factory and + // tracing-enabled check are settled). Guarded so it happens at most once. + private ensureRestored(): void { + if (this.restoreAttempted) return; + this.restoreAttempted = true; + if (this.snapshotStore === null || this.queueId === null) return; + // If we've already built a root span this run, there's nothing to restore + // (we're the original processor, not a post-restart one). + if (this.rootSpan !== null) return; + const snapshot = this.snapshotStore.read(this.queueId); + if (snapshot === null) return; + if (!isCompatibleSnapshot(snapshot, PLUGIN_VERSION)) { + this.logger.info("codex processor: snapshot version mismatch; starting fresh", { + queueId: this.queueId, + snapshotVersion: snapshot.pluginVersion, + snapshotSchema: snapshot.schemaVersion, + }); + this.snapshotStore.delete(this.queueId); + return; + } + try { + this.restore(snapshot); + this.logger.info("codex processor: restored session state from snapshot", { + queueId: this.queueId, + scopes: snapshot.scopes.length, + }); + } catch (err) { + this.logger.error("codex processor: failed to restore snapshot; starting fresh", { + queueId: this.queueId, + error: String(err), + }); + } + } + + // Remember the name/type a span was created (or renamed) with, so it can be + // re-asserted on rehydration. Returns the span unchanged for call-site + // convenience. + private trackSpan(span: Span, name: string, type: SpanRef["type"]): Span { + this.spanAttrs.set(span.spanId, { name, type }); + return span; + } + + // Capture a span's identity plus its remembered name/type for the snapshot. + private refFor(span: Span): SpanRef { + const attrs = this.spanAttrs.get(span.spanId); + return spanRef(span, attrs?.name, attrs?.type); + } + + // Build a serializable snapshot of all resumable state. Span handles are + // captured as identities (spanId/rootSpanId/parents) plus their name/type; the + // transcript on disk remains the source of truth for content, so nothing here + // re-derives spans. + private serialize(): CodexSnapshot { + const scopes: ScopeSnapshot[] = []; + for (const scope of this.scopes.values()) { + scopes.push({ + path: scope.path, + kind: scope.kind, + offset: scope.offset, + openTurns: Array.from(scope.openTurns, ([turnId, span]) => ({ + turnId, + span: this.refFor(span), + })), + turnsAwaitingCompletion: Array.from(scope.turnsAwaitingCompletion), + conversationHistory: toItemSnapshots(scope.conversationHistory), + openLlm: + scope.openLlm === null + ? null + : { + span: this.refFor(scope.openLlm.span), + turnId: scope.openLlm.turnId, + output: toItemSnapshots(scope.openLlm.output), + outputPreset: scope.openLlm.outputPreset, + }, + openTools: Array.from(scope.openTools, ([callId, entry]) => ({ + callId, + span: this.refFor(entry.span), + turnId: entry.turnId, + })), + model: scope.model, + lastTurnEndTime: scope.lastTurnEndTime, + subagentRootSpan: + scope.subagentRootSpan === null ? null : this.refFor(scope.subagentRootSpan), + subagentEnded: scope.subagentEnded, + ...(scope.pendingSubagent !== undefined + ? { + pendingSubagent: { + agentId: scope.pendingSubagent.agentId, + agentType: scope.pendingSubagent.agentType, + parent: this.refFor(scope.pendingSubagent.parent), + }, + } + : {}), + }); + } + return { + pluginVersion: PLUGIN_VERSION, + schemaVersion: SNAPSHOT_SCHEMA_VERSION, + sessionId: this.queueId, + savedAt: Date.now(), + reportingConfig: redactReportingConfig(this.reportingConfig), + rootSpan: this.rootSpan === null ? null : this.refFor(this.rootSpan), + rootEnded: this.rootEnded, + rootEnrichment: { ...this.rootEnrichment }, + mainScopePath: this.mainScope?.path ?? null, + scopes, + spawnTurnSpansByCallId: Array.from(this.spawnTurnSpansByCallId, ([key, span]) => ({ + key, + span: this.refFor(span), + })), + spawnTurnSpansByAgentId: Array.from(this.spawnTurnSpansByAgentId, ([key, span]) => ({ + key, + span: this.refFor(span), + })), + compactionTriggerByTurn: Array.from(this.compactionTriggerByTurn, ([key, value]) => ({ + key, + value, + })), + compactionSpansByTurn: Array.from(this.compactionSpansByTurn, ([key, span]) => ({ + key, + span: this.refFor(span), + })), + }; + } + + // Rebuild live state from a snapshot. Span handles are recreated bound to their + // original ids (so further log()/end() merge into the same rows). Each scope's + // lazy parent-span resolver is reconstructed the same way newScope/ + // handleSubagentStart do, so subsequent transcript records attach correctly. + private restore(snapshot: CodexSnapshot): void { + // Reporting config first: it determines the span factory (project/creds). + // The snapshot is the source of truth — we adopt it as-is rather than + // merging in this run's config, so a resume whose env/settings drifted from + // the original session can't clobber the snapshot's values. The one field + // the snapshot can't carry is the `apiKey` secret (intentionally never + // persisted), so we re-attach the live one resolved from this run's config + // event / env. This must happen before we touch `this.spanFactory`, which is + // built lazily from `this.reportingConfig`. + if (snapshot.reportingConfig !== undefined) { + this.reportingConfig = { + ...snapshot.reportingConfig, + apiKey: this.reportingConfig?.apiKey, + }; + } + + const factory = this.spanFactory; + // Rehydrate a handle and re-remember its name/type, so if this resumed + // processor is itself snapshotted again, the attributes survive the next hop. + const rehydrate = (ref: SpanRef): Span => { + const span = factory.rehydrateSpan(ref); + if (ref.name !== undefined && ref.type !== undefined) { + this.spanAttrs.set(span.spanId, { name: ref.name, type: ref.type }); + } + return span; + }; + + this.rootSpan = snapshot.rootSpan === null ? null : rehydrate(snapshot.rootSpan); + this.rootEnded = snapshot.rootEnded; + this.rootEnrichment = { ...snapshot.rootEnrichment }; + + for (const s of snapshot.scopes) { + const isMain = s.path === snapshot.mainScopePath; + // The lazy parent resolver mirrors live construction: a main scope's turns + // hang under the root span; a subagent scope's under its own (restored) + // subagent root span. + const scope: TranscriptScope = { + path: s.path, + kind: s.kind, + parentSpan: isMain ? () => this.rootSpan : () => scope.subagentRootSpan, + offset: s.offset, + openTurns: new Map(s.openTurns.map((t) => [t.turnId, rehydrate(t.span)])), + turnsAwaitingCompletion: new Set(s.turnsAwaitingCompletion), + conversationHistory: fromItemSnapshots(s.conversationHistory), + openLlm: + s.openLlm === null + ? null + : { + span: rehydrate(s.openLlm.span), + turnId: s.openLlm.turnId, + output: fromItemSnapshots(s.openLlm.output), + outputPreset: s.openLlm.outputPreset, + }, + openTools: new Map( + s.openTools.map((t) => [t.callId, { span: rehydrate(t.span), turnId: t.turnId }]), + ), + model: s.model, + lastTurnEndTime: s.lastTurnEndTime, + subagentRootSpan: s.subagentRootSpan === null ? null : rehydrate(s.subagentRootSpan), + subagentEnded: s.subagentEnded, + ...(s.pendingSubagent !== undefined + ? { + pendingSubagent: { + agentId: s.pendingSubagent.agentId, + agentType: s.pendingSubagent.agentType, + parent: rehydrate(s.pendingSubagent.parent), + }, + } + : {}), + }; + this.scopes.set(s.path, scope); + if (isMain) this.mainScope = scope; + } + + for (const e of snapshot.spawnTurnSpansByCallId) { + this.spawnTurnSpansByCallId.set(e.key, rehydrate(e.span)); + } + for (const e of snapshot.spawnTurnSpansByAgentId) { + this.spawnTurnSpansByAgentId.set(e.key, rehydrate(e.span)); + } + for (const e of snapshot.compactionTriggerByTurn) { + this.compactionTriggerByTurn.set(e.key, e.value); + } + for (const e of snapshot.compactionSpansByTurn) { + this.compactionSpansByTurn.set(e.key, rehydrate(e.span)); + } } // Read and process new transcript records for every scope, retrying on a diff --git a/plugins/trace-codex/src/agents/codex/register.ts b/plugins/trace-codex/src/agents/codex/register.ts index a905601..980ed15 100644 --- a/plugins/trace-codex/src/agents/codex/register.ts +++ b/plugins/trace-codex/src/agents/codex/register.ts @@ -3,10 +3,12 @@ // another agent means adding a sibling module that exports the same shape. import type { EventBuilder } from "../../client/client.ts"; +import type { Logger } from "../../log.ts"; import type { EventProcessorFactory } from "../../processor/event-processor.ts"; import { buildEnqueueEvents, CODEX_EVENT_SOURCE } from "./event-builder.ts"; import { CodexEventProcessor } from "./event-processor.ts"; import { applySettingsToEnv, loadSettingsFile, settingsFilePath } from "./settings.ts"; +import { SnapshotStore } from "./snapshot-store.ts"; export interface Agent { /** Event source string this agent's events carry (matches the registry key). */ @@ -17,9 +19,11 @@ export interface Agent { buildEvents: EventBuilder; /** * Event names that terminate a turn/session. After enqueuing one of these, - * the hook asks the server to flush synchronously so the final spans are - * delivered before the process tree is torn down (e.g. in CI). Codex only - * exposes a per-turn "Stop" hook today; there is no session-end event. + * the hook asks the server to flush. By default the flush is fire-and-forget + * so the turn isn't stalled; setting BRAINTRUST_PLUGIN_BLOCK_ON_STOP makes the + * hook block until the server confirms the final spans are delivered before + * the process tree is torn down (e.g. in CI). Codex only exposes a per-turn + * "Stop" hook today; there is no session-end event. */ terminalEvents: string[]; /** @@ -31,10 +35,28 @@ export interface Agent { loadSettings: () => string[]; } +// One shared snapshot store for all Codex sessions, created on first use. It +// persists each session's resumable state under PLUGIN_DATA/state so a restarted +// server can rehydrate in-progress traces. Created lazily (reusing the first +// session's logger) so a one-time GC of stale orphan snapshots runs the first +// time we actually trace, and so this module stays side-effect-free on import. +let sharedSnapshotStore: SnapshotStore | null = null; +function snapshotStore(logger: Logger): SnapshotStore { + if (sharedSnapshotStore === null) { + sharedSnapshotStore = new SnapshotStore(logger); + // Reclaim stale snapshots. Snapshots intentionally outlive their root span + // (later turns resume onto the same root), so they are never self-deleted on + // root-end; the only cleanup is this age-based sweep of ones that never + // resumed. + sharedSnapshotStore.gcOlderThan(); + } + return sharedSnapshotStore; +} + export const codexAgent: Agent = { eventSource: CODEX_EVENT_SOURCE, createProcessor: (queueId, logger, spanFactoryProvider) => - new CodexEventProcessor(queueId, logger, spanFactoryProvider), + new CodexEventProcessor(queueId, logger, spanFactoryProvider, undefined, snapshotStore(logger)), buildEvents: buildEnqueueEvents, terminalEvents: ["Stop"], loadSettings: () => applySettingsToEnv(loadSettingsFile(settingsFilePath())), diff --git a/plugins/trace-codex/src/agents/codex/resume.test.ts b/plugins/trace-codex/src/agents/codex/resume.test.ts new file mode 100644 index 0000000..c618a7e --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/resume.test.ts @@ -0,0 +1,378 @@ +// Resume / state-persistence tests. +// +// When the event server shuts down mid-session (idle timeout) or the user closes +// and later resumes, a fresh CodexEventProcessor is created for the same session +// id. These tests verify that, given a persisted snapshot, the new processor +// rehydrates the original spans and continues the SAME trace — producing the +// same result as if one processor had handled the whole session — instead of +// duplicating or orphaning spans. + +import { describe, expect, test } from "bun:test"; +import type { SpanFactory } from "../../braintrust/logger.ts"; +import { + type CapturedSpan, + createTestLogger, + diffSpan, + type ExpectedSpan, + spansToTree, + withCapturedTrace, +} from "../../test-helpers.ts"; +import { CodexEventProcessor } from "./event-processor.ts"; +import { SnapshotStore } from "./snapshot-store.ts"; +import type { CodexSnapshot } from "./state-snapshot.ts"; +import { + configEvent, + FakeTranscriptReader, + functionCall, + functionCallOutput, + preToolUse, + sessionMeta, + sessionStart, + stop, + type TraceEntry, + taskComplete, + taskStarted, + type transcript, + turnContext, + userMessage, +} from "./test-helpers.ts"; + +/** An in-memory snapshot store, so tests don't touch disk. Mirrors the disk + * store's surface (the parts the processor uses). */ +class MemorySnapshotStore extends SnapshotStore { + private readonly mem = new Map(); + + constructor() { + super(createTestLogger(), { dir: "/dev/null/unused" }); + } + + override read(sessionId: string): CodexSnapshot | null { + const snap = this.mem.get(sessionId); + // Deep-clone so callers can't mutate stored state (matches disk JSON). + return snap ? (JSON.parse(JSON.stringify(snap)) as CodexSnapshot) : null; + } + + override write(sessionId: string, snapshot: CodexSnapshot): void { + this.mem.set(sessionId, JSON.parse(JSON.stringify(snapshot)) as CodexSnapshot); + } + + override delete(sessionId: string): void { + this.mem.delete(sessionId); + } + + has(sessionId: string): boolean { + return this.mem.has(sessionId); + } +} + +const isTranscriptEntry = (e: TraceEntry): e is ReturnType => + (e as { kind?: string }).kind === "transcript"; + +/** + * Run a list of entries (hooks + transcript writes) through a processor, then + * flush. Shares the reader, factory, and store across calls so a second + * processor can resume the first's session. Returns nothing; spans accumulate in + * the shared captured trace. + */ +async function runThrough( + entries: TraceEntry[], + deps: { + queueId: string; + reader: FakeTranscriptReader; + factory: SpanFactory; + store: SnapshotStore; + }, +): Promise { + const processor = new CodexEventProcessor( + deps.queueId, + createTestLogger(), + () => deps.factory, + deps.reader, + deps.store, + ); + for (const entry of entries) { + if (isTranscriptEntry(entry)) { + deps.reader.append(JSON.stringify(entry.record)); + } else { + await processor.process(entry); + } + } + await processor.flush(); +} + +/** Assert a captured trace matches an expected tree. */ +function expectTrace(spans: CapturedSpan[], expected: ExpectedSpan): void { + const tree = spansToTree(spans); + const diffs = diffSpan(tree, expected, "root"); + if (diffs.length > 0) { + throw new Error(`trace does not match expected:\n${diffs.join("\n")}`); + } +} + +describe("CodexEventProcessor: resume from snapshot", () => { + // The realistic mid-session restart: the server idle-shuts-down DURING a turn + // (before its Stop), then the next hook arrives on a new processor. The + // resumed processor must rehydrate the open root + turn and finish them, + // yielding one trace with the turn closed exactly once (no duplicates). + test("an idle restart mid-turn rehydrates open spans and finishes them", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-midturn"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: open the session and a turn, but NO Stop yet. The SessionStart + // and a PreToolUse hook drive transcript catch-up (in production every + // hook does). An idle flush then persists the open state. The config + // carries an apiKey to prove it's never written to the snapshot. + await runThrough( + [ + configEvent({ session_id: queueId, apiKey: "sk-secret-do-not-persist" }), + sessionMeta({ id: queueId, cwd: "/tmp/proj" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "hello" }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + // Mid-session: the root and turn are open, so a snapshot exists. + expect(store.has(queueId)).toBe(true); + const snap = store.read(queueId); + expect(snap?.rootEnded).toBe(false); + expect(snap?.scopes[0]?.openTurns.length).toBe(1); + // The persisted snapshot must never contain the apiKey (secret). + expect(JSON.stringify(snap)).not.toContain("apiKey"); + + // Run 2: a NEW processor (server restarted). The first event isn't a + // SessionStart — it's the Stop closing the turn. The resumed processor + // must restore the open root+turn and finish them. + await runThrough( + [ + // No config event this run: the resumed processor uses the snapshot's + // reporting config. (In production a resume DOES re-send config, but + // restore must work without it too.) + taskComplete({ turn_id: "turn-1", last_agent_message: "all done" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + // The resumed Stop ended the root, but the snapshot is kept (more turns + // could still follow — Codex's Stop is per-turn, not per-session). It's + // updated to reflect the now-ended root. + expect(store.has(queueId)).toBe(true); + expect(store.read(queueId)?.rootEnded).toBe(true); + + const spans = await trace.drain(); + // Exactly one root, one turn — no duplicates from the restart. + expectTrace(spans, { + span_attributes: { name: "codex: proj", type: "task" }, + metadata: { session_id: queueId }, + ended: true, + children: [ + { + span_attributes: { name: "turn: turn-1", type: "task" }, + input: "hello", + output: "all done", + ended: true, + children: [], + }, + ], + }); + } finally { + trace.cleanup(); + } + }); + + // The /shutdown-between-turns case: turn 1 completes (its Stop fires, which + // ends the root), the server is shut down, then the user sends another message. + // The root ending on the first Stop must NOT discard the snapshot — turn 2 has + // to resume under the SAME root, not start a brand-new trace. + test("a turn sent after a between-turns shutdown resumes under the same root", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-multiturn"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a full first turn, including its Stop (which ends the root span). + // Then the server shuts down (we stop using processor A after its flush). + await runThrough( + [ + configEvent({ session_id: queueId }), + sessionMeta({ id: queueId, cwd: "/tmp/proj" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "first" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "did first" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + // The root ended on turn 1's Stop — but the snapshot must SURVIVE, since + // more turns can still come (Codex's Stop is per-turn, not per-session). + expect(store.has(queueId)).toBe(true); + expect(store.read(queueId)?.rootEnded).toBe(true); + + // Run 2: a NEW processor (server restarted after /shutdown). A second turn + // arrives. It must attach under the SAME restored root, not a new one. + await runThrough( + [ + taskStarted({ turn_id: "turn-2" }), + userMessage({ message: "second" }), + taskComplete({ turn_id: "turn-2", last_agent_message: "did second" }), + stop({ turn_id: "turn-2" }), + ], + deps, + ); + + const spans = await trace.drain(); + // One root, two turns under it — a single continuous trace. + expectTrace(spans, { + span_attributes: { name: "codex: proj", type: "task" }, + metadata: { session_id: queueId }, + children: [ + { + span_attributes: { name: "turn: turn-1", type: "task" }, + output: "did first", + ended: true, + children: [], + }, + { + span_attributes: { name: "turn: turn-2", type: "task" }, + input: "second", + output: "did second", + ended: true, + children: [], + }, + ], + }); + } finally { + trace.cleanup(); + } + }); + + // A deeper hierarchy (turn -> tool) open across the restart must rehydrate with + // the correct span names (not SDK-inferred ones) and close cleanly. + test("an open tool span survives a restart and closes with its original name", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-tool"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a turn opens and calls a tool (function_call), but the tool's + // output hasn't arrived yet when the server idle-shuts-down. + await runThrough( + [ + configEvent({ session_id: queueId }), + sessionMeta({ id: queueId, cwd: "/tmp/proj" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "run a tool" }), + functionCall({ turn_id: "turn-1", call_id: "call-1", name: "shell" }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + const snap = store.read(queueId); + expect(snap?.scopes[0]?.openTools.length).toBe(1); + expect(snap?.scopes[0]?.openTools[0]?.span.name).toBe("shell"); + + // Run 2: a fresh processor. The tool output and turn completion arrive, then + // the Stop. The rehydrated tool span must close under the rehydrated turn. + await runThrough( + [ + functionCallOutput({ call_id: "call-1", output: "ok" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "tool done" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const spans = await trace.drain(); + expectTrace(spans, { + span_attributes: { name: "codex: proj", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: turn-1", type: "task" }, + output: "tool done", + ended: true, + children: [ + { + span_attributes: { name: "shell", type: "tool" }, + output: "ok", + ended: true, + children: [], + }, + ], + }, + ], + }); + } finally { + trace.cleanup(); + } + }); + + test("a version-incompatible snapshot is discarded and the session starts fresh", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-badver"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + + // Plant a snapshot with an incompatible plugin version. + store.write(queueId, { + pluginVersion: "0.0.0-incompatible", + schemaVersion: 1, + sessionId: queueId, + savedAt: Date.now(), + reportingConfig: { traceToBraintrust: true }, + rootSpan: { spanId: "stale", rootSpanId: "stale", spanParents: [] }, + rootEnded: false, + rootEnrichment: {}, + mainScopePath: "/old.jsonl", + scopes: [], + spawnTurnSpansByCallId: [], + spawnTurnSpansByAgentId: [], + compactionTriggerByTurn: [], + compactionSpansByTurn: [], + }); + + const deps = { queueId, reader, factory: trace.spanFactory, store }; + await runThrough( + [ + configEvent({ session_id: queueId }), + sessionMeta({ id: queueId, cwd: "/tmp/fresh" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "turn-1" }), + userMessage({ message: "hi" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "ok" }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const spans = await trace.drain(); + // A fresh trace was built (the stale snapshot's "stale" span id is absent), + // and the incompatible snapshot was removed. + expect(spans.some((s) => s.span_id === "stale")).toBe(false); + expectTrace(spans, { + span_attributes: { name: "codex: fresh", type: "task" }, + children: [{ span_attributes: { name: "turn: turn-1" }, output: "ok", children: [] }], + }); + } finally { + trace.cleanup(); + } + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts new file mode 100644 index 0000000..5eb1b34 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts @@ -0,0 +1,105 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdtempSync, rmSync, utimesSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createTestLogger } from "../../test-helpers.ts"; +import { SnapshotStore } from "./snapshot-store.ts"; +import type { CodexSnapshot } from "./state-snapshot.ts"; + +function makeSnapshot(sessionId: string): CodexSnapshot { + return { + pluginVersion: "9.9.9", + schemaVersion: 1, + sessionId, + savedAt: Date.now(), + reportingConfig: { project: "p", traceToBraintrust: true }, + rootSpan: { spanId: "r", rootSpanId: "r", spanParents: [] }, + rootEnded: false, + rootEnrichment: { source: "startup" }, + mainScopePath: "/t.jsonl", + scopes: [], + spawnTurnSpansByCallId: [], + spawnTurnSpansByAgentId: [], + compactionTriggerByTurn: [], + compactionSpansByTurn: [], + }; +} + +describe("SnapshotStore", () => { + let dir: string; + let store: SnapshotStore; + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), "snap-store-")); + store = new SnapshotStore(createTestLogger(), { dir }); + }); + + afterEach(() => { + rmSync(dir, { recursive: true, force: true }); + }); + + test("read returns null when no snapshot exists", () => { + expect(store.read("missing")).toBeNull(); + }); + + test("write then read round-trips a snapshot", () => { + const snap = makeSnapshot("sess-1"); + store.write("sess-1", snap); + const loaded = store.read("sess-1"); + expect(loaded).toEqual(snap); + }); + + test("write is isolated per session id", () => { + store.write("a", makeSnapshot("a")); + store.write("b", makeSnapshot("b")); + expect(store.read("a")?.sessionId).toBe("a"); + expect(store.read("b")?.sessionId).toBe("b"); + }); + + test("delete removes a snapshot", () => { + store.write("sess-1", makeSnapshot("sess-1")); + store.delete("sess-1"); + expect(store.read("sess-1")).toBeNull(); + }); + + test("delete of a missing snapshot does not throw", () => { + expect(() => store.delete("nope")).not.toThrow(); + }); + + test("read of a malformed snapshot file returns null (never throws)", () => { + store.write("bad", makeSnapshot("bad")); + // Corrupt the file on disk. + writeFileSync(join(dir, "state", "bad.json"), "{ not json"); + expect(store.read("bad")).toBeNull(); + }); + + test("a session id with path separators can't escape the state dir", () => { + const evil = "../../escape"; + store.write(evil, makeSnapshot(evil)); + // It round-trips by the same (sanitized) key... + expect(store.read(evil)?.sessionId).toBe(evil); + // ...and never created anything outside the state dir. + expect(() => rmSync(join(dir, "state"), { recursive: true })).not.toThrow(); + }); + + test("gcOlderThan removes snapshots older than the ttl but keeps fresh ones", () => { + store.write("old", makeSnapshot("old")); + store.write("fresh", makeSnapshot("fresh")); + // Backdate "old" two hours. + const oldPath = join(dir, "state", "old.json"); + const twoHoursAgo = new Date(Date.now() - 2 * 60 * 60 * 1000); + utimesSync(oldPath, twoHoursAgo, twoHoursAgo); + + store.gcOlderThan(60 * 60 * 1000); // 1 hour ttl + + expect(store.read("old")).toBeNull(); + expect(store.read("fresh")?.sessionId).toBe("fresh"); + }); + + test("gcOlderThan on a non-existent state dir does not throw", () => { + const empty = new SnapshotStore(createTestLogger(), { + dir: join(dir, "does-not-exist"), + }); + expect(() => empty.gcOlderThan()).not.toThrow(); + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.ts new file mode 100644 index 0000000..625e23d --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/snapshot-store.ts @@ -0,0 +1,161 @@ +// Per-session snapshot store: persists each Codex session's resumable processor +// state to one JSON file under the plugin's writable data directory, so a +// restarted server can rehydrate the session's in-progress trace. +// +// This is Codex-owned (not part of the generic event server): *where* a plugin +// may persist state is the coding agent's call. Codex hands plugins a writable +// PLUGIN_DATA directory; that's our base. Each agent that wants resume support +// brings its own store with whatever location rules its host imposes. +// +// Like the event recorder and the span logger, this never throws: a persistence +// failure must not be able to break a Codex turn. Every operation is wrapped and +// failures are logged and swallowed. Writes are atomic (temp file + rename) so a +// crash mid-write can't leave a half-written snapshot that fails to parse. + +import { + mkdirSync, + readdirSync, + readFileSync, + renameSync, + rmSync, + statSync, + writeFileSync, +} from "node:fs"; +import { join } from "node:path"; +import type { Logger } from "../../log.ts"; +import type { CodexSnapshot } from "./state-snapshot.ts"; + +/** Subdirectory of the data dir that holds per-session snapshots. */ +const STATE_SUBDIR = "state"; + +/** Default age after which an orphaned snapshot is garbage-collected (7 days). + * Orphans happen when a server crashes before a session's root span ends (which + * is when the snapshot is normally deleted). */ +export const DEFAULT_SNAPSHOT_TTL_MS = 7 * 24 * 60 * 60 * 1000; + +/** + * Resolve the plugin's writable data directory. Precedence matches the rest of + * the plugin (server Config + settings): explicit override, then Codex's + * PLUGIN_DATA, then a temp fallback so the binary still runs standalone. + */ +function dataDir(env: NodeJS.ProcessEnv): string { + return ( + env.BRAINTRUST_EVENT_SERVER_LOG_DIR || + env.PLUGIN_DATA || + `${env.TMPDIR || "/tmp"}/braintrust-event-server` + ); +} + +/** + * Map a session id to its snapshot filename. Session ids are UUIDs, but sanitize + * defensively so a stray id can never escape the state directory. + */ +function snapshotFileName(sessionId: string): string { + const safe = sessionId.replace(/[^A-Za-z0-9._-]/g, "_"); + return `${safe}.json`; +} + +export interface SnapshotStoreOptions { + /** Override the data dir base. Defaults to the resolved PLUGIN_DATA. */ + dir?: string; + env?: NodeJS.ProcessEnv; +} + +/** + * Reads and writes per-session {@link CodexSnapshot}s under + * `/state/.json`. All methods are best-effort and never + * throw. + */ +export class SnapshotStore { + private readonly dir: string; + private readonly logger: Logger; + + constructor(logger: Logger, options: SnapshotStoreOptions = {}) { + this.logger = logger; + const base = options.dir ?? dataDir(options.env ?? process.env); + this.dir = join(base, STATE_SUBDIR); + } + + private pathFor(sessionId: string): string { + return join(this.dir, snapshotFileName(sessionId)); + } + + /** Load a session's snapshot, or null if absent/unreadable/malformed. */ + read(sessionId: string): CodexSnapshot | null { + let raw: string; + try { + raw = readFileSync(this.pathFor(sessionId), "utf8"); + } catch { + return null; // absent is the common case + } + try { + return JSON.parse(raw) as CodexSnapshot; + } catch (err) { + this.logger.warn("snapshot store: malformed snapshot; ignoring", { + sessionId, + error: String(err), + }); + return null; + } + } + + /** Persist a session's snapshot atomically. Never throws. */ + write(sessionId: string, snapshot: CodexSnapshot): void { + const finalPath = this.pathFor(sessionId); + const tmpPath = `${finalPath}.${process.pid}.tmp`; + try { + mkdirSync(this.dir, { recursive: true }); + writeFileSync(tmpPath, JSON.stringify(snapshot)); + renameSync(tmpPath, finalPath); + } catch (err) { + this.logger.error("snapshot store: write failed", { + sessionId, + error: String(err), + }); + // Best-effort cleanup of the temp file so it doesn't linger. + try { + rmSync(tmpPath, { force: true }); + } catch { + // ignore + } + } + } + + /** Delete a session's snapshot (e.g. once its root span has ended). */ + delete(sessionId: string): void { + try { + rmSync(this.pathFor(sessionId), { force: true }); + } catch (err) { + this.logger.error("snapshot store: delete failed", { + sessionId, + error: String(err), + }); + } + } + + /** + * Remove snapshots older than `ttlMs` (by file mtime). Called at server + * startup to clean up orphans left by a previous crash. Never throws. + */ + gcOlderThan(ttlMs: number = DEFAULT_SNAPSHOT_TTL_MS): void { + let entries: string[]; + try { + entries = readdirSync(this.dir); + } catch { + return; // no state dir yet: nothing to GC + } + const cutoff = Date.now() - ttlMs; + for (const name of entries) { + if (!name.endsWith(".json")) continue; + const full = join(this.dir, name); + try { + if (statSync(full).mtimeMs < cutoff) { + rmSync(full, { force: true }); + this.logger.debug("snapshot store: gc removed stale snapshot", { file: name }); + } + } catch (err) { + this.logger.debug("snapshot store: gc skip", { file: name, error: String(err) }); + } + } + } +} diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts new file mode 100644 index 0000000..63efdc9 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, test } from "bun:test"; +import { + type CodexSnapshot, + isCompatibleSnapshot, + redactReportingConfig, + SNAPSHOT_SCHEMA_VERSION, +} from "./state-snapshot.ts"; + +function snapshot(overrides: Partial = {}): CodexSnapshot { + return { + pluginVersion: "1.2.3", + schemaVersion: SNAPSHOT_SCHEMA_VERSION, + sessionId: "s", + savedAt: 0, + reportingConfig: undefined, + rootSpan: null, + rootEnded: false, + rootEnrichment: {}, + mainScopePath: null, + scopes: [], + spawnTurnSpansByCallId: [], + spawnTurnSpansByAgentId: [], + compactionTriggerByTurn: [], + compactionSpansByTurn: [], + ...overrides, + }; +} + +describe("isCompatibleSnapshot", () => { + test("accepts a matching plugin + schema version", () => { + expect(isCompatibleSnapshot(snapshot(), "1.2.3")).toBe(true); + }); + + test("rejects a mismatched plugin version", () => { + expect(isCompatibleSnapshot(snapshot({ pluginVersion: "9.9.9" }), "1.2.3")).toBe(false); + }); + + test("rejects a mismatched schema version", () => { + expect( + isCompatibleSnapshot(snapshot({ schemaVersion: SNAPSHOT_SCHEMA_VERSION + 1 }), "1.2.3"), + ).toBe(false); + }); +}); + +describe("redactReportingConfig", () => { + test("strips the apiKey but keeps everything else", () => { + const redacted = redactReportingConfig({ + project: "p", + apiKey: "sk-secret", + apiUrl: "https://api", + traceToBraintrust: true, + }); + expect(redacted).toEqual({ + project: "p", + apiUrl: "https://api", + traceToBraintrust: true, + }); + expect("apiKey" in (redacted ?? {})).toBe(false); + }); + + test("returns undefined for an undefined config", () => { + expect(redactReportingConfig(undefined)).toBeUndefined(); + }); +}); diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.ts new file mode 100644 index 0000000..979fd41 --- /dev/null +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.ts @@ -0,0 +1,144 @@ +// Serializable snapshot of a CodexEventProcessor's resumable state. +// +// The processor builds a session's trace incrementally, holding all its +// bookkeeping in memory: where it is in each transcript file (byte offsets), +// which spans are still open (and their identities), the reconstructed +// conversation history used to shape LLM-call inputs, and a few side-maps that +// reconcile hook-only data with the transcript. The rollout transcript on disk +// is the durable source of truth for span *content*; this snapshot captures only +// the bookkeeping needed to pick up exactly where a previous run left off. +// +// The server is short-lived: it shuts down after an idle window (default 5 min) +// or when the user closes the session, and a later hook (a "resume", or just the +// next turn) spins up a fresh processor for the same session id. Without a +// snapshot that processor would re-read the transcript from zero with brand-new +// span ids, producing duplicate/orphaned spans. With one, it rehydrates the +// original spans (via {@link SpanFactory.rehydrateSpan}) and continues, so every +// further log()/end() merges into the same trace. +// +// This module defines the snapshot *shape* and pure mapping helpers only; the +// processor owns turning its live state (which holds Span handles) into a +// snapshot and back, since that needs the span factory. Span handles are stored +// here as {@link SpanRef} identities (span_id/root_span_id/parents), captured +// from the synchronous getters on a live span. + +import type { ReportingConfig, SpanRef } from "../../braintrust/logger.ts"; + +/** + * Bumped whenever the snapshot shape changes incompatibly. A snapshot whose + * schemaVersion doesn't match is discarded on load (the session simply starts + * fresh, losing only resume continuity), so we never try to rehydrate state we + * can't interpret. + */ +export const SNAPSHOT_SCHEMA_VERSION = 1; + +/** A conversation item (chat message or reasoning), stored verbatim. These are + * plain JSON already, so they round-trip without transformation. */ +export type ConversationItemSnapshot = Record; + +/** An open LLM span and the output items accumulated for it so far. */ +export interface OpenLlmSnapshot { + span: SpanRef; + turnId: string | undefined; + output: ConversationItemSnapshot[]; + outputPreset?: boolean; +} + +/** An open tool span and the turn it belongs to (keyed by call_id in the map). */ +export interface OpenToolSnapshot { + callId: string; + span: SpanRef; + turnId: string; +} + +/** An open turn span (keyed by turn_id in the map). */ +export interface OpenTurnSnapshot { + turnId: string; + span: SpanRef; +} + +/** Pending-subagent info captured before its root span exists. The parent turn + * span is referenced by identity so it can be rehydrated independently. */ +export interface PendingSubagentSnapshot { + agentId: string; + agentType: string | undefined; + parent: SpanRef; +} + +/** One transcript file's parsing state. Mirrors TranscriptScope, with Span + * handles replaced by SpanRef identities. */ +export interface ScopeSnapshot { + path: string; + kind: "main" | "subagent"; + offset: number; + openTurns: OpenTurnSnapshot[]; + turnsAwaitingCompletion: string[]; + conversationHistory: ConversationItemSnapshot[]; + openLlm: OpenLlmSnapshot | null; + openTools: OpenToolSnapshot[]; + model: string | undefined; + lastTurnEndTime: number | undefined; + subagentRootSpan: SpanRef | null; + subagentEnded: boolean; + pendingSubagent?: PendingSubagentSnapshot; +} + +/** A side-map entry pairing a key (turn_id or call_id or agent_id) with a span. */ +export interface SpanMapEntry { + key: string; + span: SpanRef; +} + +/** A side-map entry pairing a key with a string value. */ +export interface StringMapEntry { + key: string; + value: string; +} + +/** The full resumable state of a CodexEventProcessor. */ +export interface CodexSnapshot { + /** Plugin version that wrote this snapshot; discarded on mismatch. */ + pluginVersion: string; + /** Snapshot shape version; discarded on mismatch. */ + schemaVersion: number; + /** Session id (queueId) this snapshot belongs to. */ + sessionId: string | null; + /** Wall-clock ms when written, for age-based GC of orphans. */ + savedAt: number; + + /** + * Reporting config, minus secrets. `apiKey` is intentionally never persisted + * (re-resolved from the env / config event on resume), mirroring how the event + * recorder redacts it. + */ + reportingConfig: Omit | undefined; + + rootSpan: SpanRef | null; + rootEnded: boolean; + rootEnrichment: { source?: string; permissionMode?: string }; + + /** Path of the main scope within `scopes`, if it exists. */ + mainScopePath: string | null; + scopes: ScopeSnapshot[]; + + spawnTurnSpansByCallId: SpanMapEntry[]; + spawnTurnSpansByAgentId: SpanMapEntry[]; + compactionTriggerByTurn: StringMapEntry[]; + compactionSpansByTurn: SpanMapEntry[]; +} + +/** A parsed snapshot is usable only if it matches our plugin + schema version. */ +export function isCompatibleSnapshot(snapshot: CodexSnapshot, pluginVersion: string): boolean { + return ( + snapshot.schemaVersion === SNAPSHOT_SCHEMA_VERSION && snapshot.pluginVersion === pluginVersion + ); +} + +/** Strip the apiKey from a reporting config before persisting it. */ +export function redactReportingConfig( + config: ReportingConfig | undefined, +): Omit | undefined { + if (config === undefined) return undefined; + const { apiKey: _apiKey, ...rest } = config; + return rest; +} diff --git a/plugins/trace-codex/src/braintrust/logger.ts b/plugins/trace-codex/src/braintrust/logger.ts index a31710f..a9b8598 100644 --- a/plugins/trace-codex/src/braintrust/logger.ts +++ b/plugins/trace-codex/src/braintrust/logger.ts @@ -13,9 +13,52 @@ import type { Logger } from "../log.ts"; export type { Span, StartSpanArgs }; +/** + * The minimal, serializable identity of a span: enough to recreate a handle + * bound to the same row after a restart, so further log()/end() calls merge into + * the original span server-side (rows are keyed by span_id). Captured from a live + * span's `spanId`/`rootSpanId`/`spanParents` getters (all synchronous). + */ +export interface SpanRef { + spanId: string; + rootSpanId: string; + /** Parent span ids (empty for a root span). */ + spanParents: string[]; + /** + * The span's name/type at capture time. Re-asserted on rehydration so the + * merged row keeps the original attributes — without them the SDK would infer + * a (wrong) name from the call stack when the rehydrated handle is created. + */ + name?: string; + type?: StartSpanArgs["type"]; +} + +/** + * Capture a live span's identity for later rehydration. `name`/`type` are the + * attributes the span was created with (the SDK doesn't expose them as getters), + * passed by the caller so they can be re-asserted on resume. + */ +export function spanRef(span: Span, name?: string, type?: StartSpanArgs["type"]): SpanRef { + return { + spanId: span.spanId, + rootSpanId: span.rootSpanId, + spanParents: span.spanParents, + ...(name !== undefined ? { name } : {}), + ...(type !== undefined ? { type } : {}), + }; +} + /** Creates root spans and can flush buffered events. */ export interface SpanFactory { startSpan(args: StartSpanArgs): Span; + /** + * Recreate a handle bound to an existing span's identity (see {@link SpanRef}). + * The returned span carries the original span_id/root_span_id/parents, so + * subsequent log()/end() calls merge into that row rather than creating a new + * one. Used to resume a session's trace after the server restarted. The + * returned handle is "naked": it logs nothing until the caller does. + */ + rehydrateSpan(ref: SpanRef): Span; flush(): Promise; } @@ -79,6 +122,13 @@ export function createSpanFactory(config?: ReportingConfig, diagLogger?: Logger) }); return { startSpan: (args) => logger.startSpan(args), + rehydrateSpan: (ref) => + logger.startSpan({ + spanId: ref.spanId, + parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId }, + ...(ref.name !== undefined ? { name: ref.name } : {}), + ...(ref.type !== undefined ? { type: ref.type } : {}), + }), flush: async () => { try { await logger.flush(); diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts index caea91b..82fea54 100644 --- a/plugins/trace-codex/src/test-helpers.ts +++ b/plugins/trace-codex/src/test-helpers.ts @@ -1,7 +1,7 @@ // Shared test helpers. import { _exportsForTestingOnly, initLogger } from "braintrust"; -import type { Span, SpanFactory, StartSpanArgs } from "./braintrust/logger.ts"; +import type { Span, SpanFactory, SpanRef, StartSpanArgs } from "./braintrust/logger.ts"; import type { Logger } from "./log.ts"; /** A no-op logger for tests; never touches the filesystem. */ @@ -53,6 +53,25 @@ export function createFakeSpanFactory(): FakeSpanFactory { }, } as unknown as Span; }, + rehydrateSpan(ref: SpanRef): Span { + const fake: FakeSpan = { + id: ref.spanId, + startArgs: {}, + flushCount: 0, + endCount: 0, + }; + spans.push(fake); + return { + id: fake.id, + flush: async () => { + fake.flushCount += 1; + }, + end: () => { + fake.endCount += 1; + return 0; + }, + } as unknown as Span; + }, flush: async () => { factory.factoryFlushCount += 1; }, @@ -110,6 +129,13 @@ export function withCapturedTrace(): CapturedTrace { return { spanFactory: { startSpan: (args) => logger.startSpan(args), + rehydrateSpan: (ref) => + logger.startSpan({ + spanId: ref.spanId, + parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId }, + ...(ref.name !== undefined ? { name: ref.name } : {}), + ...(ref.type !== undefined ? { type: ref.type } : {}), + }), flush: () => logger.flush(), }, drain: async () => { @@ -137,8 +163,46 @@ export interface SpanTree { children: SpanTree[]; } +/** + * Merge captured span rows that share a span_id into one logical span, the way + * Braintrust merges rows server-side. Multiple rows for one span_id occur when a + * span is logged across several calls (e.g. start, then output, then end) and + * especially when a span is rehydrated after a server restart: the resumed + * handle re-emits rows under the original id. Later non-empty fields win; + * metadata and span_attributes are shallow-merged so partial updates accumulate. + */ +export function mergeCapturedSpans(spans: CapturedSpan[]): CapturedSpan[] { + const byId = new Map(); + const order: string[] = []; + for (const span of spans) { + const existing = byId.get(span.span_id); + if (existing === undefined) { + byId.set(span.span_id, { ...span }); + order.push(span.span_id); + continue; + } + // Merge: later rows override, but don't clobber an existing value with + // undefined (a partial row that omits a field shouldn't erase it). + const merged: CapturedSpan = { ...existing }; + for (const [key, value] of Object.entries(span) as [keyof CapturedSpan, unknown][]) { + if (value === undefined) continue; + if (key === "metadata" || key === "span_attributes" || key === "metrics") { + merged[key] = { + ...(existing[key] as Record | undefined), + ...(value as Record), + } as never; + } else { + merged[key] = value as never; + } + } + byId.set(span.span_id, merged); + } + return order.map((id) => byId.get(id) as CapturedSpan); +} + /** Build a single-rooted tree from flat captured spans (via span_parents). */ -export function spansToTree(spans: CapturedSpan[]): SpanTree | null { +export function spansToTree(rawSpans: CapturedSpan[]): SpanTree | null { + const spans = mergeCapturedSpans(rawSpans); if (spans.length === 0) return null; const root = spans.find( From 4157ec1498da13328e736f2390786ebbc212222e Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Mon, 22 Jun 2026 18:52:21 -0600 Subject: [PATCH 13/17] rm non-blocking flush option worker auto-flushes on an empty queue already so this isn't needed. the only time we want to call flush should also require blocking (when we stop event and we are configured to block). --- plugins/trace-codex/AGENTS.md | 16 +++--- plugins/trace-codex/scripts/smoke-test.sh | 12 +++-- plugins/trace-codex/src/client/client.ts | 31 ++++++------ .../trace-codex/src/server/enqueue-client.ts | 24 ++++----- plugins/trace-codex/src/server/routes.test.ts | 48 ++---------------- plugins/trace-codex/src/server/routes.ts | 39 ++++----------- plugins/trace-codex/src/server/server.test.ts | 49 +++++++++++++++++++ plugins/trace-codex/src/server/server.ts | 1 - 8 files changed, 105 insertions(+), 115 deletions(-) diff --git a/plugins/trace-codex/AGENTS.md b/plugins/trace-codex/AGENTS.md index 332223d..2b698c1 100644 --- a/plugins/trace-codex/AGENTS.md +++ b/plugins/trace-codex/AGENTS.md @@ -98,15 +98,15 @@ critical path. The terminal event is the one place where blocking is even an option, and it's **opt-in**: -- On a **terminal event** (`Stop`), the client always calls `/flush`, but by - default it fires the flush and returns immediately so the turn isn't stalled. - Setting `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` makes the client instead wait for the - server to confirm buffered spans reached Braintrust. The blocking mode exists - for short-lived hosts (e.g. a CI job that ends right after the last turn) that - would otherwise tear the process tree down before the final spans are +- On a **terminal event** (`Stop`), the client calls `/flush` **only** when + `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` is set, blocking until the server confirms the + queue has drained and buffered spans reached Braintrust. The blocking mode + exists for short-lived hosts (e.g. a CI job that ends right after the last turn) + that would otherwise tear the process tree down before the final spans are delivered. When enabled, the wait is bounded by a timeout (`/flush` gives up - rather than hanging the turn). In normal interactive use the server stays alive - and flushes off the critical path, so no spans are lost without blocking. + rather than hanging the turn). By default the client does nothing on `Stop`: the + long-lived server flushes on its own when the queue goes idle, so in normal + interactive use no spans are lost and the turn isn't stalled by a flush request. When adding behavior, prefer enqueue-and-return. Reach for a blocking request/await only when data would otherwise be lost, keep it off the diff --git a/plugins/trace-codex/scripts/smoke-test.sh b/plugins/trace-codex/scripts/smoke-test.sh index 8b6ed54..f148621 100755 --- a/plugins/trace-codex/scripts/smoke-test.sh +++ b/plugins/trace-codex/scripts/smoke-test.sh @@ -120,6 +120,11 @@ export BRAINTRUST_PROJECT=trace-codex-smoke # Keep the background event server short-lived so it doesn't linger after CI. export BRAINTRUST_EVENT_SERVER_IDLE_TIMEOUT_MS=15000 export BRAINTRUST_EVENT_SERVER_IDLE_CHECK_INTERVAL_MS=1000 +# Block on the terminal-event flush so the final spans are delivered before +# `codex exec` returns. Without this the hook just enqueues and relies on the +# server's idle-drain flush, which may not fire before this short-lived run tears +# the process tree down — making the assertion below flaky. +export BRAINTRUST_PLUGIN_BLOCK_ON_STOP=true # Pin the release the launcher downloads, when requested. The launcher strips an # optional leading 'v', so either form is fine here. @@ -148,9 +153,10 @@ if [ "$codex_status" -ne 0 ]; then scrub <"$CODEX_LOG" >&2 || true fi -# The hook client blocks on a synchronous flush at session end, so by the time -# codex exec returns the trace should already be delivered. Stop the collector -# (via cleanup) to make it write its summary, then assert. +# With BRAINTRUST_PLUGIN_BLOCK_ON_STOP=true (set above), the hook client blocks +# on a synchronous flush at session end, so by the time codex exec returns the +# trace should already be delivered. Stop the collector (via cleanup) to make it +# write its summary, then assert. cleanup COLLECTOR_PID="" # already stopped; avoid double-kill in the EXIT trap diff --git a/plugins/trace-codex/src/client/client.ts b/plugins/trace-codex/src/client/client.ts index e918823..ab90ff7 100644 --- a/plugins/trace-codex/src/client/client.ts +++ b/plugins/trace-codex/src/client/client.ts @@ -39,11 +39,12 @@ export interface HookClientOptions { */ terminalEvents?: readonly string[]; /** - * Whether the terminal-event flush blocks until the server confirms all - * events are processed and spans are delivered (block=true), versus a - * fire-and-forget signal that returns immediately (block=false). Defaults to - * false; set true (via BRAINTRUST_PLUGIN_BLOCK_ON_STOP) to guarantee delivery - * before Codex exits, e.g. in programmatic/CI runs. + * Whether to block on a terminal event until the server confirms all events + * are processed and spans are delivered. Defaults to false. When false the + * client does nothing on a terminal event — the long-lived server flushes on + * its own when the queue goes idle. Set true (via BRAINTRUST_PLUGIN_BLOCK_ON_STOP) + * to guarantee delivery before Codex exits, e.g. in programmatic/CI runs where + * the process tree is torn down before the idle flush can fire. */ blockOnStop?: boolean; } @@ -86,15 +87,15 @@ export async function runHookClient( } } - // On a terminal event, flush. When blockOnStop is set, block until the server - // confirms the queue has drained and buffered spans are delivered — important - // for a CI job (or any short-lived host) that ends right after the agent's - // last turn, before the background server's idle timeout fires, which would - // otherwise lose the final spans. Otherwise fire-and-forget so the turn isn't - // stalled; the background server delivers spans on its own. - if (sawTerminal) { - const block = options.blockOnStop ?? false; - const flushed = await postFlush(config, logger, { block }); - logger.debug("flush requested on terminal event", { flushed, block }); + // On a terminal event, only flush when blockOnStop is set: block until the + // server confirms the queue has drained and buffered spans are delivered — + // important for a CI job (or any short-lived host) that ends right after the + // agent's last turn, before the background server's idle timeout fires, which + // would otherwise lose the final spans. Otherwise do nothing: the long-lived + // server flushes on its own when the queue goes idle, so the turn isn't + // stalled by a flush request. + if (sawTerminal && (options.blockOnStop ?? false)) { + const flushed = await postFlush(config, logger); + logger.debug("flush requested on terminal event", { flushed }); } } diff --git a/plugins/trace-codex/src/server/enqueue-client.ts b/plugins/trace-codex/src/server/enqueue-client.ts index f857ebd..e00f680 100644 --- a/plugins/trace-codex/src/server/enqueue-client.ts +++ b/plugins/trace-codex/src/server/enqueue-client.ts @@ -31,31 +31,25 @@ export async function postEnqueue( } /** - * Ask the server to flush. Returns true on a 2xx response. Never throws. + * Ask the server to flush, blocking until it confirms. Returns true on a 2xx + * response. Never throws. * - * When `block` is true (the default), the server waits until everything - * enqueued so far has been processed and buffered spans have reached the backend - * before responding. The hook uses this after a terminal event so the final + * The server waits until everything enqueued so far has been processed and + * buffered spans have reached the backend before responding. The hook uses this + * after a terminal event (only when block-on-stop is configured) so the final * spans are delivered before the process tree is torn down (e.g. a CI job ending * right after the agent's last turn, before the idle timeout fires). - * - * When `block` is false, the server just signals every processor to flush its - * buffered state and responds immediately, without waiting for the queue to - * drain — for periodic/best-effort flushes that must not stall the caller. */ export async function postFlush( config: Pick, logger: Logger, - options: { block?: boolean } = {}, ): Promise { - const block = options.block ?? true; try { - const res = await fetch(`${serverBaseUrl(config)}/flush?block=${block}`, { + const res = await fetch(`${serverBaseUrl(config)}/flush`, { method: "POST", - // A blocking flush waits on the server (bounded by FLUSH_TIMEOUT_MS); allow - // a little more here so we receive its response rather than aborting. A - // non-blocking flush returns promptly, so a short timeout suffices. - signal: AbortSignal.timeout(block ? 12_000 : 2_000), + // The server bounds its wait by FLUSH_TIMEOUT_MS; allow a little more here + // so we receive its response rather than aborting first. + signal: AbortSignal.timeout(12_000), }); if (!res.ok) { logger.warn("flush rejected", { status: res.status }); diff --git a/plugins/trace-codex/src/server/routes.test.ts b/plugins/trace-codex/src/server/routes.test.ts index 5f1d1b7..c38cc1d 100644 --- a/plugins/trace-codex/src/server/routes.test.ts +++ b/plugins/trace-codex/src/server/routes.test.ts @@ -14,7 +14,6 @@ function makeDeps(overrides: Partial = {}): RouteDeps { state: new ServerState(TEST_VERSION), logger, queue: new EventQueue({ logger }), - flushProcessors: async () => {}, onShutdownRequested: () => {}, ...overrides, }; @@ -120,7 +119,7 @@ describe("POST /flush", () => { eventData: { hook_event_name: "Stop" }, }; - test("block=true waits for the queue to drain, then returns ok", async () => { + test("waits for the queue to drain, then returns ok", async () => { const processed: string[] = []; const logger = createTestLogger(); const queue = new EventQueue({ @@ -133,54 +132,17 @@ describe("POST /flush", () => { queue.start(); const deps = makeDeps({ logger, queue }); - // Enqueue a Stop, then flush(block=true). The flush response must come only - // after the event has been processed. + // Enqueue a Stop, then flush. The flush response must come only after the + // event has been processed. await handleRequest(post("/enqueue", validEvent), deps); - const res = await handleRequest(post("/flush?block=true"), deps); + const res = await handleRequest(post("/flush"), deps); expect(res.status).toBe(200); - expect(await res.json()).toEqual({ ok: true, blocked: true }); + expect(await res.json()).toEqual({ ok: true }); expect(processed).toEqual(["Stop"]); await queue.stop(); }); - - test("block=false signals processors and returns immediately (no queue drain)", async () => { - let flushSignals = 0; - // The consumer is never started, so the enqueued event stays unprocessed and - // the queue never drains. A non-blocking flush must still return promptly, - // having signalled the processors. - const logger = createTestLogger(); - const queue = new EventQueue({ logger }); - const deps = makeDeps({ - logger, - queue, - flushProcessors: async () => { - flushSignals += 1; - }, - }); - - await handleRequest(post("/enqueue", validEvent), deps); - expect(queue.size).toBe(1); // still queued, not drained - const res = await handleRequest(post("/flush?block=false"), deps); - - expect(res.status).toBe(200); - expect(await res.json()).toEqual({ ok: true, blocked: false }); - expect(flushSignals).toBe(1); - expect(queue.size).toBe(1); // flush did not wait for / cause a drain - }); - - test("defaults to non-blocking when no block param is given", async () => { - let flushSignals = 0; - const deps = makeDeps({ - flushProcessors: async () => { - flushSignals += 1; - }, - }); - const res = await handleRequest(post("/flush"), deps); - expect(await res.json()).toEqual({ ok: true, blocked: false }); - expect(flushSignals).toBe(1); - }); }); describe("unknown routes", () => { diff --git a/plugins/trace-codex/src/server/routes.ts b/plugins/trace-codex/src/server/routes.ts index 81655e3..74ce636 100644 --- a/plugins/trace-codex/src/server/routes.ts +++ b/plugins/trace-codex/src/server/routes.ts @@ -28,11 +28,6 @@ export interface RouteDeps { logger: Logger; /** Queue that /enqueue pushes events onto. */ queue: EventQueue; - /** - * Signal every active processor to flush its buffered state now, without - * waiting for the queue to drain. Used by the non-blocking /flush path. - */ - flushProcessors: () => Promise; /** Invoked when /shutdown is hit, after the response is constructed. */ onShutdownRequested: () => void; } @@ -50,7 +45,7 @@ const SERVICE_UNAVAILABLE = () => json({ error: "shutting_down" }, 503); const FLUSH_TIMEOUT_MS = 10_000; export async function handleRequest(req: Request, deps: RouteDeps): Promise { - const { state, logger, queue, flushProcessors, onShutdownRequested } = deps; + const { state, logger, queue, onShutdownRequested } = deps; const url = new URL(req.url); const path = url.pathname; @@ -87,24 +82,15 @@ export async function handleRequest(req: Request, deps: RouteDeps): Promise] (block defaults to false) + // POST /flush // - // block=true: wait until everything enqueued so far has been processed AND - // buffered spans have been flushed to the backend, then respond. Used on - // terminal events / shutdown in short-lived environments (e.g. CI) where the - // background server won't survive to flush on idle, so the final spans must - // be delivered before the process tree is torn down. - // - // block=false: don't wait for the queue to drain. Just send the flush signal - // through to every active processor (each delivers its buffered spans) and - // respond immediately. Use this for periodic/best-effort flushes that must - // not stall the caller. + // Wait until everything enqueued so far has been processed AND buffered spans + // have been flushed to the backend, then respond. Used on terminal events / + // shutdown in short-lived environments (e.g. CI) where the background server + // won't survive to flush on idle, so the final spans must be delivered before + // the process tree is torn down. In normal interactive use the client doesn't + // call this — the server flushes on its own when the queue goes idle. if (path === "/flush" && req.method === "POST") { - const block = parseBoolParam(url.searchParams.get("block")); - if (!block) { - await flushProcessors(); - return json({ ok: true, blocked: false }); - } const flushed = await Promise.race([ queue.drained().then(() => true), // Bound the wait so a hung backend flush can't hold the request lock (and @@ -115,7 +101,7 @@ export async function handleRequest(req: Request, deps: RouteDeps): Promise; diff --git a/plugins/trace-codex/src/server/server.test.ts b/plugins/trace-codex/src/server/server.test.ts index e26689c..f85f690 100644 --- a/plugins/trace-codex/src/server/server.test.ts +++ b/plugins/trace-codex/src/server/server.test.ts @@ -127,4 +127,53 @@ describe("startServer", () => { await server.stop(); } }); + + test("/flush waits for the queue to drain and processors to flush", async () => { + // /flush must return only after every enqueued event has been processed AND + // processors have been flushed (the idle flush runs before drained() + // resolves). Records ordering to prove flush happened before the response. + const events: string[] = []; + const factory: EventProcessorFactory = () => ({ + process: async () => { + await sleep(20); + events.push("process"); + }, + flush: () => { + events.push("flush"); + }, + }); + + const server = startServer( + testConfig({ idleTimeoutMs: 60_000, idleCheckIntervalMs: 60_000 }), + new Map([["test", factory]]), + createTestLogger(), + ); + + try { + const enqueue = await fetch(`http://127.0.0.1:${server.port}/enqueue`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + queueId: "s1", + eventSource: "test", + eventSourceVersion: null, + eventName: "Stop", + eventData: {}, + }), + }); + expect(enqueue.status).toBe(200); + + const flushRes = await fetch(`http://127.0.0.1:${server.port}/flush`, { + method: "POST", + }); + expect(flushRes.status).toBe(200); + expect(await flushRes.json()).toEqual({ ok: true }); + + // By the time /flush returns, the event was processed and the idle flush + // ran. + expect(events).toEqual(["process", "flush"]); + } finally { + await server.stop(); + } + }); }); diff --git a/plugins/trace-codex/src/server/server.ts b/plugins/trace-codex/src/server/server.ts index 0601f5f..06a79b0 100644 --- a/plugins/trace-codex/src/server/server.ts +++ b/plugins/trace-codex/src/server/server.ts @@ -99,7 +99,6 @@ export function startServer( state, logger: log, queue, - flushProcessors: () => registry.flushAll(), onShutdownRequested: () => { void stop(); }, From 58c97d013a1d3a54aaa7833e147ac06252909cd0 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Tue, 23 Jun 2026 14:11:41 -0600 Subject: [PATCH 14/17] correct llm span durations --- .../src/agents/codex/event-processor.test.ts | 432 +++++++++++++++++- .../src/agents/codex/event-processor.ts | 206 +++++++-- .../src/agents/codex/resume.test.ts | 88 +++- .../src/agents/codex/state-snapshot.ts | 11 +- 4 files changed, 689 insertions(+), 48 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 88f8197..7f5da2f 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -207,7 +207,10 @@ describe("CodexEventProcessor: turn spans", () => { span_attributes: { name: "turn: t2", type: "task" }, output: "second", ended: true, - children: [{ span_attributes: { name: "exec_command", type: "tool" }, ended: true }], + children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, + { span_attributes: { name: "exec_command", type: "tool" }, ended: true }, + ], }, ], }, @@ -265,7 +268,7 @@ describe("CodexEventProcessor: turn spans", () => { describe("CodexEventProcessor: span timing (from transcript timestamps)", () => { // :10Z=1767225610, :11=...611, :12=...612, :13=...613, :14=...614, :15=...615. - test("llm and tool spans use their own transcript record timestamps", async () => { + test("an llm span starts at its turn's start (first child), not its output record", async () => { await assertProducesTrace( [ sessionStart(), @@ -279,8 +282,11 @@ describe("CodexEventProcessor: span timing (from transcript timestamps)", () => timestamp: "2026-01-01T00:00:11Z", payload: { type: "task_started", turn_id: "t1" }, }), - // LLM call: opens at the assistant message (:12), requests a tool (:13), - // result arrives (:14), and the call closes at token_count (:15). + // LLM call: the assistant message lands at (:12) — but the model's work + // for this call began at the turn's start (:11), since this is the first + // child. It requests a tool (:13) — that's its last output, so the llm + // span ends there. The tool result arrives (:14) and the token_count + // (:15) supplies metrics only (it does NOT define the llm span's end). transcript({ type: "response_item", timestamp: "2026-01-01T00:00:12Z", @@ -328,14 +334,18 @@ describe("CodexEventProcessor: span timing (from transcript timestamps)", () => ended: true, children: [ { - // LLM call: opened at the message (:12), closed at token_count (:15). + // LLM call: starts at the turn's start (:11, first child), not + // the message record (:12); ENDS when it emits the tool call (:13), + // its last output — NOT at token_count (:15). This is the fix for + // the llm span swallowing the tool's execution. span_attributes: { name: "llm", type: "llm" }, - metrics: { start: 1767225612, end: 1767225615 }, + metrics: { start: 1767225611, end: 1767225613 }, ended: true, }, { // Tool span uses its own record timestamps: function_call (:13) - // to function_call_output (:14). Ordered after the llm by start. + // to function_call_output (:14). Contiguous with the llm (no + // overlap): llm ends at :13, tool starts at :13. span_attributes: { name: "exec_command", type: "tool" }, metrics: { start: 1767225613, end: 1767225614 }, ended: true, @@ -346,6 +356,249 @@ describe("CodexEventProcessor: span timing (from transcript timestamps)", () => }, ); }); + + // When an llm call is the only child of its turn, its span should span almost + // the whole turn (start at turn start, end at token_count just before + // task_complete) — not appear near-instant. + test("a lone llm span fills its turn", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // The model "thinks" for several seconds; its output only lands at :15. + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:15Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "answer" }], + }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:15Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:16Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "answer" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + metrics: { start: 1767225611, end: 1767225616 }, + children: [ + { + // Starts at turn start (:11), not the output record (:15). + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225615 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // Regression for the "front-of-turn gap" bug: a model call whose ONLY output is + // a tool call (no assistant message or readable reasoning before it) must still + // open an llm span, so the model's work (often many seconds of thinking before + // it emits the tool call) is covered rather than leaving an unaccounted gap + // between the turn start and the first child. + test("a tool-only model call still opens an llm span filling the front of the turn", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // The model "thinks" for ~20s, then its first (and only) output for this + // call is a tool call at :30 — no preceding message or reasoning. + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:30Z", + payload: { + type: "function_call", + name: "exec_command", + call_id: "c1", + metadata: { turn_id: "t1" }, + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:32Z", + payload: { type: "function_call_output", call_id: "c1", output: "ok" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:32Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:33Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + metrics: { start: 1767225611, end: 1767225633 }, + children: [ + { + // The llm span covers the model's work from the turn start (:11) + // until it emits the tool call (:30, its only output) — filling + // the ~20s "thinking" gap. It does NOT extend to the token_count + // (:32); that would overlap the tool, which is impossible. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225630 }, + ended: true, + }, + { + // The tool runs :30 -> :32, contiguous with (not inside) the llm. + span_attributes: { name: "exec_command", type: "tool" }, + metrics: { start: 1767225630, end: 1767225632 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + // Two model calls separated by a tool: the second call must start where the + // tool ended (the model's most recent sibling), so the spans tile the turn + // contiguously instead of each looking instant. + test("a later llm span starts where the prior sibling (a tool) ended", async () => { + await assertProducesTrace( + [ + sessionStart(), + transcript({ + type: "session_meta", + timestamp: "2026-01-01T00:00:10Z", + payload: { cwd: "/work", id: "session-1" }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:11Z", + payload: { type: "task_started", turn_id: "t1" }, + }), + // Call 1: opens at turn start (:11) on its first output item (an + // assistant message at :11), then emits a tool call (:12, its last + // output → llm-1 ends there). The token_count (:13) is metrics-only. + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:11Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "let me check" }], + }, + }), + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:12Z", + payload: { + type: "function_call", + name: "exec_command", + call_id: "c1", + metadata: { turn_id: "t1" }, + }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:13Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + // Tool runs :12 -> :14 (output lands after call 1's token_count). + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:14Z", + payload: { type: "function_call_output", call_id: "c1", output: "ok" }, + }), + // Call 2: the model resumes after the tool result. Its output lands at + // :18, but it began working when the tool returned (:14). + transcript({ + type: "response_item", + timestamp: "2026-01-01T00:00:18Z", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "done" }], + }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:18Z", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 5 } } }, + }), + transcript({ + type: "event_msg", + timestamp: "2026-01-01T00:00:19Z", + payload: { type: "task_complete", turn_id: "t1", last_agent_message: "done" }, + }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + children: [ + { + // Call 1: turn start (:11) -> its last output, the tool call + // (:12). Contiguous with the tool; does NOT extend to :13/:14. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225611, end: 1767225612 }, + ended: true, + }, + { + // Tool: :12 -> :14. + span_attributes: { name: "exec_command", type: "tool" }, + metrics: { start: 1767225612, end: 1767225614 }, + ended: true, + }, + { + // Call 2: starts at the tool's end (:14), the most recent sibling + // end — NOT its own output record (:18). Ends at its last output, + // the message (:18). The token_count (:18) is metrics-only. + span_attributes: { name: "llm", type: "llm" }, + metrics: { start: 1767225614, end: 1767225618 }, + ended: true, + }, + ], + }, + ], + }, + ); + }); }); describe("CodexEventProcessor: tool spans", () => { @@ -375,6 +628,9 @@ describe("CodexEventProcessor: tool spans", () => { output: "done", ended: true, children: [ + // The model call that emitted the tool call (its only output) opens + // an llm span, so the model's work isn't an unaccounted gap. + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { span_attributes: { name: "exec_command", type: "tool" }, input: '{"cmd":"ls"}', @@ -414,6 +670,7 @@ describe("CodexEventProcessor: tool spans", () => { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { span_attributes: { name: "apply_patch", type: "tool" }, input: "*** Begin Patch", @@ -448,6 +705,9 @@ describe("CodexEventProcessor: tool spans", () => { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [ + // Both tool calls come from one model call, so a single llm span + // opens (on the first tool call) and closes at turn end. + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { span_attributes: { name: "exec_command", type: "tool" }, output: "a", ended: true }, { span_attributes: { name: "apply_patch", type: "tool" }, output: "b", ended: true }, ], @@ -476,6 +736,7 @@ describe("CodexEventProcessor: tool spans", () => { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { span_attributes: { name: "exec_command", type: "tool" }, input: "{}", @@ -542,6 +803,7 @@ describe("CodexEventProcessor: permissions", () => { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { span_attributes: { name: "exec_command", type: "tool" }, metadata: { @@ -599,6 +861,8 @@ describe("CodexEventProcessor: permissions", () => { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [ + // Both attempts come from one model call, so a single llm span opens. + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { // The failed sandboxed attempt: no permission annotation. span_attributes: { name: "exec_command", type: "tool" }, @@ -651,6 +915,7 @@ describe("CodexEventProcessor: permissions", () => { span_attributes: { name: "turn: t1", type: "task" }, ended: true, children: [ + { span_attributes: { name: "llm", type: "llm" }, ended: true }, { span_attributes: { name: "exec_command", type: "tool" }, metadata: { permission: undefined }, @@ -1652,4 +1917,157 @@ describe("CodexEventProcessor: subagents", () => { trace.cleanup(); } }); + + // LLM-span start timing is derived per-turn, per-scope, so a subagent's calls + // must never influence the main session's call timing (or vice versa) even + // though both run concurrently. Each scope tracks its own turn start / + // last-child-end, keyed by its own transcript file. + test("llm span start timing is isolated per scope across concurrent subagents", async () => { + const reader = new MultiFileTranscriptReader(); + const w = (path: string, record: Record) => + reader.append(path, JSON.stringify(record)); + const trace = withCapturedTrace(); + try { + const processor = new CodexEventProcessor( + "s", + createTestLogger(), + () => trace.spanFactory, + reader, + ); + await processor.process(configEvent({ session_id: "s" })); + + // Main turn starts at :10 and spawns a subagent; its own model call's output + // lands much later (:30), but the call began at the turn start (:10). + w(MAIN, { + timestamp: "2026-01-01T00:00:09Z", + type: "session_meta", + payload: { id: "s", cwd: "/work/app" }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:10Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "main-1" }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:10Z", + type: "response_item", + payload: { + type: "function_call", + name: "spawn_agent", + call_id: "call_spawn", + arguments: "{}", + metadata: { turn_id: "main-1" }, + }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:11Z", + type: "response_item", + payload: { type: "function_call_output", call_id: "call_spawn", output: "{}" }, + }); + await processor.process(sessionStart({ session_id: "s", transcript_path: MAIN })); + await processor.process( + postToolUse({ + session_id: "s", + transcript_path: MAIN, + tool_name: "spawn_agent", + tool_use_id: "call_spawn", + tool_response: JSON.stringify({ agent_id: "agent-1" }), + }), + ); + + // Subagent turn starts at :20 (its own clock); its model output lands at + // :25 and closes at :25. Its llm span must start at the SUBAGENT turn start + // (:20), not be affected by the main scope's :10/:11 boundaries. + w(SUB, { + timestamp: "2026-01-01T00:00:20Z", + type: "session_meta", + payload: { id: "agent-1" }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:20Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "sub-1" }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:25Z", + type: "response_item", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "sub" }], + }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:25Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } }, + }); + w(SUB, { + timestamp: "2026-01-01T00:00:26Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "sub-1", last_agent_message: "sub done" }, + }); + await processor.process( + subagentStart({ + session_id: "s", + transcript_path: SUB, + agent_id: "agent-1", + agent_type: "default", + }), + ); + await processor.process( + subagentStop({ + session_id: "s", + transcript_path: MAIN, + agent_transcript_path: SUB, + agent_id: "agent-1", + }), + ); + + // Main session's own model call: output lands at :30, closes at :30. Its + // llm span must start at the MAIN turn start (:10) — the most recent child + // of main-1 is the spawn_agent tool (ended :11), so it starts there. + w(MAIN, { + timestamp: "2026-01-01T00:00:30Z", + type: "response_item", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "main" }], + }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:30Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } }, + }); + w(MAIN, { + timestamp: "2026-01-01T00:00:31Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "main-1", last_agent_message: "all done" }, + }); + await processor.process(stop({ session_id: "s", transcript_path: MAIN, turn_id: "main-1" })); + await processor.flush(); + + const tree = spansToTree(await trace.drain()); + const mainTurn = tree?.children.find((c) => c.name === "turn: main-1"); + // Main llm: opens on the spawn_agent tool call (the model's first output, + // before any message), so it starts at the main turn's start (:10 = + // 1767225610) and runs until its token_count (:30). The single llm call + // spans the spawn tool and the later message. + const mainLlm = mainTurn?.children.find((c) => c.name === "llm"); + expect(mainLlm?.metrics?.start).toBe(1767225610); + expect(mainLlm?.metrics?.end).toBe(1767225630); + + // Subagent llm: starts at the SUBAGENT turn start (:20 = 1767225620), wholly + // independent of the main scope's boundaries; ends at its token_count (:25). + const subRoot = mainTurn?.children.find((c) => c.name === "subagent: agent-1"); + const subTurn = subRoot?.children.find((c) => c.name === "turn: sub-1"); + const subLlm = subTurn?.children.find((c) => c.name === "llm"); + expect(subLlm?.metrics?.start).toBe(1767225620); + expect(subLlm?.metrics?.end).toBe(1767225625); + } finally { + trace.cleanup(); + } + }); }); diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 5f21d41..6de6fcd 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -186,6 +186,15 @@ interface OpenLlm { turnId: string | undefined; /** Items produced by this call: reasoning, assistant text, and/or tool calls. */ output: ConversationItem[]; + /** + * Timestamp (Unix seconds) of this call's last model-output item (message / + * reasoning / tool call). This is when the model finished generating and is + * used as the span's end time — NOT the closing token_count's timestamp. Codex + * writes the token_count after the tool result (at the same instant as the + * tool output), so ending the span there would wrongly stretch it to swallow + * the tool's execution. The token_count only supplies token metrics. + */ + lastOutputTime: number | undefined; /** * When true, the span's output was already set at creation (e.g. a compaction * call, whose output is the replacement history), so closing it must not @@ -194,6 +203,25 @@ interface OpenLlm { outputPreset?: boolean; } +/** + * An open turn span plus the timing we need to place its child LLM spans. The + * transcript records a model call's output only when it lands (just before the + * call's token_count), with no record at request-send time — so an LLM span + * stamped from its first output item would look near-instant. Instead we start + * each LLM span at the end of the most recently-ended child of this turn + * (`lastChildEndTime`), or at the turn's own start (`startTime`) when it's the + * first child. That makes a call's span cover request→response, and consecutive + * children tile the turn. All per-turn (hence per-scope), so concurrent + * subagents never affect each other's timing. + */ +interface OpenTurn { + span: Span; + /** Turn start (Unix seconds), used as the first child's start time. */ + startTime: number | undefined; + /** Max end time (Unix seconds) among this turn's already-ended children. */ + lastChildEndTime: number | undefined; +} + /** * All parsing state tied to a single transcript file. The main session has one * scope (its parent is the root span); each subagent gets its own scope (its @@ -215,8 +243,9 @@ interface TranscriptScope { readonly parentSpan: () => Span | null; /** Byte offset already consumed from `path`. */ offset: number; - /** Turn spans currently open in this scope, keyed by turn_id. */ - readonly openTurns: Map; + /** Turn spans currently open in this scope, keyed by turn_id (with the timing + * used to place their child LLM spans). */ + readonly openTurns: Map; /** turn_ids whose Stop has fired but whose task_complete hasn't been seen. */ readonly turnsAwaitingCompletion: Set; /** OpenAI-style chat history (messages + reasoning), used to reconstruct each @@ -822,9 +851,9 @@ export class CodexEventProcessor implements EventProcessor { const endArgs = endTime !== undefined ? { endTime } : undefined; this.endOpenLlmSpan(scope, endTime); this.closeAllOpenTools(scope, endArgs); - for (const [, span] of scope.openTurns) { + for (const [, turn] of scope.openTurns) { try { - span.end(endArgs); + turn.span.end(endArgs); } catch (err) { this.logger.error("codex processor: failed to end dangling subagent turn span", { queueId: this.queueId, @@ -1098,7 +1127,7 @@ export class CodexEventProcessor implements EventProcessor { turnName, "task", ); - scope.openTurns.set(turnId, turnSpan); + scope.openTurns.set(turnId, { span: turnSpan, startTime, lastChildEndTime: undefined }); this.logger.info("codex processor: opened turn span", { queueId: this.queueId, turnId, @@ -1122,13 +1151,14 @@ export class CodexEventProcessor implements EventProcessor { // did: the conversation before it (input) and the kept context after (output). private handleCompacted(scope: TranscriptScope, record: TranscriptRecord): void { const turnId = this.latestOpenTurnId(scope); - const turnSpan = this.latestOpenTurn(scope); - if (turnId === undefined || turnSpan === undefined) { + const turn = this.latestOpenTurn(scope); + if (turnId === undefined || turn === undefined) { this.logger.debug("codex processor: compacted record with no open turn span", { queueId: this.queueId, }); return; } + const turnSpan = turn.span; const payload = (record.payload ?? {}) as Record; const replacement = Array.isArray(payload.replacement_history) ? (payload.replacement_history as unknown[]) @@ -1169,7 +1199,10 @@ export class CodexEventProcessor implements EventProcessor { // clear marker for the summary, which Codex stores encrypted and // does not expose to us. // Closed by the turn's token_count (metrics) like any other model call. - const startTime = isoToUnixSeconds(record.timestamp); + // Start the span at the turn's start (compaction is the sole child), not the + // record timestamp (which is when the compaction result lands), so its + // duration reflects the compaction call rather than looking instant. + const startTime = this.llmStartTimeFor(turn) ?? isoToUnixSeconds(record.timestamp); const before = scope.conversationHistory.map((item) => ({ ...item })); const output = compactionOutput(replacement); const compactionLlmName = scope.model ?? "compaction"; @@ -1190,8 +1223,16 @@ export class CodexEventProcessor implements EventProcessor { ); // Reuse the open-llm slot so the turn's token_count closes it with usage. // outputPreset: the replacement history is already logged as the output, so - // the closing token_count must not overwrite it. - scope.openLlm = { span, turnId, output: [], outputPreset: true }; + // the closing token_count must not overwrite it. lastOutputTime is the + // compacted record's time (when the compaction output landed); the closing + // token_count provides metrics only. + scope.openLlm = { + span, + turnId, + output: [], + outputPreset: true, + lastOutputTime: isoToUnixSeconds(record.timestamp), + }; } catch (err) { this.logger.error("codex processor: failed to open compaction llm span", { queueId: this.queueId, @@ -1229,15 +1270,15 @@ export class CodexEventProcessor implements EventProcessor { // prompt also arrives as a response_item user message (handleMessageItem), // which is the canonical source for history. This event only sets the turn's // input for display. - const turnSpan = this.latestOpenTurn(scope); - if (turnSpan === undefined) { + const turn = this.latestOpenTurn(scope); + if (turn === undefined) { this.logger.debug("codex processor: user_message with no open turn span", { queueId: this.queueId, }); return; } try { - turnSpan.log({ input: prompt }); + turn.span.log({ input: prompt }); } catch (err) { this.logger.error("codex processor: failed to set turn input", { queueId: this.queueId, @@ -1259,14 +1300,15 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - const turnSpan = scope.openTurns.get(turnId); - if (turnSpan === undefined) { + const turn = scope.openTurns.get(turnId); + if (turn === undefined) { this.logger.debug("codex processor: task_complete with no open turn span", { queueId: this.queueId, turnId, }); return; } + const turnSpan = turn.span; const endTime = isoToUnixSeconds(record.timestamp); // Backstop: close any spans still open without their own closing record // (a trailing model call with no token_count, or a tool with no output) at @@ -1305,22 +1347,26 @@ export class CodexEventProcessor implements EventProcessor { // (child of the active turn) on first use with input = the conversation so far. private ensureLlmSpan(scope: TranscriptScope, record: TranscriptRecord): void { if (scope.openLlm !== null) return; - const turnSpan = this.latestOpenTurn(scope); - if (turnSpan === undefined) { + const turn = this.latestOpenTurn(scope); + if (turn === undefined) { this.logger.debug("codex processor: model output with no open turn; skipping llm span", { queueId: this.queueId, }); return; } const turnId = this.latestOpenTurnId(scope); - const startTime = isoToUnixSeconds(record.timestamp); + // Start the span where the model's work for this call actually began — the + // end of the turn's most recently-ended child, or the turn's start when this + // is the first child — NOT the record timestamp (which is when the model's + // output landed, just before its token_count, yielding a near-instant span). + const startTime = this.llmStartTimeFor(turn) ?? isoToUnixSeconds(record.timestamp); // Snapshot the conversation as this call's input (the transcript doesn't log // the real request, so this is a best-effort reconstruction). const input = scope.conversationHistory.map((item) => ({ ...item })); const llmName = scope.model ?? "llm"; try { const span = this.trackSpan( - turnSpan.startSpan({ + turn.span.startSpan({ name: llmName, type: "llm", ...(startTime !== undefined ? { startTime } : {}), @@ -1329,7 +1375,7 @@ export class CodexEventProcessor implements EventProcessor { llmName, "llm", ); - scope.openLlm = { span, turnId, output: [] }; + scope.openLlm = { span, turnId, output: [], lastOutputTime: startTime }; this.logger.info("codex processor: opened llm span", { queueId: this.queueId, turnId, @@ -1343,6 +1389,19 @@ export class CodexEventProcessor implements EventProcessor { } } + // Record that the open LLM call produced an output item at `record`'s time, + // advancing its lastOutputTime. The LLM span ends at this time (when the model + // last generated), not at the closing token_count (which Codex writes after + // the tool result). No-op if no LLM span is open. + private noteLlmOutputTime(scope: TranscriptScope, record: TranscriptRecord): void { + if (scope.openLlm === null) return; + const t = isoToUnixSeconds(record.timestamp); + if (t === undefined) return; + if (scope.openLlm.lastOutputTime === undefined || t > scope.openLlm.lastOutputTime) { + scope.openLlm.lastOutputTime = t; + } + } + // Handle a response_item message: an assistant message is this call's output // (opens/feeds the LLM span); user/developer messages are input context. All // are appended to conversation history as chat messages for reconstructing @@ -1359,6 +1418,7 @@ export class CodexEventProcessor implements EventProcessor { if (role === "assistant") { this.ensureLlmSpan(scope, record); scope.openLlm?.output.push(msg); + this.noteLlmOutputTime(scope, record); } scope.conversationHistory.push(msg); } @@ -1370,6 +1430,9 @@ export class CodexEventProcessor implements EventProcessor { // encrypted, so items with no readable summary only open the span. private handleReasoningItem(scope: TranscriptScope, record: TranscriptRecord): void { this.ensureLlmSpan(scope, record); + // The reasoning item is a model output at this time, even when its summary is + // encrypted (empty) and adds nothing readable — so advance the span's end. + this.noteLlmOutputTime(scope, record); const payload = (record.payload ?? {}) as Record; const summary = reasoningSummary(payload.summary); if (summary === undefined) return; @@ -1387,11 +1450,20 @@ export class CodexEventProcessor implements EventProcessor { const info = (payload.info ?? {}) as Record; const usage = (info.last_token_usage ?? {}) as Record; const metrics = tokenMetrics(usage); - const endTime = isoToUnixSeconds(record.timestamp); - const { span, output, outputPreset } = scope.openLlm; + const { span, turnId, output, outputPreset, lastOutputTime } = scope.openLlm; + // End the span when the model last generated (its last output item), NOT at + // this token_count: Codex writes the token_count after the tool result (at + // the same instant as the tool output), so using it would stretch the span to + // swallow the tool's execution — making the llm span overlap the tool it + // called, which is causally impossible. The token_count only supplies metrics. + // Fall back to the token_count time if we somehow have no output time. + const endTime = lastOutputTime ?? isoToUnixSeconds(record.timestamp); try { span.log(outputPreset ? { metrics } : { output: llmOutput(output), metrics }); span.end(endTime !== undefined ? { endTime } : undefined); + // This LLM call is a child of its turn; advance the turn's boundary so the + // next child (LLM or tool) starts where this one ended (its last output). + this.noteChildEnded(scope, turnId, endTime); this.logger.info("codex processor: ended llm span", { queueId: this.queueId }); } catch (err) { this.logger.error("codex processor: failed to end llm span", { @@ -1404,12 +1476,17 @@ export class CodexEventProcessor implements EventProcessor { } // End any open LLM span without a closing token_count (e.g. on turn/root end). + // Prefer the model's last-output time (when generation actually ended) over the + // caller's fallback endTime (the turn/root end), so a trailing call isn't + // stretched to the turn boundary. private endOpenLlmSpan(scope: TranscriptScope, endTime: number | undefined): void { if (scope.openLlm === null) return; - const { span, output } = scope.openLlm; + const { span, turnId, output, lastOutputTime } = scope.openLlm; + const effectiveEnd = lastOutputTime ?? endTime; try { if (output.length > 0) span.log({ output: llmOutput(output) }); - span.end(endTime !== undefined ? { endTime } : undefined); + span.end(effectiveEnd !== undefined ? { endTime: effectiveEnd } : undefined); + this.noteChildEnded(scope, turnId, effectiveEnd); } catch (err) { this.logger.error("codex processor: failed to end dangling llm span", { queueId: this.queueId, @@ -1442,9 +1519,9 @@ export class CodexEventProcessor implements EventProcessor { // function_call uses `arguments` (a JSON string); custom_tool_call uses `input`. const input = payload.arguments ?? payload.input; - // A tool call is part of the assistant's response: record it as an assistant - // message carrying a tool_call, both in the open LLM call's output and in the - // conversation history (input context for later calls). + // Record the tool call as an assistant message carrying a tool_call in the + // conversation history (input context for later calls). Done regardless of + // whether we end up creating a span, so reconstructed history stays faithful. const argsString = typeof input === "string" ? input : JSON.stringify(input ?? {}); const toolCallMsg: ChatMessage = { role: "assistant", @@ -1457,7 +1534,6 @@ export class CodexEventProcessor implements EventProcessor { }, ], }; - scope.openLlm?.output.push(toolCallMsg); scope.conversationHistory.push(toolCallMsg); if (callId === undefined) { @@ -1474,8 +1550,8 @@ export class CodexEventProcessor implements EventProcessor { }); return; } - const turnSpan = scope.openTurns.get(turnId); - if (turnSpan === undefined) { + const turn = scope.openTurns.get(turnId); + if (turn === undefined) { this.logger.warn("codex processor: tool call with no open turn span; skipping", { queueId: this.queueId, turnId, @@ -1483,6 +1559,19 @@ export class CodexEventProcessor implements EventProcessor { }); return; } + const turnSpan = turn.span; + + // A tool call IS a model output, so it belongs to the current LLM call. Open + // the LLM span if one isn't already open (a call whose only output is a tool + // call — no preceding assistant message or readable reasoning — would + // otherwise leave the model's work, often many seconds, with no llm span, + // appearing as an unaccounted gap at the front of the turn). Record the tool + // call as this LLM call's output too. Done only once we know the tool call is + // valid (real open turn), so a tool call for an unknown turn opens nothing. + this.ensureLlmSpan(scope, record); + scope.openLlm?.output.push(toolCallMsg); + // The model emitted this tool call now — that's when its generation ended. + this.noteLlmOutputTime(scope, record); if (scope.openTools.has(callId)) { this.logger.warn("codex processor: duplicate call_id; keeping existing tool span", { queueId: this.queueId, @@ -1563,6 +1652,9 @@ export class CodexEventProcessor implements EventProcessor { try { entry.span.log({ output }); entry.span.end(endTime !== undefined ? { endTime } : undefined); + // A tool is a child of its turn; advance the turn's boundary so a following + // LLM call (the model resuming after the tool result) starts here. + this.noteChildEnded(scope, entry.turnId, endTime); this.logger.info("codex processor: ended tool span", { queueId: this.queueId, callId }); } catch (err) { this.logger.error("codex processor: failed to end tool span", { @@ -1621,12 +1713,37 @@ export class CodexEventProcessor implements EventProcessor { // The most recently opened turn that is still open in this scope (used to // attach a user_message or an LLM span, which carry no turn_id). - private latestOpenTurn(scope: TranscriptScope): Span | undefined { - let last: Span | undefined; - for (const span of scope.openTurns.values()) last = span; + private latestOpenTurn(scope: TranscriptScope): OpenTurn | undefined { + let last: OpenTurn | undefined; + for (const turn of scope.openTurns.values()) last = turn; return last; } + // The start time (Unix seconds) to use for an LLM span opening now under the + // given turn: the end of the turn's most recently-ended child if any, else the + // turn's own start. This makes the LLM span cover request->response (the + // transcript has no record at request-send time) and lets consecutive children + // tile the turn. Per-turn state, so concurrent subagents never interfere. + private llmStartTimeFor(turn: OpenTurn): number | undefined { + return turn.lastChildEndTime ?? turn.startTime; + } + + // Record that a child of the given turn ended at `endTime`, advancing the + // turn's lastChildEndTime so the next LLM span starts where this child ended. + // No-op when endTime is unknown or the turn isn't open. + private noteChildEnded( + scope: TranscriptScope, + turnId: string | undefined, + endTime: number | undefined, + ): void { + if (turnId === undefined || endTime === undefined) return; + const turn = scope.openTurns.get(turnId); + if (turn === undefined) return; + if (turn.lastChildEndTime === undefined || endTime > turn.lastChildEndTime) { + turn.lastChildEndTime = endTime; + } + } + // The turn_id of the most recently opened still-open turn in this scope. private latestOpenTurnId(scope: TranscriptScope): string | undefined { let last: string | undefined; @@ -1661,9 +1778,9 @@ export class CodexEventProcessor implements EventProcessor { if (scope !== null) { this.endOpenLlmSpan(scope, endTime); this.closeAllOpenTools(scope, endArgs); - for (const [turnId, span] of scope.openTurns) { + for (const [turnId, turn] of scope.openTurns) { try { - span.end(endArgs); + turn.span.end(endArgs); } catch (err) { this.logger.error("codex processor: failed to end dangling turn span on root end", { queueId: this.queueId, @@ -1803,9 +1920,11 @@ export class CodexEventProcessor implements EventProcessor { path: scope.path, kind: scope.kind, offset: scope.offset, - openTurns: Array.from(scope.openTurns, ([turnId, span]) => ({ + openTurns: Array.from(scope.openTurns, ([turnId, turn]) => ({ turnId, - span: this.refFor(span), + span: this.refFor(turn.span), + startTime: turn.startTime, + lastChildEndTime: turn.lastChildEndTime, })), turnsAwaitingCompletion: Array.from(scope.turnsAwaitingCompletion), conversationHistory: toItemSnapshots(scope.conversationHistory), @@ -1816,6 +1935,7 @@ export class CodexEventProcessor implements EventProcessor { span: this.refFor(scope.openLlm.span), turnId: scope.openLlm.turnId, output: toItemSnapshots(scope.openLlm.output), + lastOutputTime: scope.openLlm.lastOutputTime, outputPreset: scope.openLlm.outputPreset, }, openTools: Array.from(scope.openTools, ([callId, entry]) => ({ @@ -1914,7 +2034,16 @@ export class CodexEventProcessor implements EventProcessor { kind: s.kind, parentSpan: isMain ? () => this.rootSpan : () => scope.subagentRootSpan, offset: s.offset, - openTurns: new Map(s.openTurns.map((t) => [t.turnId, rehydrate(t.span)])), + openTurns: new Map( + s.openTurns.map((t) => [ + t.turnId, + { + span: rehydrate(t.span), + startTime: t.startTime, + lastChildEndTime: t.lastChildEndTime, + }, + ]), + ), turnsAwaitingCompletion: new Set(s.turnsAwaitingCompletion), conversationHistory: fromItemSnapshots(s.conversationHistory), openLlm: @@ -1924,6 +2053,7 @@ export class CodexEventProcessor implements EventProcessor { span: rehydrate(s.openLlm.span), turnId: s.openLlm.turnId, output: fromItemSnapshots(s.openLlm.output), + lastOutputTime: s.openLlm.lastOutputTime, outputPreset: s.openLlm.outputPreset, }, openTools: new Map( diff --git a/plugins/trace-codex/src/agents/codex/resume.test.ts b/plugins/trace-codex/src/agents/codex/resume.test.ts index c618a7e..293234d 100644 --- a/plugins/trace-codex/src/agents/codex/resume.test.ts +++ b/plugins/trace-codex/src/agents/codex/resume.test.ts @@ -32,7 +32,7 @@ import { type TraceEntry, taskComplete, taskStarted, - type transcript, + transcript, turnContext, userMessage, } from "./test-helpers.ts"; @@ -309,6 +309,8 @@ describe("CodexEventProcessor: resume from snapshot", () => { output: "tool done", ended: true, children: [ + // The model call that emitted the tool call opens an llm span. + { span_attributes: { name: "gpt-5.5", type: "llm" }, ended: true }, { span_attributes: { name: "shell", type: "tool" }, output: "ok", @@ -324,6 +326,90 @@ describe("CodexEventProcessor: resume from snapshot", () => { } }); + // The per-turn timing used to place LLM spans (turn start + last-child-end) + // must survive a restart, so an LLM call that opens AFTER the restart still + // starts at the right place (here: the turn's start, since it's the first + // child) rather than at its post-restart output record. + test("turn timing survives a restart so a resumed llm span starts correctly", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-timing"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: the turn opens at :11 but the model hasn't produced output yet when + // the server shuts down (no llm span exists yet — only the open turn). + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "turn_context", + payload: { model: "gpt-5.5" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + // The open turn's start time was persisted. + const snap = store.read(queueId); + expect(snap?.scopes[0]?.openTurns[0]?.turnId).toBe("turn-1"); + expect(snap?.scopes[0]?.openTurns[0]?.startTime).toBe(1767225611); + + // Run 2: a fresh processor. The model output finally lands (:18) and the + // call closes (:18). The resumed llm span must start at the turn's start + // (:11, restored), not at its output record (:18). + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:18Z", + type: "response_item", + payload: { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "hi" }], + }, + }), + transcript({ + timestamp: "2026-01-01T00:00:18Z", + type: "event_msg", + payload: { type: "token_count", info: { last_token_usage: { total_tokens: 1 } } }, + }), + transcript({ + timestamp: "2026-01-01T00:00:19Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "hi" }, + }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const spans = await trace.drain(); + const tree = spansToTree(spans); + const turn = tree?.children.find((c) => c.name === "turn: turn-1"); + const llm = turn?.children.find((c) => c.name === "gpt-5.5" || c.name === "llm"); + // Started at the restored turn start (:11), ended at token_count (:18). + expect(llm?.metrics?.start).toBe(1767225611); + expect(llm?.metrics?.end).toBe(1767225618); + } finally { + trace.cleanup(); + } + }); + test("a version-incompatible snapshot is discarded and the session starts fresh", async () => { const trace = withCapturedTrace(); try { diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.ts index 979fd41..23fb629 100644 --- a/plugins/trace-codex/src/agents/codex/state-snapshot.ts +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.ts @@ -30,7 +30,7 @@ import type { ReportingConfig, SpanRef } from "../../braintrust/logger.ts"; * fresh, losing only resume continuity), so we never try to rehydrate state we * can't interpret. */ -export const SNAPSHOT_SCHEMA_VERSION = 1; +export const SNAPSHOT_SCHEMA_VERSION = 3; /** A conversation item (chat message or reasoning), stored verbatim. These are * plain JSON already, so they round-trip without transformation. */ @@ -41,6 +41,8 @@ export interface OpenLlmSnapshot { span: SpanRef; turnId: string | undefined; output: ConversationItemSnapshot[]; + /** Time (Unix seconds) of the call's last model-output item; the span's end. */ + lastOutputTime: number | undefined; outputPreset?: boolean; } @@ -51,10 +53,15 @@ export interface OpenToolSnapshot { turnId: string; } -/** An open turn span (keyed by turn_id in the map). */ +/** An open turn span (keyed by turn_id in the map), plus the timing used to + * place its child LLM spans. */ export interface OpenTurnSnapshot { turnId: string; span: SpanRef; + /** Turn start (Unix seconds), used as the first child's start time. */ + startTime: number | undefined; + /** Max end time (Unix seconds) among the turn's already-ended children. */ + lastChildEndTime: number | undefined; } /** Pending-subagent info captured before its root span exists. The parent turn From 7c42c247d8245d1905c7e9d5704a61fd5b426d1f Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Tue, 23 Jun 2026 14:52:39 -0600 Subject: [PATCH 15/17] fix session resume bug --- .../src/agents/codex/event-processor.ts | 99 +++++--- .../src/agents/codex/resume.test.ts | 213 +++++++++++++++++- .../src/agents/codex/snapshot-store.test.ts | 4 +- .../src/agents/codex/state-snapshot.test.ts | 4 +- .../src/agents/codex/state-snapshot.ts | 21 +- plugins/trace-codex/src/braintrust/logger.ts | 21 +- plugins/trace-codex/src/test-helpers.ts | 1 + 7 files changed, 310 insertions(+), 53 deletions(-) diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 6de6fcd..55fbfe2 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -423,6 +423,11 @@ export class CodexEventProcessor implements EventProcessor { private reportingConfig: ReportingConfig | undefined; private rootSpan: Span | null = null; private rootEnded = false; + // The time (Unix seconds) the root span was ended, if it has been. Persisted so + // that on resume — where rehydrating the root necessarily re-emits its row — we + // can re-assert the end time and keep the root closed, rather than leaving it + // looking open (a rehydrated handle is created with no end). + private rootEndTime: number | undefined; // Root enrichment from the SessionStart hook (source/permission_mode), which // the transcript lacks. Applied when the root span is created, or patched onto // the root if it already exists. @@ -461,7 +466,10 @@ export class CodexEventProcessor implements EventProcessor { // doesn't expose these as getters, but a rehydrated span must re-assert them // (else the SDK infers a wrong name from the call stack on resume), so we // remember them here and feed them into each SpanRef at serialize time. - private readonly spanAttrs = new Map(); + private readonly spanAttrs = new Map< + string, + { name: string; type: SpanRef["type"]; startTime: number | undefined } + >(); // Optional per-session state store. When present, the processor persists a // snapshot of its resumable state on flush and, on its first event, rehydrates // any snapshot left by a previous server run for this session (so a session @@ -815,6 +823,7 @@ export class CodexEventProcessor implements EventProcessor { }), subagentName, "task", + startTime, ); this.logger.info("codex processor: opened subagent root span", { queueId: this.queueId, @@ -1057,6 +1066,7 @@ export class CodexEventProcessor implements EventProcessor { }), spanName, "task", + startTime, ); this.logger.info("codex processor: opened root span", { queueId: this.queueId, @@ -1126,6 +1136,7 @@ export class CodexEventProcessor implements EventProcessor { }), turnName, "task", + startTime, ); scope.openTurns.set(turnId, { span: turnSpan, startTime, lastChildEndTime: undefined }); this.logger.info("codex processor: opened turn span", { @@ -1171,8 +1182,12 @@ export class CodexEventProcessor implements EventProcessor { try { turnSpan.setAttributes({ name: "compaction", type: "task" }); // Remember the rename so a resumed snapshot re-asserts "compaction", not - // the original "turn: ..." name. - this.spanAttrs.set(turnSpan.spanId, { name: "compaction", type: "task" }); + // the original "turn: ..." name. Preserve the span's original start time. + this.spanAttrs.set(turnSpan.spanId, { + name: "compaction", + type: "task", + startTime: this.spanAttrs.get(turnSpan.spanId)?.startTime, + }); turnSpan.log({ metadata: { compaction: { @@ -1220,6 +1235,7 @@ export class CodexEventProcessor implements EventProcessor { }), compactionLlmName, "llm", + startTime, ); // Reuse the open-llm slot so the turn's token_count closes it with usage. // outputPreset: the replacement history is already logged as the output, so @@ -1374,6 +1390,7 @@ export class CodexEventProcessor implements EventProcessor { }), llmName, "llm", + startTime, ); scope.openLlm = { span, turnId, output: [], lastOutputTime: startTime }; this.logger.info("codex processor: opened llm span", { @@ -1598,6 +1615,7 @@ export class CodexEventProcessor implements EventProcessor { }), toolName, "tool", + startTime, ); scope.openTools.set(callId, { span: toolSpan, turnId }); // Remember the TURN span that ran a spawn_agent tool (main scope only) so a @@ -1771,6 +1789,7 @@ export class CodexEventProcessor implements EventProcessor { private endRootSpan(endTime: number | undefined): void { if (this.rootSpan === null || this.rootEnded) return; this.rootEnded = true; + this.rootEndTime = endTime; const endArgs = endTime !== undefined ? { endTime } : undefined; const scope = this.mainScope; // Backstop: close any open LLM span, any still-open tool spans, then any @@ -1898,15 +1917,22 @@ export class CodexEventProcessor implements EventProcessor { // Remember the name/type a span was created (or renamed) with, so it can be // re-asserted on rehydration. Returns the span unchanged for call-site // convenience. - private trackSpan(span: Span, name: string, type: SpanRef["type"]): Span { - this.spanAttrs.set(span.spanId, { name, type }); + private trackSpan( + span: Span, + name: string, + type: SpanRef["type"], + startTime: number | undefined, + ): Span { + this.spanAttrs.set(span.spanId, { name, type, startTime }); return span; } - // Capture a span's identity plus its remembered name/type for the snapshot. + // Capture a span's identity plus its remembered name/type/start for the + // snapshot, so a rehydrated handle keeps them (and isn't re-stamped to the + // resume time). private refFor(span: Span): SpanRef { const attrs = this.spanAttrs.get(span.spanId); - return spanRef(span, attrs?.name, attrs?.type); + return spanRef(span, attrs?.name, attrs?.type, attrs?.startTime); } // Build a serializable snapshot of all resumable state. Span handles are @@ -1967,25 +1993,23 @@ export class CodexEventProcessor implements EventProcessor { reportingConfig: redactReportingConfig(this.reportingConfig), rootSpan: this.rootSpan === null ? null : this.refFor(this.rootSpan), rootEnded: this.rootEnded, + rootEndTime: this.rootEndTime, rootEnrichment: { ...this.rootEnrichment }, mainScopePath: this.mainScope?.path ?? null, scopes, - spawnTurnSpansByCallId: Array.from(this.spawnTurnSpansByCallId, ([key, span]) => ({ - key, - span: this.refFor(span), - })), - spawnTurnSpansByAgentId: Array.from(this.spawnTurnSpansByAgentId, ([key, span]) => ({ - key, - span: this.refFor(span), - })), + // We deliberately do NOT persist the span-bearing side-maps + // (spawnTurnSpansBy*, compactionSpansByTurn). They retain references to + // spans that have since CLOSED, and rehydrating a closed span re-opens it + // (the SDK stamps a fresh start and no end), producing duplicate, never- + // closing spans on resume. The only things they enable across a restart are + // rare in-flight reconciliations (nesting a subagent spawned right before + // the restart; back-filling a compaction trigger whose hook lands after the + // restart) — acceptable to lose. The trigger map is plain strings (no span + // refs), so it's harmless to keep. compactionTriggerByTurn: Array.from(this.compactionTriggerByTurn, ([key, value]) => ({ key, value, })), - compactionSpansByTurn: Array.from(this.compactionSpansByTurn, ([key, span]) => ({ - key, - span: this.refFor(span), - })), }; } @@ -2010,19 +2034,39 @@ export class CodexEventProcessor implements EventProcessor { } const factory = this.spanFactory; - // Rehydrate a handle and re-remember its name/type, so if this resumed + // Rehydrate a handle and re-remember its name/type/start, so if this resumed // processor is itself snapshotted again, the attributes survive the next hop. const rehydrate = (ref: SpanRef): Span => { const span = factory.rehydrateSpan(ref); if (ref.name !== undefined && ref.type !== undefined) { - this.spanAttrs.set(span.spanId, { name: ref.name, type: ref.type }); + this.spanAttrs.set(span.spanId, { + name: ref.name, + type: ref.type, + startTime: ref.startTime, + }); } return span; }; this.rootSpan = snapshot.rootSpan === null ? null : rehydrate(snapshot.rootSpan); this.rootEnded = snapshot.rootEnded; + this.rootEndTime = snapshot.rootEndTime; this.rootEnrichment = { ...snapshot.rootEnrichment }; + // Rehydrating the root necessarily re-emits its row with no end time. If the + // root was already ended before the restart, re-assert that end now so it + // stays closed (a later turn can still attach as a child of an ended root). + if (this.rootEnded && this.rootSpan !== null) { + try { + this.rootSpan.end( + this.rootEndTime !== undefined ? { endTime: this.rootEndTime } : undefined, + ); + } catch (err) { + this.logger.error("codex processor: failed to re-end root span on restore", { + queueId: this.queueId, + error: String(err), + }); + } + } for (const s of snapshot.scopes) { const isMain = s.path === snapshot.mainScopePath; @@ -2077,18 +2121,13 @@ export class CodexEventProcessor implements EventProcessor { if (isMain) this.mainScope = scope; } - for (const e of snapshot.spawnTurnSpansByCallId) { - this.spawnTurnSpansByCallId.set(e.key, rehydrate(e.span)); - } - for (const e of snapshot.spawnTurnSpansByAgentId) { - this.spawnTurnSpansByAgentId.set(e.key, rehydrate(e.span)); - } + // Span-bearing side-maps (spawnTurnSpansBy*, compactionSpansByTurn) are not + // persisted — see serialize() — so there is nothing to rehydrate for them, + // which is what keeps a resume from re-opening already-closed spans. Only the + // plain-string trigger map is restored. for (const e of snapshot.compactionTriggerByTurn) { this.compactionTriggerByTurn.set(e.key, e.value); } - for (const e of snapshot.compactionSpansByTurn) { - this.compactionSpansByTurn.set(e.key, rehydrate(e.span)); - } } // Read and process new transcript records for every scope, retrying on a diff --git a/plugins/trace-codex/src/agents/codex/resume.test.ts b/plugins/trace-codex/src/agents/codex/resume.test.ts index 293234d..488bee4 100644 --- a/plugins/trace-codex/src/agents/codex/resume.test.ts +++ b/plugins/trace-codex/src/agents/codex/resume.test.ts @@ -21,6 +21,7 @@ import { CodexEventProcessor } from "./event-processor.ts"; import { SnapshotStore } from "./snapshot-store.ts"; import type { CodexSnapshot } from "./state-snapshot.ts"; import { + compacted, configEvent, FakeTranscriptReader, functionCall, @@ -32,6 +33,7 @@ import { type TraceEntry, taskComplete, taskStarted, + tokenCount, transcript, turnContext, userMessage, @@ -426,13 +428,11 @@ describe("CodexEventProcessor: resume from snapshot", () => { reportingConfig: { traceToBraintrust: true }, rootSpan: { spanId: "stale", rootSpanId: "stale", spanParents: [] }, rootEnded: false, + rootEndTime: undefined, rootEnrichment: {}, mainScopePath: "/old.jsonl", scopes: [], - spawnTurnSpansByCallId: [], - spawnTurnSpansByAgentId: [], compactionTriggerByTurn: [], - compactionSpansByTurn: [], }); const deps = { queueId, reader, factory: trace.spanFactory, store }; @@ -461,4 +461,211 @@ describe("CodexEventProcessor: resume from snapshot", () => { trace.cleanup(); } }); + + // Regression for the duplicate/never-closing compaction span: a compaction that + // FINISHED before the restart must not be re-opened on resume. (Previously the + // compaction span lived forever in a side-map and was rehydrated — re-emitting + // it with a fresh start and no end.) + test("a completed compaction is not re-opened on resume", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-compact"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a turn, then a compaction turn that completes, all before shutdown. + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "turn_context", + payload: { model: "gpt-5.5" }, + }), + // A normal turn. + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + userMessage({ message: "hi" }), + taskComplete({ turn_id: "turn-1", last_agent_message: "hello" }), + stop({ turn_id: "turn-1" }), + // A compaction turn that fully completes. + transcript({ + timestamp: "2026-01-01T00:00:20Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-2" }, + }), + compacted({ + replacement_history: [{ type: "message", role: "user", content: [] }], + window_id: 1, + }), + tokenCount({ total_tokens: 5 }), + transcript({ + timestamp: "2026-01-01T00:00:22Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-2", last_agent_message: "" }, + }), + // A non-terminal hook drives the catch-up that consumes the above. + preToolUse({ session_id: queueId }), + ], + deps, + ); + + // The compaction span finished in run 1. + let spans = await trace.drain(); + const compaction1 = spans.find((s) => s.span_attributes?.name === "compaction"); + expect(compaction1?.metrics?.end).toBeDefined(); + const compactionId = compaction1?.span_id; + + // Run 2: a new processor resumes and runs another turn. + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:30Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-3" }, + }), + userMessage({ message: "again" }), + taskComplete({ turn_id: "turn-3", last_agent_message: "ok" }), + stop({ turn_id: "turn-3" }), + ], + deps, + ); + + spans = await trace.drain(); + // The compaction span must NOT have been re-emitted (reopened) on resume: + // run 2 produces no row for its span_id. + expect(spans.some((s) => s.span_id === compactionId)).toBe(false); + } finally { + trace.cleanup(); + } + }); + + // The root span, ended before a restart, must keep its original start AND end + // after resume — rehydrating it re-emits its row, so we re-assert the end. + test("a root ended before a restart keeps its original start and end", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-rootend"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + // Run 1: a full turn; its Stop ends the root at the turn's end (:13). + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + userMessage({ message: "hi" }), + transcript({ + timestamp: "2026-01-01T00:00:13Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "hello" }, + }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const snap = store.read(queueId); + expect(snap?.rootEnded).toBe(true); + expect(snap?.rootEndTime).toBe(1767225613); + + // Run 2: a new processor resumes (a second turn arrives). + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:20Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-2" }, + }), + userMessage({ message: "again" }), + taskComplete({ turn_id: "turn-2", last_agent_message: "ok" }), + stop({ turn_id: "turn-2" }), + ], + deps, + ); + + const spans = await trace.drain(); + const tree = spansToTree(spans); + // The root kept its original start (:10) and end (:13) — not corrupted to + // the resume time, not left open. + expect(tree?.metrics?.start).toBe(1767225610); + expect(tree?.metrics?.end).toBe(1767225613); + // The second turn still attached under the same root. + expect(tree?.children.some((c) => c.name === "turn: turn-2")).toBe(true); + } finally { + trace.cleanup(); + } + }); + + // An OPEN span spanning the restart must keep its original start (the SDK would + // otherwise stamp the rehydration time). Here the turn opens at :11 in run 1 + // and is closed in run 2. + test("an open turn span keeps its original start across a restart", async () => { + const trace = withCapturedTrace(); + try { + const queueId = "sess-openstart"; + const reader = new FakeTranscriptReader(); + const store = new MemorySnapshotStore(); + const deps = { queueId, reader, factory: trace.spanFactory, store }; + + await runThrough( + [ + configEvent({ session_id: queueId }), + transcript({ + timestamp: "2026-01-01T00:00:10Z", + type: "session_meta", + payload: { id: queueId, cwd: "/tmp/proj" }, + }), + transcript({ + timestamp: "2026-01-01T00:00:11Z", + type: "event_msg", + payload: { type: "task_started", turn_id: "turn-1" }, + }), + userMessage({ message: "hi" }), + sessionStart({ session_id: queueId, source: "startup" }), + preToolUse({ session_id: queueId }), + ], + deps, + ); + + await runThrough( + [ + transcript({ + timestamp: "2026-01-01T00:00:30Z", + type: "event_msg", + payload: { type: "task_complete", turn_id: "turn-1", last_agent_message: "done" }, + }), + stop({ turn_id: "turn-1" }), + ], + deps, + ); + + const tree = spansToTree(await trace.drain()); + const turn = tree?.children.find((c) => c.name === "turn: turn-1"); + // Start preserved from run 1 (:11), end from run 2 (:30) — not the resume time. + expect(turn?.metrics?.start).toBe(1767225611); + expect(turn?.metrics?.end).toBe(1767225630); + } finally { + trace.cleanup(); + } + }); }); diff --git a/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts index 5eb1b34..2a85a85 100644 --- a/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts +++ b/plugins/trace-codex/src/agents/codex/snapshot-store.test.ts @@ -15,13 +15,11 @@ function makeSnapshot(sessionId: string): CodexSnapshot { reportingConfig: { project: "p", traceToBraintrust: true }, rootSpan: { spanId: "r", rootSpanId: "r", spanParents: [] }, rootEnded: false, + rootEndTime: undefined, rootEnrichment: { source: "startup" }, mainScopePath: "/t.jsonl", scopes: [], - spawnTurnSpansByCallId: [], - spawnTurnSpansByAgentId: [], compactionTriggerByTurn: [], - compactionSpansByTurn: [], }; } diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts index 63efdc9..5498993 100644 --- a/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.test.ts @@ -15,13 +15,11 @@ function snapshot(overrides: Partial = {}): CodexSnapshot { reportingConfig: undefined, rootSpan: null, rootEnded: false, + rootEndTime: undefined, rootEnrichment: {}, mainScopePath: null, scopes: [], - spawnTurnSpansByCallId: [], - spawnTurnSpansByAgentId: [], compactionTriggerByTurn: [], - compactionSpansByTurn: [], ...overrides, }; } diff --git a/plugins/trace-codex/src/agents/codex/state-snapshot.ts b/plugins/trace-codex/src/agents/codex/state-snapshot.ts index 23fb629..0564ee4 100644 --- a/plugins/trace-codex/src/agents/codex/state-snapshot.ts +++ b/plugins/trace-codex/src/agents/codex/state-snapshot.ts @@ -30,7 +30,7 @@ import type { ReportingConfig, SpanRef } from "../../braintrust/logger.ts"; * fresh, losing only resume continuity), so we never try to rehydrate state we * can't interpret. */ -export const SNAPSHOT_SCHEMA_VERSION = 3; +export const SNAPSHOT_SCHEMA_VERSION = 4; /** A conversation item (chat message or reasoning), stored verbatim. These are * plain JSON already, so they round-trip without transformation. */ @@ -90,12 +90,6 @@ export interface ScopeSnapshot { pendingSubagent?: PendingSubagentSnapshot; } -/** A side-map entry pairing a key (turn_id or call_id or agent_id) with a span. */ -export interface SpanMapEntry { - key: string; - span: SpanRef; -} - /** A side-map entry pairing a key with a string value. */ export interface StringMapEntry { key: string; @@ -122,16 +116,23 @@ export interface CodexSnapshot { rootSpan: SpanRef | null; rootEnded: boolean; + /** When the root was ended (Unix seconds), if it has been; re-asserted on + * resume so the rehydrated root row stays closed. */ + rootEndTime: number | undefined; rootEnrichment: { source?: string; permissionMode?: string }; /** Path of the main scope within `scopes`, if it exists. */ mainScopePath: string | null; scopes: ScopeSnapshot[]; - spawnTurnSpansByCallId: SpanMapEntry[]; - spawnTurnSpansByAgentId: SpanMapEntry[]; + /** + * Compaction triggers ("manual"/"auto") keyed by turn_id. Plain strings (no + * span references), so safe to carry across a restart. The span-bearing + * side-maps (spawn-agent turn spans, compaction spans) are deliberately NOT + * persisted: they hold references to spans that may have closed, and + * rehydrating a closed span re-opens it. + */ compactionTriggerByTurn: StringMapEntry[]; - compactionSpansByTurn: SpanMapEntry[]; } /** A parsed snapshot is usable only if it matches our plugin + schema version. */ diff --git a/plugins/trace-codex/src/braintrust/logger.ts b/plugins/trace-codex/src/braintrust/logger.ts index a9b8598..4b95023 100644 --- a/plugins/trace-codex/src/braintrust/logger.ts +++ b/plugins/trace-codex/src/braintrust/logger.ts @@ -31,20 +31,32 @@ export interface SpanRef { */ name?: string; type?: StartSpanArgs["type"]; + /** + * The span's original start time (Unix seconds). Re-asserted on rehydration so + * the merged row keeps its original start — without it the SDK stamps the start + * to the moment of rehydration (the resume time), corrupting the span's timing. + */ + startTime?: number; } /** - * Capture a live span's identity for later rehydration. `name`/`type` are the - * attributes the span was created with (the SDK doesn't expose them as getters), - * passed by the caller so they can be re-asserted on resume. + * Capture a live span's identity for later rehydration. `name`/`type`/`startTime` + * are the attributes the span was created with (the SDK doesn't expose them as + * getters), passed by the caller so they can be re-asserted on resume. */ -export function spanRef(span: Span, name?: string, type?: StartSpanArgs["type"]): SpanRef { +export function spanRef( + span: Span, + name?: string, + type?: StartSpanArgs["type"], + startTime?: number, +): SpanRef { return { spanId: span.spanId, rootSpanId: span.rootSpanId, spanParents: span.spanParents, ...(name !== undefined ? { name } : {}), ...(type !== undefined ? { type } : {}), + ...(startTime !== undefined ? { startTime } : {}), }; } @@ -128,6 +140,7 @@ export function createSpanFactory(config?: ReportingConfig, diagLogger?: Logger) parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId }, ...(ref.name !== undefined ? { name: ref.name } : {}), ...(ref.type !== undefined ? { type: ref.type } : {}), + ...(ref.startTime !== undefined ? { startTime: ref.startTime } : {}), }), flush: async () => { try { diff --git a/plugins/trace-codex/src/test-helpers.ts b/plugins/trace-codex/src/test-helpers.ts index 82fea54..2f99eeb 100644 --- a/plugins/trace-codex/src/test-helpers.ts +++ b/plugins/trace-codex/src/test-helpers.ts @@ -135,6 +135,7 @@ export function withCapturedTrace(): CapturedTrace { parentSpanIds: { parentSpanIds: ref.spanParents, rootSpanId: ref.rootSpanId }, ...(ref.name !== undefined ? { name: ref.name } : {}), ...(ref.type !== undefined ? { type: ref.type } : {}), + ...(ref.startTime !== undefined ? { startTime: ref.startTime } : {}), }), flush: () => logger.flush(), }, From aa3de8e876cb5aa6f9243bf5250e1c8b620df788 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Tue, 23 Jun 2026 15:31:16 -0600 Subject: [PATCH 16/17] codex proxy util --- plugins/trace-codex/Makefile | 12 +- plugins/trace-codex/scripts/token-proxy.ts | 372 +++++++++++++++++++++ 2 files changed, 383 insertions(+), 1 deletion(-) create mode 100644 plugins/trace-codex/scripts/token-proxy.ts diff --git a/plugins/trace-codex/Makefile b/plugins/trace-codex/Makefile index 862ac4c..ead81c7 100644 --- a/plugins/trace-codex/Makefile +++ b/plugins/trace-codex/Makefile @@ -13,6 +13,10 @@ # installed plugin, traced to a local mock collector, and # assert >=1 trace row. Requires codex, bun, and # OPENAI_API_KEY; optionally SMOKE_VERSION to pin a release. +# make token-proxy Run the token-counting proxy (debugging aid) that sits +# between Codex and OpenAI and prints each call's token usage +# for comparison against the trace. Requires OPENAI_API_KEY. +# See scripts/token-proxy.ts for the config.toml snippet. # make clean Remove build artifacts. SHELL := /bin/bash @@ -28,7 +32,7 @@ EXPECTED_VERSION := $(shell sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^" BIN := bin/codex-hook BASE_URL := http://127.0.0.1:$(TEST_PORT) -.PHONY: test lint typecheck unit build integration smoke clean +.PHONY: test lint typecheck unit build integration smoke token-proxy clean test: lint typecheck unit integration @@ -93,6 +97,12 @@ smoke: @echo "==> Smoke test" SMOKE_RELEASE_VERSION="$(SMOKE_VERSION)" sh scripts/smoke-test.sh +# Token-counting proxy for verifying trace-codex's token accounting. Run it, then +# point Codex at it via a custom model_provider (see scripts/token-proxy.ts). +token-proxy: + @echo "==> Token proxy (Ctrl-C for the session total)" + bun run scripts/token-proxy.ts + clean: @echo "==> Cleaning" rm -rf bin diff --git a/plugins/trace-codex/scripts/token-proxy.ts b/plugins/trace-codex/scripts/token-proxy.ts new file mode 100644 index 0000000..3344402 --- /dev/null +++ b/plugins/trace-codex/scripts/token-proxy.ts @@ -0,0 +1,372 @@ +// Token-counting proxy for verifying trace-codex's token accounting. +// +// Codex has no per-session token readout we can compare against, so this is a +// standalone debugging tool: a local HTTP proxy that sits between Codex and the +// real OpenAI API. It forwards every request through unchanged and tees the +// streaming response, parsing each model call's `usage` straight off the wire. +// You then compare these ground-truth numbers against what trace-codex records +// in Braintrust. +// +// It is NOT part of either plugin's runtime — purely a manual verification aid. +// +// ── Usage ────────────────────────────────────────────────────────────────── +// 1. Run the proxy: +// bun run scripts/token-proxy.ts +// It listens on http://127.0.0.1:53800 and, by default, forwards to the +// ChatGPT backend (https://chatgpt.com) — for ChatGPT-login Codex, the +// common case. It logs every request it receives. +// +// 2. Point Codex at it via a custom provider in ~/.codex/config.toml. These +// keys are USER-LEVEL only and MUST go ABOVE any [table] headers (in TOML, +// keys after a `[table]` header belong to that table), e.g. at the very top +// of the file: +// +// For ChatGPT login (auth_mode=chatgpt — check ~/.codex/auth.json): +// model_provider = "token-proxy" +// [model_providers.token-proxy] +// name = "token-proxy" +// # base_url must contain "/backend-api" so Codex uses ChatGPT path style; +// # the proxy forwards this exact path to https://chatgpt.com. +// base_url = "http://127.0.0.1:53800/backend-api/codex" +// requires_openai_auth = true # relay the ChatGPT token + account headers +// wire_api = "responses" +// +// For API-key login instead: run the proxy with +// TOKEN_PROXY_UPSTREAM=https://api.openai.com bun run scripts/token-proxy.ts +// and use: +// model_provider = "token-proxy" +// [model_providers.token-proxy] +// name = "token-proxy" +// base_url = "http://127.0.0.1:53800/v1" +// env_key = "OPENAI_API_KEY" +// wire_api = "responses" +// +// 3. Run a Codex session as usual. Each model call prints a usage line, and a +// running session total is kept. A machine-readable NDJSON log is written to +// TOKEN_PROXY_OUT (default: token-proxy.ndjson) — one line per call plus a +// final totals line — so you can diff against the trace. +// +// ── How usage is read ──────────────────────────────────────────────────────── +// Codex uses the Responses API with SSE streaming. The terminal +// `response.completed` event carries `response.usage`: +// { input_tokens, output_tokens, total_tokens, +// input_tokens_details: { cached_tokens }, +// output_tokens_details: { reasoning_tokens } } +// We pass the stream through byte-for-byte to Codex while parsing a copy, so the +// proxy is transparent. Non-streaming JSON responses are handled too (we read +// `usage` from the parsed body) for completeness. + +import { appendFileSync, writeFileSync } from "node:fs"; + +const PORT = Number(process.env.TOKEN_PROXY_PORT || "53800"); +// Upstream ORIGIN the proxy forwards to (scheme://host, no path). The proxy is a +// transparent tunnel: it forwards the exact request path Codex builds onto this +// origin. Default is the ChatGPT host Codex uses when logged in via ChatGPT +// (auth_mode=chatgpt) — the common case. For an API-key login, set +// TOKEN_PROXY_UPSTREAM=https://api.openai.com (and use the api-key provider). +const UPSTREAM = ( + process.env.TOKEN_PROXY_UPSTREAM || + process.env.OPENAI_BASE_URL || + "https://chatgpt.com" +).replace(/\/+$/, ""); +const OUT = process.env.TOKEN_PROXY_OUT || "/tmp/token-proxy.ndjson"; +// When set, dump the raw upstream SSE event blocks to TOKEN_PROXY_DEBUG_OUT so we +// can inspect the exact event shape (the ChatGPT backend may differ from the +// public Responses API). +const DEBUG = process.env.TOKEN_PROXY_DEBUG === "1"; +const DEBUG_OUT = process.env.TOKEN_PROXY_DEBUG_OUT || "token-proxy-raw.log"; + +/** Per-call usage, normalized to the keys we care about. */ +interface Usage { + input_tokens: number; + cached_input_tokens: number; + output_tokens: number; + reasoning_output_tokens: number; + total_tokens: number; +} + +const ZERO_USAGE: Usage = { + input_tokens: 0, + cached_input_tokens: 0, + output_tokens: 0, + reasoning_output_tokens: 0, + total_tokens: 0, +}; + +// Running totals across the proxy's lifetime (one session, typically). +let callCount = 0; +const sessionTotal: Usage = { ...ZERO_USAGE }; + +function log(message: string): void { + process.stderr.write(`token-proxy: ${message}\n`); +} + +// Truncate the NDJSON log on startup so each run is a fresh capture. +try { + writeFileSync(OUT, ""); +} catch (err) { + log(`could not open ${OUT}: ${String(err)} (file logging disabled)`); +} + +if (DEBUG) { + try { + writeFileSync(DEBUG_OUT, ""); + log(`DEBUG on: dumping raw upstream SSE blocks to ${DEBUG_OUT}`); + } catch { + // ignore + } +} + +function record(line: unknown): void { + try { + appendFileSync(OUT, `${JSON.stringify(line)}\n`); + } catch { + // best-effort; the stderr line is the primary output + } +} + +function num(v: unknown): number { + return typeof v === "number" && Number.isFinite(v) ? v : 0; +} + +/** + * Pull usage out of a Responses API `response` object (the shape under both the + * `response.completed` SSE event and a non-streaming JSON body). Returns null if + * there is no usage block. + */ +function usageFromResponse(response: unknown): Usage | null { + if (response === null || typeof response !== "object") return null; + const usage = (response as { usage?: unknown }).usage; + if (usage === null || typeof usage !== "object") return null; + const u = usage as Record; + const inputDetails = (u.input_tokens_details ?? {}) as Record; + const outputDetails = (u.output_tokens_details ?? {}) as Record; + return { + input_tokens: num(u.input_tokens), + cached_input_tokens: num(inputDetails.cached_tokens), + output_tokens: num(u.output_tokens), + reasoning_output_tokens: num(outputDetails.reasoning_tokens), + total_tokens: num(u.total_tokens), + }; +} + +/** Record a call's usage: bump totals, print a line, append to the NDJSON log. */ +function reportUsage(usage: Usage, model: string | undefined): void { + callCount += 1; + sessionTotal.input_tokens += usage.input_tokens; + sessionTotal.cached_input_tokens += usage.cached_input_tokens; + sessionTotal.output_tokens += usage.output_tokens; + sessionTotal.reasoning_output_tokens += usage.reasoning_output_tokens; + sessionTotal.total_tokens += usage.total_tokens; + + log( + `call #${callCount}${model ? ` (${model})` : ""}: ` + + `input=${usage.input_tokens} (cached=${usage.cached_input_tokens}) ` + + `output=${usage.output_tokens} (reasoning=${usage.reasoning_output_tokens}) ` + + `total=${usage.total_tokens} || session total=${sessionTotal.total_tokens}`, + ); + record({ kind: "call", call: callCount, model, usage, sessionTotal: { ...sessionTotal } }); +} + +/** + * A TransformStream that passes bytes through untouched while scanning the SSE + * text for the terminal usage. Codex receives the exact upstream bytes; we just + * watch a copy. We look for `response.completed` (and tolerate `response.usage` + * appearing on other terminal events) and parse the embedded JSON object's + * usage. The model id is pulled from the same event when present. + */ +function makeUsageSniffer(): TransformStream { + const decoder = new TextDecoder(); + let buffer = ""; + let reported = false; + + const tryParseEvent = (block: string): void => { + if (block.trim().length === 0) return; + if (DEBUG) { + try { + appendFileSync(DEBUG_OUT, `${block}\n---\n`); + } catch { + // ignore + } + } + // An SSE event block is lines like "event: ...\ndata: {json}". Concatenate + // all data: lines (the Responses API sends single-line JSON, but be safe). + const dataLines: string[] = []; + for (const line of block.split("\n")) { + const trimmed = line.replace(/\r$/, ""); + if (trimmed.startsWith("data:")) dataLines.push(trimmed.slice(5).trimStart()); + } + if (dataLines.length === 0) return; + const data = dataLines.join(""); + if (data === "[DONE]" || data.length === 0) return; + let parsed: unknown; + try { + parsed = JSON.parse(data); + } catch { + return; // not JSON we can read; ignore + } + if (parsed === null || typeof parsed !== "object") return; + const obj = parsed as Record; + // Find a usage block wherever it lives: nested under `response` (the + // completed/incomplete events) or at the top level. Report each one we see — + // a single model call's stream carries usage once, so the last one wins if + // there are several. (Don't short-circuit on the first: some backends emit a + // partial usage earlier in the stream.) + const response = (obj.response ?? obj) as Record; + const usage = usageFromResponse(response) ?? usageFromResponse(obj); + if (usage === null) return; + const model = + typeof response.model === "string" + ? response.model + : typeof obj.model === "string" + ? obj.model + : undefined; + reportUsage(usage, model); + reported = true; + }; + + // Whole-body accumulator for the non-SSE JSON fallback, capped so a huge + // response can't balloon memory (usage lives in the small completed event / + // top-level object, so a few MB is plenty). + let fullBody = ""; + const FULL_BODY_CAP = 4_000_000; + + // Last-resort: try to parse the entire body as one JSON object and pull usage + // from it (covers non-streaming JSON responses). + const tryParseWholeBody = (): void => { + if (reported || fullBody.length === 0) return; + let parsed: unknown; + try { + parsed = JSON.parse(fullBody); + } catch { + return; + } + if (parsed === null || typeof parsed !== "object") return; + const obj = parsed as Record; + const response = (obj.response ?? obj) as Record; + const usage = usageFromResponse(response) ?? usageFromResponse(obj); + if (usage === null) return; + const model = + typeof response.model === "string" + ? response.model + : typeof obj.model === "string" + ? obj.model + : undefined; + reportUsage(usage, model); + reported = true; + }; + + return new TransformStream({ + transform(chunk, controller) { + controller.enqueue(chunk); // pass through immediately, unmodified + const text = decoder.decode(chunk, { stream: true }); + buffer += text; + if (fullBody.length < FULL_BODY_CAP) fullBody += text; + // SSE events are separated by a blank line. Process complete blocks. + let sep = buffer.indexOf("\n\n"); + while (sep !== -1) { + const block = buffer.slice(0, sep); + buffer = buffer.slice(sep + 2); + tryParseEvent(block); + sep = buffer.indexOf("\n\n"); + } + }, + flush() { + if (buffer.length > 0) tryParseEvent(buffer); + // SSE parsing found nothing — try the whole body as a single JSON object. + tryParseWholeBody(); + if (!reported) { + log(`call completed with no usage found in response (path may be non-model)`); + } + }, + }); +} + +/** Build the upstream URL: UPSTREAM + the part of the path after our `/v1`. */ +function upstreamUrl(reqUrl: string): string { + // Transparent tunnel: forward the exact path + query Codex built onto the + // upstream origin. (The provider's base_url is configured so Codex builds the + // correct downstream path; the proxy never rewrites it.) + const { pathname, search } = new URL(reqUrl); + return `${UPSTREAM}${pathname}${search}`; +} + +const server = Bun.serve({ + port: PORT, + hostname: "127.0.0.1", + // Generous: model calls can stream for minutes. + idleTimeout: 0, + async fetch(req) { + const target = upstreamUrl(req.url); + const path = new URL(req.url).pathname; + // Log every request so it's obvious when Codex is (or isn't) hitting us. + log(`${req.method} ${path} -> ${target}`); + + // Forward the request verbatim (method, headers, body). Codex already set + // the Authorization header (ChatGPT token or API key) and the account/session + // headers, so we just relay them. Drop `host` so fetch derives the correct + // upstream host (otherwise it carries our 127.0.0.1:PORT and breaks routing). + const headers = new Headers(req.headers); + headers.delete("host"); + let upstream: Response; + try { + upstream = await fetch(target, { + method: req.method, + headers, + body: req.body, + // `duplex: "half"` is required to stream a request body; not in the DOM + // RequestInit type, so widen via a cast. + ...({ duplex: "half" } as Record), + redirect: "manual", + }); + } catch (err) { + log(`upstream fetch failed for ${req.method} ${path}: ${String(err)}`); + return new Response(JSON.stringify({ error: "proxy_upstream_failed" }), { + status: 502, + headers: { "content-type": "application/json" }, + }); + } + + const contentType = upstream.headers.get("content-type") ?? ""; + log(` <- ${upstream.status} ${contentType || "(no content-type)"}`); + + // Always tee the body through the usage sniffer. We do NOT gate on + // content-type: the ChatGPT backend streams SSE with no (or an unexpected) + // content-type header, so matching "text/event-stream" misses it. The sniffer + // passes bytes through unchanged and looks for usage in either SSE `data:` + // events or a whole-body JSON object, so it's safe for any response. + if (upstream.body) { + const teed = upstream.body.pipeThrough(makeUsageSniffer()); + return new Response(teed, { + status: upstream.status, + statusText: upstream.statusText, + headers: upstream.headers, + }); + } + + // No body (e.g. a HEAD or 204): pass straight through. + return new Response(upstream.body, { + status: upstream.status, + statusText: upstream.statusText, + headers: upstream.headers, + }); + }, +}); + +log(`listening on http://127.0.0.1:${server.port} -> ${UPSTREAM}`); +log(`writing per-call usage to ${OUT}`); + +function summarize(): void { + log( + `SESSION TOTAL over ${callCount} call(s): ` + + `input=${sessionTotal.input_tokens} (cached=${sessionTotal.cached_input_tokens}) ` + + `output=${sessionTotal.output_tokens} (reasoning=${sessionTotal.reasoning_output_tokens}) ` + + `total=${sessionTotal.total_tokens}`, + ); + record({ kind: "session_total", calls: callCount, sessionTotal: { ...sessionTotal } }); + server.stop(true); + process.exit(0); +} + +process.on("SIGTERM", summarize); +process.on("SIGINT", summarize); From 0b08f647e75afa3444f687e5ee47701d922efb49 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Tue, 23 Jun 2026 16:04:09 -0600 Subject: [PATCH 17/17] update docs --- plugins/trace-codex/AGENTS.md | 2 +- plugins/trace-codex/README.md | 60 ++++++++++++++----- plugins/trace-codex/config.json.example | 2 +- plugins/trace-codex/scripts/smoke-test.sh | 4 +- .../trace-codex/src/agents/codex/register.ts | 2 +- .../src/agents/codex/settings.test.ts | 20 +++---- .../trace-codex/src/agents/codex/settings.ts | 14 ++--- plugins/trace-codex/src/client/client.ts | 8 +-- plugins/trace-codex/src/index.ts | 4 +- 9 files changed, 73 insertions(+), 43 deletions(-) diff --git a/plugins/trace-codex/AGENTS.md b/plugins/trace-codex/AGENTS.md index 2b698c1..1c0edc1 100644 --- a/plugins/trace-codex/AGENTS.md +++ b/plugins/trace-codex/AGENTS.md @@ -99,7 +99,7 @@ The terminal event is the one place where blocking is even an option, and it's **opt-in**: - On a **terminal event** (`Stop`), the client calls `/flush` **only** when - `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` is set, blocking until the server confirms the + `BRAINTRUST_FLUSH_ON_TURN_END` is set, blocking until the server confirms the queue has drained and buffered spans reached Braintrust. The blocking mode exists for short-lived hosts (e.g. a CI job that ends right after the last turn) that would otherwise tear the process tree down before the final spans are diff --git a/plugins/trace-codex/README.md b/plugins/trace-codex/README.md index 2049697..b2d059c 100644 --- a/plugins/trace-codex/README.md +++ b/plugins/trace-codex/README.md @@ -12,6 +12,50 @@ codex plugin add trace-codex@braintrust-codex-plugins TRACE_TO_BRAINTRUST=true BRAINTRUST_PROJECT=my-coding-agent codex ``` +## Using the plugin in CI + +Tracing `codex exec` runs in CI works. A few notes: + +- **Block on flush.** Set `BRAINTRUST_FLUSH_ON_TURN_END=true` so the final spans are delivered before the job exits (a short-lived CI job can otherwise tear down before the background server's idle-drain flush fires). +- **Trust the hooks non-interactively.** Pass `codex exec --dangerously-bypass-hook-trust` to skip the one-time interactive hook-trust prompt. (Only for plugins you trust.) + +GitHub Actions example: + +```yaml +name: codex-traced +on: [workflow_dispatch] + +jobs: + run: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + + - name: Install Codex CLI + run: npm install -g @openai/codex # or your preferred install method + + - name: Install the trace-codex plugin (pinned to a release tag) + # TODO: replace with desired version ---------------------------------vvvvvvvvvvvvvvvvvv + run: | + codex plugin marketplace add braintrustdata/braintrust-codex-plugin@trace-codex-v0.0.X + codex plugin add trace-codex@braintrust-codex-plugins + + - name: Run a traced Codex session + env: + TRACE_TO_BRAINTRUST: "true" + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + BRAINTRUST_PROJECT: my-coding-agent # <--- TODO: replace with your project name + BRAINTRUST_FLUSH_ON_TURN_END: "true" + run: | + codex exec \ + --skip-git-repo-check \ + --dangerously-bypass-hook-trust \ + --sandbox read-only \ + "summarize the changes in this repo" +``` + +If your plugin release repo is private, also expose a token the launcher can use to download the binary (`GH_TOKEN` or `GITHUB_TOKEN`, or an authenticated `gh`). + ## Configuration There are two ways to configure the plugin. **Environment variables always win over the config file** @@ -30,7 +74,7 @@ Every setting can be provided as a `config.json` key or as an environment variab | `project` | `BRAINTRUST_PROJECT` | _(unset)_ | Project to log traces into. | | `apiUrl` | `BRAINTRUST_API_URL` | api.braintrust.dev | Braintrust API URL. | | `additionalMetadata` | `BRAINTRUST_ADDITIONAL_METADATA` | _(unset)_ | JSON object of extra metadata merged into the root span. Standard keys (`session_id`, `model`, `project`, etc.) take precedence on conflict. | -| `blockOnStop` | `BRAINTRUST_PLUGIN_BLOCK_ON_STOP` | `false` | When `true`, the hook blocks on each turn's `Stop` until the server confirms all spans are flushed. Use in programmatic/CI runs to guarantee traces are delivered before Codex exits. | +| `flushOnTurnEnd` | `BRAINTRUST_FLUSH_ON_TURN_END` | `false` | When `true`, the hook blocks at each turn's end (the `Stop` event) until the server confirms all spans are flushed. Use in programmatic/CI runs to guarantee traces are delivered before Codex exits. | | `recordFile` | `BRAINTRUST_EVENT_SERVER_RECORD_FILE` | _(unset)_ | If set, record every event to this NDJSON file (for `replay`). | ### Resuming sessions @@ -59,17 +103,3 @@ Advanced plugin settings for debugging or developing the plugin | _(none)_ | `BRAINTRUST_EVENT_SERVER_LOG_DIR` | `$PLUGIN_DATA` | Directory for logs, pidfile, and `config.json`. | The config file is read by the hook client only at the moment it boots the background server (the running server keeps the config it started with). To pick up config changes, stop the server (or wait for it to idle out) so the next event re-boots it. - -## Smoke test - -`make smoke` runs a full end-to-end check: it starts a local mock Braintrust collector, runs a real `codex exec "say hi"` session with tracing pointed at the mock, and asserts at least one trace row was reported. This exercises the whole path (Codex hook -> background server -> Braintrust SDK flush) without touching a real Braintrust account. - -Requirements: `codex` and `bun` on `PATH`, `OPENAI_API_KEY` set, and the plugin installed in Codex (run `../../install.sh trace-codex` for a local dev install). - -```bash -OPENAI_API_KEY=sk-... make smoke -# Pin the release whose codex-hook binary the launcher downloads: -OPENAI_API_KEY=sk-... make smoke SMOKE_VERSION=0.0.1 -``` - -CI runs the same smoke test (macOS arm64/x64 and Linux) after every release via `.github/workflows/smoke.yaml`. That workflow can also be dispatched manually from the Actions tab against any published release version. diff --git a/plugins/trace-codex/config.json.example b/plugins/trace-codex/config.json.example index c998275..b9b0d93 100644 --- a/plugins/trace-codex/config.json.example +++ b/plugins/trace-codex/config.json.example @@ -6,7 +6,7 @@ "appUrl": "https://www.braintrust.dev", "project": "my-codex-project", "additionalMetadata": { "team": "platform", "env": "dev" }, - "blockOnStop": false, + "flushOnTurnEnd": false, "recordFile": "", "port": 52734, "idleTimeoutMs": 300000, diff --git a/plugins/trace-codex/scripts/smoke-test.sh b/plugins/trace-codex/scripts/smoke-test.sh index f148621..d91d646 100755 --- a/plugins/trace-codex/scripts/smoke-test.sh +++ b/plugins/trace-codex/scripts/smoke-test.sh @@ -124,7 +124,7 @@ export BRAINTRUST_EVENT_SERVER_IDLE_CHECK_INTERVAL_MS=1000 # `codex exec` returns. Without this the hook just enqueues and relies on the # server's idle-drain flush, which may not fire before this short-lived run tears # the process tree down — making the assertion below flaky. -export BRAINTRUST_PLUGIN_BLOCK_ON_STOP=true +export BRAINTRUST_FLUSH_ON_TURN_END=true # Pin the release the launcher downloads, when requested. The launcher strips an # optional leading 'v', so either form is fine here. @@ -153,7 +153,7 @@ if [ "$codex_status" -ne 0 ]; then scrub <"$CODEX_LOG" >&2 || true fi -# With BRAINTRUST_PLUGIN_BLOCK_ON_STOP=true (set above), the hook client blocks +# With BRAINTRUST_FLUSH_ON_TURN_END=true (set above), the hook client blocks # on a synchronous flush at session end, so by the time codex exec returns the # trace should already be delivered. Stop the collector (via cleanup) to make it # write its summary, then assert. diff --git a/plugins/trace-codex/src/agents/codex/register.ts b/plugins/trace-codex/src/agents/codex/register.ts index 980ed15..f4b194d 100644 --- a/plugins/trace-codex/src/agents/codex/register.ts +++ b/plugins/trace-codex/src/agents/codex/register.ts @@ -20,7 +20,7 @@ export interface Agent { /** * Event names that terminate a turn/session. After enqueuing one of these, * the hook asks the server to flush. By default the flush is fire-and-forget - * so the turn isn't stalled; setting BRAINTRUST_PLUGIN_BLOCK_ON_STOP makes the + * so the turn isn't stalled; setting BRAINTRUST_FLUSH_ON_TURN_END makes the * hook block until the server confirms the final spans are delivered before * the process tree is torn down (e.g. in CI). Codex only exposes a per-turn * "Stop" hook today; there is no session-end event. diff --git a/plugins/trace-codex/src/agents/codex/settings.test.ts b/plugins/trace-codex/src/agents/codex/settings.test.ts index afa5bfe..068816c 100644 --- a/plugins/trace-codex/src/agents/codex/settings.test.ts +++ b/plugins/trace-codex/src/agents/codex/settings.test.ts @@ -70,14 +70,14 @@ describe("loadSettingsFile", () => { expect(loadSettingsFile(path)).toEqual({}); }); - test("parses blockOnStop boolean; ignores non-boolean", () => { - expect(loadSettingsFile(write(JSON.stringify({ blockOnStop: true })))).toEqual({ - blockOnStop: true, + test("parses flushOnTurnEnd boolean; ignores non-boolean", () => { + expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: true })))).toEqual({ + flushOnTurnEnd: true, }); - expect(loadSettingsFile(write(JSON.stringify({ blockOnStop: false })))).toEqual({ - blockOnStop: false, + expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: false })))).toEqual({ + flushOnTurnEnd: false, }); - expect(loadSettingsFile(write(JSON.stringify({ blockOnStop: "yes" })))).toEqual({}); + expect(loadSettingsFile(write(JSON.stringify({ flushOnTurnEnd: "yes" })))).toEqual({}); }); test("missing file returns {}", () => { @@ -136,11 +136,11 @@ describe("applySettingsToEnv", () => { expect(env.BRAINTRUST_ADDITIONAL_METADATA).toBe('{"team":"platform"}'); }); - test("maps blockOnStop to BRAINTRUST_PLUGIN_BLOCK_ON_STOP", () => { + test("maps flushOnTurnEnd to BRAINTRUST_FLUSH_ON_TURN_END", () => { const env: NodeJS.ProcessEnv = {}; - const applied = applySettingsToEnv({ blockOnStop: true }, env); - expect(env.BRAINTRUST_PLUGIN_BLOCK_ON_STOP).toBe("true"); - expect(applied).toEqual(["blockOnStop"]); + const applied = applySettingsToEnv({ flushOnTurnEnd: true }, env); + expect(env.BRAINTRUST_FLUSH_ON_TURN_END).toBe("true"); + expect(applied).toEqual(["flushOnTurnEnd"]); }); }); diff --git a/plugins/trace-codex/src/agents/codex/settings.ts b/plugins/trace-codex/src/agents/codex/settings.ts index cb64a32..3373539 100644 --- a/plugins/trace-codex/src/agents/codex/settings.ts +++ b/plugins/trace-codex/src/agents/codex/settings.ts @@ -26,12 +26,12 @@ export interface Settings { /** Master switch: when false or unset, no traces are reported to Braintrust. */ traceToBraintrust?: boolean; /** - * When true, the hook blocks on the Stop event until the server confirms all - * events are processed and spans are flushed. Useful in programmatic/CI runs - * to guarantee traces are delivered before Codex exits. Defaults to false - * (fire-and-forget flush so the turn isn't stalled). + * When true, the hook blocks at each turn's end (the Stop event) until the + * server confirms all events are processed and spans are flushed. Useful in + * programmatic/CI runs to guarantee traces are delivered before Codex exits. + * Defaults to false (fire-and-forget flush so the turn isn't stalled). */ - blockOnStop?: boolean; + flushOnTurnEnd?: boolean; /** Extra metadata merged into the root span (standard keys win on conflict). */ additionalMetadata?: Record; /** If set, record every event to this NDJSON file (for replay). */ @@ -51,7 +51,7 @@ export const SETTINGS_TO_ENV: Record = { appUrl: "BRAINTRUST_APP_URL", project: "BRAINTRUST_PROJECT", traceToBraintrust: "TRACE_TO_BRAINTRUST", - blockOnStop: "BRAINTRUST_PLUGIN_BLOCK_ON_STOP", + flushOnTurnEnd: "BRAINTRUST_FLUSH_ON_TURN_END", additionalMetadata: "BRAINTRUST_ADDITIONAL_METADATA", recordFile: "BRAINTRUST_EVENT_SERVER_RECORD_FILE", port: "BRAINTRUST_EVENT_SERVER_PORT", @@ -63,7 +63,7 @@ const SETTING_KEYS = Object.keys(SETTINGS_TO_ENV) as Array; const NUMBER_KEYS = new Set(["port", "idleTimeoutMs", "idleCheckIntervalMs"]); -const BOOLEAN_KEYS = new Set(["traceToBraintrust", "blockOnStop"]); +const BOOLEAN_KEYS = new Set(["traceToBraintrust", "flushOnTurnEnd"]); /** * The plugin's writable data directory, where config.json lives. Resolved diff --git a/plugins/trace-codex/src/client/client.ts b/plugins/trace-codex/src/client/client.ts index ab90ff7..264306b 100644 --- a/plugins/trace-codex/src/client/client.ts +++ b/plugins/trace-codex/src/client/client.ts @@ -42,11 +42,11 @@ export interface HookClientOptions { * Whether to block on a terminal event until the server confirms all events * are processed and spans are delivered. Defaults to false. When false the * client does nothing on a terminal event — the long-lived server flushes on - * its own when the queue goes idle. Set true (via BRAINTRUST_PLUGIN_BLOCK_ON_STOP) + * its own when the queue goes idle. Set true (via BRAINTRUST_FLUSH_ON_TURN_END) * to guarantee delivery before Codex exits, e.g. in programmatic/CI runs where * the process tree is torn down before the idle flush can fire. */ - blockOnStop?: boolean; + flushOnTurnEnd?: boolean; } export async function runHookClient( @@ -87,14 +87,14 @@ export async function runHookClient( } } - // On a terminal event, only flush when blockOnStop is set: block until the + // On a terminal event, only flush when flushOnTurnEnd is set: block until the // server confirms the queue has drained and buffered spans are delivered — // important for a CI job (or any short-lived host) that ends right after the // agent's last turn, before the background server's idle timeout fires, which // would otherwise lose the final spans. Otherwise do nothing: the long-lived // server flushes on its own when the queue goes idle, so the turn isn't // stalled by a flush request. - if (sawTerminal && (options.blockOnStop ?? false)) { + if (sawTerminal && (options.flushOnTurnEnd ?? false)) { const flushed = await postFlush(config, logger); logger.debug("flush requested on terminal event", { flushed }); } diff --git a/plugins/trace-codex/src/index.ts b/plugins/trace-codex/src/index.ts index d6f5ce1..81853fe 100644 --- a/plugins/trace-codex/src/index.ts +++ b/plugins/trace-codex/src/index.ts @@ -71,8 +71,8 @@ async function main(): Promise { try { await runHookClient(config, logger, agent.buildEvents, { terminalEvents: agent.terminalEvents, - // Settings (above) have mapped BRAINTRUST_PLUGIN_BLOCK_ON_STOP into env. - blockOnStop: parseBoolEnv(process.env.BRAINTRUST_PLUGIN_BLOCK_ON_STOP), + // Settings (above) have mapped BRAINTRUST_FLUSH_ON_TURN_END into env. + flushOnTurnEnd: parseBoolEnv(process.env.BRAINTRUST_FLUSH_ON_TURN_END), }); } catch (err) { logger.error("unexpected hook client error", { error: String(err) });