diff --git a/kubernetes/scripts/create_x509_user_config.py b/kubernetes/scripts/create_x509_user_config.py
index 38d7cbd4..aa8a516f 100755
--- a/kubernetes/scripts/create_x509_user_config.py
+++ b/kubernetes/scripts/create_x509_user_config.py
@@ -121,4 +121,4 @@ def build_kubectl_config(tmpdir: str) -> None:
         Path(tmpdir, f"{user}.config"),  # from
         Path(f"{user}.config"),  # to
     )
-    print(f"Config generated. Saved to {user}.config in current directory.")  # noqa: T201
+    print(f"Config generated. Saved to {user}.config in current directory.")
diff --git a/kubernetes/scripts/lint_manifests.py b/kubernetes/scripts/lint_manifests.py
index ba237dfd..d45dd524 100644
--- a/kubernetes/scripts/lint_manifests.py
+++ b/kubernetes/scripts/lint_manifests.py
@@ -32,4 +32,4 @@ def get_all_manifests() -> list[str]:
         arg = " -f ".join([""] + get_all_manifests())
         os.system("kubectl diff" + arg)  # noqa: S605
     elif sys.argv[1] == "find":
-        print("\n".join(get_all_manifests()))  # noqa: T201
+        print("\n".join(get_all_manifests()))
diff --git a/kubernetes/scripts/memray_profile/Dockerfile b/kubernetes/scripts/memray_profile/Dockerfile
new file mode 100644
index 00000000..89501cea
--- /dev/null
+++ b/kubernetes/scripts/memray_profile/Dockerfile
@@ -0,0 +1,13 @@
+# Probe image for on-demand memory profiling of k8s Python services.
+# Multi-platform build required (dev machines are arm64, cluster is amd64):
+#
+#   docker buildx build --platform linux/amd64,linux/arm64 --push \
+#     -t ghcr.io/python-discord/memray-probe:latest \
+#     kubernetes/scripts/memray_profile/
+FROM python:3.14-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends util-linux \
+    && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir memray==1.19.3
+
+ENTRYPOINT ["/bin/sh"]
diff --git a/kubernetes/scripts/memray_profile/__init__.py b/kubernetes/scripts/memray_profile/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kubernetes/scripts/memray_profile/__main__.py b/kubernetes/scripts/memray_profile/__main__.py
new file mode 100644
index 00000000..a170b764
--- /dev/null
+++ b/kubernetes/scripts/memray_profile/__main__.py
@@ -0,0 +1,101 @@
+"""
+Profile memory usage of a running Kubernetes Python service with memray.
+
+Injects an ephemeral debug container, uses sys.remote_exec to start a
+memray.Tracker in the target process, waits for the trace, and copies
+a flamegraph report back locally.
+
+    python -m memray_profile deploy/king-arthur -n bots
+    python -m memray_profile deploy/site -n web --duration 60
+"""
+
+import argparse
+from datetime import UTC, datetime
+from pathlib import Path
+
+from ._constants import PROBE_REPORT, PROBE_TRACE, TARGET_TRACE
+from ._kubectl import die, kubectl
+from ._pod import find_python_pid, get_containers, resolve_pod
+from ._probe import inject_probe, inject_tracker, wait_for_probe
+
+
+def main() -> None:
+    """Parse CLI arguments, profile the target process, and copy the result locally."""
+    p = argparse.ArgumentParser(description="Profile memory of a k8s Python service with memray.")
+    p.add_argument("target", help="Pod name or workload ref (deploy/x, sts/x)")
+    p.add_argument("-n", "--namespace", default="default")
+    p.add_argument("-c", "--container", help="Target container (default: first)")
+    p.add_argument("-p", "--pid", type=int, help="Skip PID auto-detection")
+    p.add_argument("-d", "--duration", type=int, default=30, metavar="SEC")
+    p.add_argument("--report-type", choices=["flamegraph", "tree"], default="flamegraph")
+    p.add_argument("--trace-path", default=TARGET_TRACE)
+    p.add_argument("--raw", action="store_true", help="Copy raw .bin instead of rendered report")
+    p.add_argument("--output-dir", type=Path, default=Path.cwd())
+    args = p.parse_args()
+    # The trace path is appended verbatim to /proc/<pid>/root, so it has to be absolute.
+    if not args.trace_path.startswith("/"):
+        p.error("--trace-path must be an absolute path inside the target container")
+
+    pod = resolve_pod(args.target, args.namespace)
+    containers = get_containers(pod, args.namespace)
+    container = args.container or containers[0]
+    if container not in containers:
+        die(f"Container {container!r} not in {pod}. Have: {', '.join(containers)}")
+    print(f"Target: {pod} / {container}")
+
+    ts = datetime.now(UTC).strftime("%Y%m%d%H%M%S")
+    probe = f"memray-{ts}"
+
+    inject_probe(pod, args.namespace, container, probe)
+    wait_for_probe(pod, args.namespace, probe)
+
+    if args.pid:
+        pid = args.pid
+    else:
+        pid = find_python_pid(pod, args.namespace, probe)
+    print(f"Python PID: {pid}")
+
+    inject_tracker(pod, args.namespace, probe, pid, args.duration, args.trace_path)
+
+    # Grab output
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    trace_on_target = f"/proc/{pid}/root{args.trace_path}"
+
+    if args.raw:
+        out = args.output_dir / f"memray_{pod}_{ts}.bin"
+        src = trace_on_target
+    else:
+        # Copy trace into the probe container, render the report there
+        kubectl(
+            "exec",
+            pod,
+            "-n",
+            args.namespace,
+            "-c",
+            probe,
+            "--",
+            "sh",
+            "-c",
+            f"cp {trace_on_target} {PROBE_TRACE}",
+            capture=True,
+            check=True,
+        )
+        if args.report_type == "flamegraph":
+            report_cmd = f"memray flamegraph -o {PROBE_REPORT} {PROBE_TRACE}"
+        else:
+            report_cmd = f"memray tree {PROBE_TRACE} > {PROBE_REPORT}"
+        print(f"Generating {args.report_type}...")
+        kubectl("exec", pod, "-n", args.namespace, "-c", probe, "--", "sh", "-c", report_cmd, capture=False)
+
+        suffix = ".html" if args.report_type == "flamegraph" else ".txt"
+        out = args.output_dir / f"memray_{pod}_{ts}{suffix}"
+        src = PROBE_REPORT
+
+    print(f"Copying to {out}...")
+    # Go through the shared wrapper so a failed copy exits cleanly instead of raising.
+    kubectl("cp", "-n", args.namespace, "-c", probe, f"{pod}:{src}", str(out), capture=False)
+    print(f"\nDone: {out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/kubernetes/scripts/memray_profile/_constants.py b/kubernetes/scripts/memray_profile/_constants.py
new file mode 100644
index 00000000..3fc0fdc6
--- /dev/null
+++ b/kubernetes/scripts/memray_profile/_constants.py
@@ -0,0 +1,6 @@
+PROBE_IMAGE = "ghcr.io/python-discord/memray-probe:latest"
+MEMRAY_VERSION = "1.19.3"
+PROBE_TRACE = "/tmp/memray_trace.bin"  # noqa: S108
+PROBE_REPORT = "/tmp/memray_report.html"  # noqa: S108
+TARGET_TRACE = "/tmp/memray_trace.bin"  # noqa: S108
+READY_MARKER = "MEMRAY_PROBE_READY"
diff --git a/kubernetes/scripts/memray_profile/_kubectl.py b/kubernetes/scripts/memray_profile/_kubectl.py
new file mode 100644
index 00000000..a286d12d
--- /dev/null
+++ b/kubernetes/scripts/memray_profile/_kubectl.py
@@ -0,0 +1,30 @@
+import subprocess
+import sys
+from typing import NoReturn
+
+
+def die(msg: str) -> NoReturn:
+    """Print an error message to stderr and exit with status 1."""
+    print(f"Error: {msg}", file=sys.stderr)
+    sys.exit(1)
+
+
+def kubectl(*args: str, capture: bool = True, check: bool = True) -> subprocess.CompletedProcess:
+    """
+    Run ``kubectl`` with the given arguments and return the completed process.
+
+    With ``check`` set, a non-zero exit is reported through die() using the last
+    line of captured stderr, or the exit code when nothing was captured.
+    """
+    try:
+        return subprocess.run(  # noqa: S603
+            ["kubectl", *args],
+            capture_output=capture,
+            text=True,
+            check=check,
+        )
+    except FileNotFoundError:
+        die("kubectl not found on PATH")
+    except subprocess.CalledProcessError as exc:
+        stderr = (exc.stderr or "").strip().rsplit("\n", 1)[-1] or f"exit code {exc.returncode}"
+        die(f"kubectl {' '.join(args[:3])}... failed: {stderr}")
diff --git a/kubernetes/scripts/memray_profile/_pod.py b/kubernetes/scripts/memray_profile/_pod.py
new file mode 100644
index 00000000..7adcc7e1
--- /dev/null
+++ b/kubernetes/scripts/memray_profile/_pod.py
@@ -0,0 +1,94 @@
+import json
+
+from ._kubectl import die, kubectl
+
+_WORKLOAD_KINDS = {
+    "deploy": "deployments",
+    "deployment": "deployments",
+    "sts": "statefulsets",
+    "statefulset": "statefulsets",
+}
+
+_FIND_PIDS_SH = r"""
+for d in /proc/[0-9]*/; do
+  pid=$(basename "$d")
+  exe=$(readlink "$d/exe" 2>/dev/null) || true
+  case "${exe:-$(cut -d '' -f1 < "$d/cmdline" 2>/dev/null)}" in
+    *python*) printf '%s %s\n' "$pid" "$(tr '\0' ' ' < "$d/cmdline" 2>/dev/null)" ;;
+  esac
+done
+"""
+
+
+def resolve_pod(target: str, namespace: str) -> str:
+    """
+    Resolve a pod name or workload reference (deploy/x, sts/x) to a running pod name.
+
+    A bare name is returned unchanged; for a workload, the first Running pod
+    matching the workload's matchLabels selector is used.
+    """
+    if "/" not in target:
+        return target
+
+    kind, name = target.split("/", 1)
+    resource = _WORKLOAD_KINDS.get(kind.lower())
+    if not resource:
+        die(f"Unsupported resource kind {kind!r}. Use a pod name, deploy/, or sts/")
+
+    workload = json.loads(kubectl("get", resource, name, "-n", namespace, "-o", "json").stdout)
+    labels = workload["spec"]["selector"].get("matchLabels") or {}
+    if not labels:
+        die(f"{target} has no matchLabels selector. Use a pod name.")
+    selector = ",".join(f"{k}={v}" for k, v in labels.items())
+
+    # List every match rather than indexing items[0]: an out-of-range jsonpath
+    # index makes kubectl fail, which would hide the friendlier error below.
+    result = kubectl(
+        "get",
+        "pods",
+        "-n",
+        namespace,
+        "-l",
+        selector,
+        "--field-selector=status.phase=Running",
+        "-o",
+        "jsonpath={.items[*].metadata.name}",
+    )
+    pods = result.stdout.split()
+    if not pods:
+        die(f"No running pods for {target} in {namespace!r}")
+    return pods[0]
+
+
+def get_containers(pod: str, namespace: str) -> list[str]:
+    """Return the container names listed under the pod's spec.containers."""
+    result = kubectl("get", "pod", pod, "-n", namespace, "-o", "jsonpath={.spec.containers[*].name}")
+    return result.stdout.strip().split()
+
+
+def find_python_pid(pod: str, namespace: str, probe: str) -> int:
+    """
+    Find the PID of the Python process to profile by scanning /proc from the probe.
+
+    Exits with an error if no Python process is found, or if several remain after
+    PID 1 has been set aside.
+    """
+    result = kubectl("exec", pod, "-n", namespace, "-c", probe, "--", "sh", "-c", _FIND_PIDS_SH)
+    entries = []
+    for line in result.stdout.strip().splitlines():
+        pid_str, _, cmdline = line.partition(" ")
+        entries.append((int(pid_str), cmdline.strip()))
+
+    if not entries:
+        die(f"No Python process found in {pod}")
+
+    for pid, cmdline in entries:
+        print(f"  PID {pid}: {cmdline}")
+
+    # Prefer non-PID-1 processes (PID 1 is usually tini/dumb-init)
+    candidates = [(p, c) for p, c in entries if p != 1] or entries
+
+    if len(candidates) > 1:
+        die(f"Multiple Python PIDs found: {', '.join(str(p) for p, _ in candidates)}. Use --pid.")
+
+    return candidates[0][0]
diff --git a/kubernetes/scripts/memray_profile/_probe.py b/kubernetes/scripts/memray_profile/_probe.py
new file mode 100644
index 00000000..db154fed
--- /dev/null
+++ b/kubernetes/scripts/memray_profile/_probe.py
@@ -0,0 +1,154 @@
+import json
+import time
+
+from ._constants import MEMRAY_VERSION, PROBE_IMAGE, READY_MARKER
+from ._kubectl import die, kubectl
+
+
+def inject_probe(pod: str, namespace: str, target_container: str, probe_name: str) -> None:
+    """Attach an ephemeral probe container that targets the given container of the pod."""
+    startup = (
+        f"python3 -c 'import memray' 2>/dev/null || pip install -q memray=={MEMRAY_VERSION} && "
+        f"echo {READY_MARKER} && sleep 3600"
+    )
+    spec = json.dumps(
+        {
+            "spec": {
+                "ephemeralContainers": [
+                    {
+                        "name": probe_name,
+                        "image": PROBE_IMAGE,
+                        "command": ["/bin/sh", "-c", startup],
+                        "targetContainerName": target_container,
+                        "securityContext": {
+                            "capabilities": {"add": ["SYS_PTRACE", "SYS_ADMIN"]},
+                            "seccompProfile": {"type": "Unconfined"},
+                            "runAsUser": 0,
+                            "runAsNonRoot": False,
+                            "allowPrivilegeEscalation": True,
+                        },
+                    }
+                ]
+            }
+        }
+    )
+    kubectl(
+        "patch",
+        "pod",
+        pod,
+        "-n",
+        namespace,
+        "--subresource=ephemeralcontainers",
+        "--type=strategic",
+        "-p",
+        spec,
+        capture=False,
+        check=True,
+    )
+
+
+def wait_for_probe(pod: str, namespace: str, probe_name: str, timeout: int = 120) -> None:
+    """
+    Poll until the probe container is running and has logged the ready marker.
+
+    Exits with an error if the probe terminates first or ``timeout`` seconds elapse.
+    """
+    print("Waiting for probe...")
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        raw = kubectl("get", "pod", pod, "-n", namespace, "-o", "json")
+        statuses = json.loads(raw.stdout).get("status", {}).get("ephemeralContainerStatuses", [])
+        status = next((s for s in statuses if s["name"] == probe_name), None)
+
+        if status:
+            state = status.get("state", {})
+            if "terminated" in state:
+                die(f"Probe exited early (code {state['terminated'].get('exitCode', '?')})")
+            if "running" in state:
+                logs = kubectl("logs", pod, "-n", namespace, "-c", probe_name, check=False)
+                if READY_MARKER in logs.stdout:
+                    print("Probe ready.")
+                    return
+
+        time.sleep(3)
+
+    die(f"Probe didn't start within {timeout}s")
+
+
+def inject_tracker(pod: str, namespace: str, probe: str, pid: int, duration: int, trace_path: str) -> None:
+    """
+    Write a memray script into the target and use sys.remote_exec to run it.
+
+    Blocks for roughly ``duration`` seconds while the tracker runs, then removes
+    the injected script from the target's filesystem.
+    """
+    if pid == 1:
+        die("Can't profile PID 1 — add tini or dumb-init so Python isn't the init process.")
+
+    inject = "/tmp/_memray_inject.py"  # noqa: S108
+    script = (
+        "import memray as _m, builtins as _b, threading as _t, time as _time\n"
+        f"_b._memray_tracker = _m.Tracker({trace_path!r}, native_traces=True, trace_python_allocators=True)\n"
+        "_b._memray_tracker.__enter__()\n"
+        "def _stop():\n"
+        f"    _time.sleep({duration})\n"
+        "    if hasattr(_b, '_memray_tracker'):\n"
+        "        _b._memray_tracker.__exit__(None, None, None)\n"
+        "        del _b._memray_tracker\n"
+        "_t.Thread(target=_stop, daemon=True).start()\n"
+    )
+
+    # Place it in the target's filesystem via /proc/<pid>/root. Any trace left by a
+    # previous run is removed first, since memray won't overwrite an existing file.
+    kubectl(
+        "exec",
+        pod,
+        "-n",
+        namespace,
+        "-c",
+        probe,
+        "--",
+        "sh",
+        "-c",
+        f"rm -f /proc/{pid}/root{trace_path} && cat > /proc/{pid}/root{inject} << 'EOF'\n{script}EOF",
+        capture=True,
+        check=True,
+    )
+
+    # sys.remote_exec needs to read the target's ELF .so files, so we nsenter
+    # the mount namespace and run the target's own python (via its PATH).
+    print(f"Attaching to PID {pid}...")
+    kubectl(
+        "exec",
+        pod,
+        "-n",
+        namespace,
+        "-c",
+        probe,
+        "--",
+        "sh",
+        "-c",
+        f"target_path=$(tr '\\0' '\\n' < /proc/{pid}/environ | grep '^PATH=' | head -1 | cut -d= -f2-) && "
+        f'nsenter --mount=/proc/{pid}/ns/mnt -- env PATH="$target_path" '
+        f"python -c \"import sys; sys.remote_exec({pid}, '{inject}')\"",
+        capture=False,
+        check=True,
+    )
+
+    print(f"Profiling for {duration}s...")
+    time.sleep(duration + 2)
+
+    kubectl(
+        "exec",
+        pod,
+        "-n",
+        namespace,
+        "-c",
+        probe,
+        "--",
+        "rm",
+        "-f",
+        f"/proc/{pid}/root{inject}",
+        capture=True,
+        check=False,
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 3b9c305e..9a2e5cad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,6 +71,9 @@ ignore = [
     "COM812", "COM819", "D206", "E111", "E114", "E117", "E501", "ISC001", "Q000", "Q001", "Q002", "Q003", "W191",
 ]
 
+[tool.ruff.lint.extend-per-file-ignores]
+"kubernetes/scripts/**/*" = ["D103", "PLR0913", "S607", "T201"]
+
 [tool.ruff.lint.isort]
 order-by-type = false
 case-sensitive = true