diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9228673c3..fe9368dd0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2250,7 +2250,7 @@ glm5-fp8-b200-sglang-agentic: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } glm5-fp8-b300-sglang: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: b300 @@ -2269,7 +2269,7 @@ glm5-fp8-b300-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp8-b300-sglang-mtp: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: b300 diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh index 1d0c4236e..730cc3950 100644 --- a/benchmarks/single_node/glm5_fp8_b300.sh +++ b/benchmarks/single_node/glm5_fp8_b300.sh @@ -25,7 +25,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" -export SGL_ENABLE_JIT_DEEPGEMM=1 +# Workaround for sgl-project/sglang#25551: v0.5.12 DeepGemm TMA-descriptor +# regression on B300 (sm_103) crashes CUDA graph capture with +# CUDA_ERROR_ILLEGAL_ADDRESS. Disabling JIT DeepGemm bypasses the affected +# kernel path. Restore to =1 once the upstream regression is fixed. 
+export SGL_ENABLE_JIT_DEEPGEMM=0 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh index db586dad8..0d4290dd3 100755 --- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_b300_mtp.sh @@ -25,7 +25,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" -export SGL_ENABLE_JIT_DEEPGEMM=1 +# Workaround for sgl-project/sglang#25551: v0.5.12 DeepGemm TMA-descriptor +# regression on B300 (sm_103) crashes CUDA graph capture with +# CUDA_ERROR_ILLEGAL_ADDRESS. Disabling JIT DeepGemm bypasses the affected +# kernel path. Restore to =1 once the upstream regression is fixed. +export SGL_ENABLE_JIT_DEEPGEMM=0 export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 39fa14a5c..88c2f40a5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2747,3 +2747,11 @@ description: - "Update SGLang image from v0.5.9-rocm700-mi30x to v0.5.12-rocm700-mi30x" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1425 + +- config-keys: + - glm5-fp8-b300-sglang + - glm5-fp8-b300-sglang-mtp + description: + - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130" + - "Disable JIT DeepGemm (SGL_ENABLE_JIT_DEEPGEMM=0) to bypass v0.5.12 DeepGemm TMA-descriptor regression on B300 — see sgl-project/sglang#25551" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1421