SemiAnalysisAI · functionstackx · May 17, 2026 · May 17, 2026 · May 18, 2026 · May 18, 2026
@@ -2397,7 +2397,7 @@ qwen3.5-fp8-b200-sglang-mtp:
 
 
 qwen3.5-fp8-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: b300
@@ -2416,7 +2416,7 @@ qwen3.5-fp8-b300-sglang-mtp:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
 qwen3.5-fp8-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: b300

diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh
@@ -40,6 +40,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --kv-cache-dtype fp8_e4m3 \
 --mamba-ssm-dtype bfloat16 \
 --attention-backend trtllm_mha \
+--mm-attention-backend triton_attn \
 --moe-runner-backend flashinfer_trtllm \
 --cuda-graph-max-bs $CONC \
 --max-running-requests $CONC \

diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
@@ -40,6 +40,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod
 --kv-cache-dtype fp8_e4m3 \
 --mamba-ssm-dtype bfloat16 \
 --attention-backend trtllm_mha \
+--mm-attention-backend triton_attn \
 --moe-runner-backend flashinfer_trtllm \
 --cuda-graph-max-bs $CONC \
 --max-running-requests $CONC \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -2723,3 +2723,11 @@
   description:
     - "Add MTP/EAGLE speculative-decoding sibling for glm5-fp8-h200-sglang on lmsysorg/sglang:v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1480
+
+- config-keys:
+    - qwen3.5-fp8-b300-sglang
+    - qwen3.5-fp8-b300-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.10.post1-cu130 (non-MTP) / v0.5.11-cu130 (MTP) to v0.5.12-cu130"
+    - "Add --mm-attention-backend triton_attn to bypass flash-attn cute sm_103 assertion (see sgl-project/sglang#25564)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451