From 231533870a566e70643a8ebee852d7cb94680700 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 17 May 2026 19:43:13 -0400
Subject: [PATCH 1/2] Update qwen3.5-fp8-b300-sglang (+mtp) SGLang image to
 v0.5.12-cu130

---
 .github/configs/nvidia-master.yaml | 4 ++--
 perf-changelog.yaml                | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ff6386708..12b8d9481 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2397,7 +2397,7 @@ qwen3.5-fp8-b200-sglang-mtp:
     
 
 qwen3.5-fp8-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: b300
@@ -2416,7 +2416,7 @@ qwen3.5-fp8-b300-sglang-mtp:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
 qwen3.5-fp8-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: b300
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index fc763d93b..a2bd1b0a0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3022,3 +3022,10 @@
   description:
     - "Update SGLang image from nightly-dev-cu13-20260518-c67b2870 to nightly-dev-cu13-20260519-dbac4647"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1492
+
+- config-keys:
+    - qwen3.5-fp8-b300-sglang
+    - qwen3.5-fp8-b300-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451

From e1d3a18155deb2cff8dbd4e2b79d80ad0b0fa90d Mon Sep 17 00:00:00 2001
From: claude-fix-bot <claude-fix-bot@local>
Date: Mon, 18 May 2026 02:55:32 -0400
Subject: [PATCH 2/2] fix(qwen3.5_fp8_b300): use --mm-attention-backend
 triton_attn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same workaround as PR #1422 — bypass the broken sm_103 assertion in the
flash-attn CuTe kernel, hit in the Qwen-3.5-VL vision encoder, by
switching only the multi-modal attention path to triton_attn. The text
decoder still uses --attention-backend trtllm_mha.

See sgl-project/sglang#25564 + Dao-AILab/flash-attention#2572 for the
upstream root cause and the in-flight fix.
---
 benchmarks/single_node/qwen3.5_fp8_b300.sh     | 1 +
 benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh
index b87d25e91..cbceb6f1b 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh
@@ -40,6 +40,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --kv-cache-dtype fp8_e4m3 \
 --mamba-ssm-dtype bfloat16 \
 --attention-backend trtllm_mha \
+--mm-attention-backend triton_attn \
 --moe-runner-backend flashinfer_trtllm \
 --cuda-graph-max-bs $CONC \
 --max-running-requests $CONC \
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
index a0c5f4828..ca3b87120 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
@@ -40,6 +40,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod
 --kv-cache-dtype fp8_e4m3 \
 --mamba-ssm-dtype bfloat16 \
 --attention-backend trtllm_mha \
+--mm-attention-backend triton_attn \
 --moe-runner-backend flashinfer_trtllm \
 --cuda-graph-max-bs $CONC \
 --max-running-requests $CONC \