From 231533870a566e70643a8ebee852d7cb94680700 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 17 May 2026 19:43:13 -0400 Subject: [PATCH 1/2] Update qwen3.5-fp8-b300-sglang (+mtp) SGLang image to v0.5.12-cu130 --- .github/configs/nvidia-master.yaml | 4 ++-- perf-changelog.yaml | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ff6386708..12b8d9481 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2397,7 +2397,7 @@ qwen3.5-fp8-b200-sglang-mtp: qwen3.5-fp8-b300-sglang-mtp: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: b300 @@ -2416,7 +2416,7 @@ qwen3.5-fp8-b300-sglang-mtp: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp8-b300-sglang: - image: lmsysorg/sglang:v0.5.10.post1-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: b300 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fc763d93b..a2bd1b0a0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3022,3 +3022,10 @@ description: - "Update SGLang image from nightly-dev-cu13-20260518-c67b2870 to nightly-dev-cu13-20260519-dbac4647" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1492 + +- config-keys: + - qwen3.5-fp8-b300-sglang + - qwen3.5-fp8-b300-sglang-mtp + description: + - "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451 From e1d3a18155deb2cff8dbd4e2b79d80ad0b0fa90d Mon Sep 17 00:00:00 2001 From: claude-fix-bot Date: Mon, 18 May 2026 02:55:32 -0400 Subject: [PATCH 2/2] fix(qwen3.5_fp8_b300): use --mm-attention-backend triton_attn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same workaround as PR #1422 — bypass the broken flash-attn cute kernel sm_103 assertion in the Qwen-3.5-VL vision encoder by switching only the multi-modal attention path to triton_attn. Text decoder still uses --attention-backend trtllm_mha. See sgl-project/sglang#25564 + Dao-AILab/flash-attention#2572 for the upstream root cause and the in-flight fix. --- benchmarks/single_node/qwen3.5_fp8_b300.sh | 1 + benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh index b87d25e91..cbceb6f1b 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh @@ -40,6 +40,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --kv-cache-dtype fp8_e4m3 \ --mamba-ssm-dtype bfloat16 \ --attention-backend trtllm_mha \ +--mm-attention-backend triton_attn \ --moe-runner-backend flashinfer_trtllm \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh index a0c5f4828..ca3b87120 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh @@ -40,6 +40,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod --kv-cache-dtype fp8_e4m3 \ --mamba-ssm-dtype bfloat16 \ --attention-backend trtllm_mha \ +--mm-attention-backend triton_attn \ --moe-runner-backend flashinfer_trtllm \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \