From f6a104844150be84062c7f42324349bee7ae8360 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 17 May 2026 22:16:44 -0400 Subject: [PATCH 1/3] Update qwen3.5-fp4-b300-sglang (+mtp) SGLang image to v0.5.12-cu130 Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130 Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- perf-changelog.yaml | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ff6386708..b90c65a33 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2435,7 +2435,7 @@ qwen3.5-fp8-b300-sglang: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-fp4-b300-sglang: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 model-prefix: qwen3.5 runner: b300 @@ -2456,7 +2456,7 @@ qwen3.5-fp4-b300-sglang: - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } qwen3.5-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 model-prefix: qwen3.5 runner: b300 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fc763d93b..619cce8e7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3022,3 +3022,10 @@ description: - "Update SGLang image from nightly-dev-cu13-20260518-c67b2870 to nightly-dev-cu13-20260519-dbac4647" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1492 + +- config-keys: + - qwen3.5-fp4-b300-sglang + - qwen3.5-fp4-b300-sglang-mtp + description: + - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475 From e0930b33cad3198784352ca6a344ebcfd8fa5eea Mon Sep 17 00:00:00 2001 From: claude-fix-bot Date: Mon, 18 May 2026 12:27:41 -0400 Subject: [PATCH 2/3] fix(qwen3.5_fp4_b300): use --mm-attention-backend triton_attn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same workaround as #1422 (bf16) and #1451 (fp8) — bypass the broken flash-attn cute kernel sm_103 assertion in the Qwen-3.5-VL vision encoder by switching only the multi-modal attention path to triton_attn. Text decoder still uses --attention-backend trtllm_mha. See sgl-project/sglang#25564 (root cause: cutedsl Arch enum aliasing on non-cu13 path collapses sm_100..sm_110f range to exclude sm_103) and Dao-AILab/flash-attention#2572 for the upstream fix in flight. --- benchmarks/single_node/qwen3.5_fp4_b300.sh | 2 +- benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/qwen3.5_fp4_b300.sh index e3ae6a6e4..18b6cda09 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b300.sh +++ b/benchmarks/single_node/qwen3.5_fp4_b300.sh @@ -73,7 +73,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ ---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \ $EXTRA_ARGS --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 & diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh index 033c0408a..9cb5d5464 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh @@ -73,7 +73,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ ---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \ $EXTRA_ARGS --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --tokenizer-worker-num 6 --stream-interval 30 \ --speculative-algorithm EAGLE \ From b8f0cd58e5de1282a5551331043a71ad3b0a5c9a Mon Sep 17 00:00:00 2001 From: claude-fix-bot Date: Mon, 18 May 2026 12:50:53 -0400 Subject: [PATCH 3/3] Re-trigger sweep (previous Run Sweep run stuck pending with 0 jobs)