diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..1caa10cae 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2416,7 +2416,7 @@ qwen3.5-fp8-b200-sglang-mtp: qwen3.5-fp8-b300-sglang-mtp: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: b300 @@ -2435,7 +2435,7 @@ qwen3.5-fp8-b300-sglang-mtp: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp8-b300-sglang: - image: lmsysorg/sglang:v0.5.10.post1-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: b300 diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh index b87d25e91..cbceb6f1b 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh @@ -40,6 +40,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. 
--kv-cache-dtype fp8_e4m3 \ --mamba-ssm-dtype bfloat16 \ --attention-backend trtllm_mha \ +--mm-attention-backend triton_attn \ --moe-runner-backend flashinfer_trtllm \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh index a0c5f4828..ca3b87120 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh @@ -40,6 +40,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod --kv-cache-dtype fp8_e4m3 \ --mamba-ssm-dtype bfloat16 \ --attention-backend trtllm_mha \ +--mm-attention-backend triton_attn \ --moe-runner-backend flashinfer_trtllm \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 684d40dcc..e1e4be928 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3068,3 +3068,10 @@ description: - "Bump image to rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1548 + +- config-keys: + - qwen3.5-fp8-b300-sglang + - qwen3.5-fp8-b300-sglang-mtp + description: + - "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130; add --mm-attention-backend triton_attn to both B300 launch scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451