vLLM launch parameters for running Qwen3.6-35B-A3B

My machine is a laptop with an AMD AI 9 365 and 32 GB of RAM.

#!/bin/bash
export HSA_OVERRIDE_GFX_VERSION=11.5.0
export PYTORCH_ALLOC_CONF=expandable_segments:True

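# HSA_OVERRIDE_GFX_VERSION and PYTORCH_ALLOC_CONF are forwarded into the
# container with -e below; exporting them on the host alone does not reach Docker.
# enable_thinking=false in --default-chat-template-kwargs disables the model's thinking mode.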
docker run --rm --network=host --ipc=host \
    --device=/dev/kfd --device=/dev/dri --group-add=video \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -e HSA_OVERRIDE_GFX_VERSION -e PYTORCH_ALLOC_CONF \
    -e HF_ENDPOINT=https://hf-mirror.com \
    -e TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \
    -e TORCH_BLAS_PREFER_HIPBLASLT=1 \
    -e FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE  \
    -e VLLM_ROCM_USE_AITER=1 \
    -e HIP_VISIBLE_DEVICES=0 \
    -e OMP_NUM_THREADS=12 \
    --entrypoint python \
    vllm/vllm-openai-rocm:latest \
    -m vllm.entrypoints.openai.api_server \
    --model cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit \
    --dtype bfloat16 \
    --tensor-parallel-size 1 \
    --default-chat-template-kwargs '{"enable_thinking": false}' \
    --enable-expert-parallel \
    --language-model-only \
    --max-model-len 65536 \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 1 \
    --kv-cache-dtype fp8 \
    --generation-config vllm
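
Once the container is up, you can check the server from another terminal. This is a minimal sketch assuming the default port 8000 (no --port is set above); it uses the OpenAI-compatible endpoints that vLLM's api_server exposes:

# list the served model id
curl http://localhost:8000/v1/models

# quick chat completion against the OpenAI-compatible endpoint
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit",
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "max_tokens": 128
    }'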