At the moment, the main requirement for a successful deployment is that every component is on a recent enough version:
torch: 2.9.1+cu130
vllm: nightly
transformers: 5.0.0rc3
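Before installing anything, it is worth confirming that the GPU driver and the local CUDA toolkit are visible, since the launch script below points Triton at ${CUDA_HOME}/bin/ptxas:
nvidia-smi       # the driver must support the cu130 (CUDA 13.0) wheels
nvcc --version   # the launch script expects the toolkit under /usr/local/cuda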
Installation commands
conda create -n vllm-glm47 python=3.12 -y
conda activate vllm-glm47
pip install torch==2.9.1+cu130 --index-url https://download.pytorch.org/whl/cu130
pip list  # confirm the torch build before pulling the vllm nightly
pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly/cu130
pip list  # confirm the vllm nightly install before upgrading transformers
pip install -U transformers==5.0.0rc3
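A quick sanity check that the three components landed at the expected versions and that torch can see the GPU (a minimal sketch, run inside the vllm-glm47 env):
python -c "
import torch, transformers, vllm
print('torch        :', torch.__version__, '| cuda available:', torch.cuda.is_available())
print('vllm         :', vllm.__version__)
print('transformers :', transformers.__version__)
"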
Launch script: _start-vllm.sh
#!/bin/bash
BIN_PATH=$(cd "$(dirname "$0")" && pwd)
cd "$BIN_PATH"
#source /home/dgx/ai/miniconda3/bin/activate
#conda activate vllm-nightly
#uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly/cu130 --extra-index-url https://download.pytorch.org/whl/cu130
export CUDA_HOME=/usr/local/cuda
export TRITON_PTXAS_PATH="${CUDA_HOME}/bin/ptxas"
export PATH="${CUDA_HOME}/bin:$PATH"
nohup \
vllm serve /home/dgx/ai/models/models--zai-org--GLM-4.7-Flash \
--served-model-name=zai-org/GLM-4.7-Flash \
--host=0.0.0.0 \
--port=8032 \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--gpu-memory-utilization 0.7 \
--speculative-config.method mtp \
--speculative-config.num_speculative_tokens 1 \
--tool-call-parser glm47 \
--reasoning-parser glm45 \
--enable-auto-tool-choice \
> vllm_server.log 2>&1 &
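The two --speculative-config.* flags use vLLM's dot notation for nested config fields; if the installed nightly rejects that form, the same MTP settings can be passed as a single JSON argument instead:
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1}'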
Launch wrapper: start-vllm.sh
#!/bin/bash
BIN_PATH=$(cd "$(dirname "$0")" && pwd)
cd "$BIN_PATH"
./_start-vllm.sh && tail -f ./vllm_server.log
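Verifying the service
Once vllm_server.log shows the server listening on port 8032, the OpenAI-compatible API can be exercised directly; the model name and port below match the flags in _start-vllm.sh:
curl http://localhost:8032/v1/models

curl http://localhost:8032/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "zai-org/GLM-4.7-Flash",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64
      }'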