|
|
---
base_model:
- MiniMaxAI/MiniMax-M2
---
|
|
|
|
|
ModelOpt NVFP4 quantized MiniMax-M2
|
|
|
|
|
Instructions from another user, running on RTX Pro 6000 Blackwell GPUs:
|
|
|
|
|
Running this model on `vllm/vllm-openai:nightly` has given mixed results: sometimes it works, sometimes it does not.
|
|
|
|
|
I could not run this model with the stock vLLM v0.12.0 image because that image is built against CUDA 12.9.

Other discussions of this model report that it does work with vLLM v0.12.0 when built against CUDA 13.
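
A quick way to confirm the mismatch on your own machine (the exact image tag here is an assumption based on vLLM's usual release tagging):

```bash
# Driver side: the "CUDA Version" field is the highest CUDA runtime the installed driver supports
nvidia-smi

# Image side: print the CUDA version that PyTorch inside the stock v0.12.0 image was built against
docker run --rm --entrypoint python3 vllm/vllm-openai:v0.12.0 \
  -c 'import torch; print(torch.version.cuda)'
```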
|
|
|
|
|
I stepped through the following instructions to reliably build vLLM and run this model using vLLM v0.12.0 and CUDA 13.0.2.
|
|
|
|
|
# Instructions |
|
|
## 1. Build the vLLM Image
|
|
|
|
|
```bash
# Clone the vLLM repo
if [[ ! -d vllm ]]; then
  git clone https://github.com/vllm-project/vllm.git
fi

# Check out the v0.12.0 release branch of vLLM
cd vllm
git checkout releases/v0.12.0

# Build with CUDA 13.0.2, the precompiled vLLM cu130 wheels, and an Ubuntu 22.04 base image
DOCKER_BUILDKIT=1 \
docker build . \
  --target vllm-openai \
  --tag vllm/vllm-openai:custom-vllm-0.12.0-cuda-13.0.2-py-3.12 \
  --file docker/Dockerfile \
  --build-arg max_jobs=64 \
  --build-arg nvcc_threads=16 \
  --build-arg CUDA_VERSION=13.0.2 \
  --build-arg PYTHON_VERSION=3.12 \
  --build-arg VLLM_USE_PRECOMPILED=true \
  --build-arg VLLM_MAIN_CUDA_VERSION=130 \
  --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \
  --build-arg RUN_WHEEL_CHECK=false \
  ;
# Ubuntu 22.04 is required because there is no CUDA 13.0.2 variant of the 20.04 base image
```
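
After the build completes, a quick check against the tag used above confirms the image contains vLLM 0.12.0 built against CUDA 13 before moving on:

```bash
# Print the vLLM version, the CUDA version its PyTorch build targets, and GPU visibility
docker run --rm --gpus all --entrypoint python3 \
  vllm/vllm-openai:custom-vllm-0.12.0-cuda-13.0.2-py-3.12 \
  -c 'import vllm, torch; print(vllm.__version__, torch.version.cuda, torch.cuda.is_available())'
```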
|
|
|
|
|
## 2. Run the Custom vLLM Image
|
|
|
|
|
```yaml
services:
  vllm:

    # !!! Notice !!! the custom-built image from the build command above is used here
    image: vllm/vllm-openai:custom-vllm-0.12.0-cuda-13.0.2-py-3.12

    environment:
      # Optional
      VLLM_NO_USAGE_STATS: "1"
      DO_NOT_TRACK: "1"
      CUDA_DEVICE_ORDER: PCI_BUS_ID
      VLLM_LOGGING_LEVEL: INFO

      # Required (I think)
      VLLM_ATTENTION_BACKEND: FLASHINFER
      VLLM_FLASHINFER_MOE_BACKEND: throughput
      VLLM_USE_FLASHINFER_MOE_FP16: 1
      VLLM_USE_FLASHINFER_MOE_FP8: 1
      VLLM_USE_FLASHINFER_MOE_FP4: 1
      VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: 1
      VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: 1
      VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: 1

      # Required (I think)
      VLLM_WORKER_MULTIPROC_METHOD: spawn
      NCCL_P2P_DISABLE: 1

    entrypoint: /bin/bash
    command:
      - -c
      - |
        vllm serve \
          /root/.cache/huggingface/hub/models--lukealonso--MiniMax-M2-NVFP4/snapshots/d8993b15556ab7294530f1ba50a93ad130166174/ \
          --served-model-name minimax-m2-fp4 \
          --gpu-memory-utilization 0.95 \
          --pipeline-parallel-size 1 \
          --enable-expert-parallel \
          --tensor-parallel-size 4 \
          --max-model-len $(( 192 * 1024 )) \
          --max-num-seqs 32 \
          --enable-auto-tool-choice \
          --reasoning-parser minimax_m2_append_think \
          --tool-call-parser minimax_m2 \
          --all2all-backend pplx \
          --enable-prefix-caching \
          --enable-chunked-prefill \
          --max-num-batched-tokens $(( 64 * 1024 )) \
          --dtype auto \
          --kv-cache-dtype fp8 \
          ;
```
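
Once the container is up, the server exposes the usual OpenAI-compatible API. A minimal smoke test, assuming port 8000 is published to the host (add a `ports: ["8000:8000"]` mapping if it is not, as in the compose file at the bottom of this page):

```bash
# Start the service defined above
docker compose up -d vllm

# List the served models (should include "minimax-m2-fp4")
curl -s http://localhost:8000/v1/models

# Minimal chat completion against the served model
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "minimax-m2-fp4",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'
```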
|
|
|
|
|
# vLLM CLI Arguments Explained
|
|
|
|
|
## Required Model Arguments |
|
|
|
|
|
1. The path to the model (alternatively, the Hugging Face ID `lukealonso/MiniMax-M2-NVFP4`)
   - `/root/.cache/huggingface/hub/models--lukealonso--MiniMax-M2-NVFP4/snapshots/d8993b15556ab7294530f1ba50a93ad130166174/`
2. The MiniMax-M2 reasoning/tool-call parsers & tool config (see the example request after this list)
   - `--reasoning-parser minimax_m2_append_think`
   - `--tool-call-parser minimax_m2`
   - `--enable-auto-tool-choice`
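
To show what the parser and tool-choice flags enable, here is a sketch of a request that passes a tool definition and lets the server decide when to call it; the `get_weather` tool is made up for illustration, and the server from step 2 (serving `minimax-m2-fp4` on port 8000) is assumed:

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "minimax-m2-fp4",
        "messages": [{"role": "user", "content": "What is the weather in Berlin right now?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }],
        "tool_choice": "auto"
      }'
```

With `--enable-auto-tool-choice` and `--tool-call-parser minimax_m2`, the response should carry a structured `tool_calls` entry rather than raw text.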
|
|
|
|
|
## Required Compute Arguments |
|
|
1. The parallelism mode (multi-GPU, 4x in this example; see the sketch after this list for adapting the size)
   - `--enable-expert-parallel`
   - `--pipeline-parallel-size 1`
   - `--tensor-parallel-size 4`
   - `--all2all-backend pplx`
2. The KV cache & model weight data types
   - `--kv-cache-dtype fp8`
   - `--dtype auto`
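
As a small sketch, the tensor-parallel size can be derived from the number of visible GPUs rather than hard-coded (assumes `nvidia-smi` is on the PATH; only the parallelism-related flags are shown):

```bash
# Count visible GPUs and use that count as the tensor-parallel size
NUM_GPUS=$(nvidia-smi --list-gpus | wc -l)

vllm serve lukealonso/MiniMax-M2-NVFP4 \
  --tensor-parallel-size "${NUM_GPUS}" \
  --pipeline-parallel-size 1 \
  --enable-expert-parallel \
  --all2all-backend pplx
```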
|
|
|
|
|
## Optional Model Arguments |
|
|
1. The name of the model to present to API clients
   - `--served-model-name minimax-m2-fp4`
2. The context size available to the model (192k is the maximum)
   - `--max-model-len $(( 192 * 1024 ))`
3. The prompt chunking size (faster time-to-first-token with large prompts); the shell arithmetic expands as shown below
   - `--enable-chunked-prefill`
   - `--max-num-batched-tokens $(( 64 * 1024 ))`
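
For clarity, the `$(( ... ))` expressions are plain bash arithmetic that expands before `vllm serve` ever sees the flags:

```bash
# The values vLLM actually receives:
echo $(( 192 * 1024 ))   # 196608 -> --max-model-len
echo $(( 64 * 1024 ))    # 65536  -> --max-num-batched-tokens
```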
|
|
|
|
|
## Optional Performance Arguments |
|
|
1. How much of each GPU's VRAM vLLM may use (see the check after this list)
   - `--gpu-memory-utilization 0.95`
2. How many requests the server will process concurrently
   - `--max-num-seqs 32`
3. Allow KV-cache sharing for overlapping prompt prefixes
   - `--enable-prefix-caching`
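
A quick way to sanity-check the memory settings after startup (assumes `nvidia-smi` and `watch` are available on the host):

```bash
# With --gpu-memory-utilization 0.95, used memory should settle near 95% of each GPU's total
watch -n 2 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv'
```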
|
|
|
|
|
|
|
|
|
|
|
Tested (but not extensively validated) on *2x* RTX Pro 6000 Blackwell via the compose file below.

(Note that these instructions no longer work due to nightly vLLM breaking NVFP4 support.)
|
|
|
|
|
```yaml
inference:
  image: vllm/vllm-openai:nightly
  container_name: inference
  ports:
    - "0.0.0.0:8000:8000"
  gpus: "all"
  shm_size: "32g"
  ipc: "host"
  ulimits:
    memlock: -1
    nofile: 1048576
  environment:
    - NCCL_IB_DISABLE=1
    - NCCL_NVLS_ENABLE=0
    - NCCL_P2P_DISABLE=0
    - NCCL_SHM_DISABLE=0
    - VLLM_USE_V1=1
    - VLLM_USE_FLASHINFER_MOE_FP4=1
    - OMP_NUM_THREADS=8
    - SAFETENSORS_FAST_GPU=1
  volumes:
    - /dev/shm:/dev/shm
  command:
    - lukealonso/MiniMax-M2-NVFP4
    - --enable-auto-tool-choice
    - --tool-call-parser
    - minimax_m2
    - --reasoning-parser
    - minimax_m2_append_think
    - --all2all-backend
    - pplx
    - --enable-expert-parallel
    - --enable-prefix-caching
    - --enable-chunked-prefill
    - --served-model-name
    - "MiniMax-M2"
    - --tensor-parallel-size
    - "2"
    - --gpu-memory-utilization
    - "0.95"
    - --max-num-batched-tokens
    - "16384"
    - --dtype
    - "auto"
    - --max-num-seqs
    - "8"
    - --kv-cache-dtype
    - fp8
    - --host
    - "0.0.0.0"
    - --port
    - "8000"
```