Update README.md
README.md (CHANGED)
@@ -43,7 +43,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 if __name__ == '__main__':
     # Create an LLM.
-    llm = LLM(model="pytorch/Phi-4-mini-instruct-float8dq")
+    llm = LLM(model="pytorch/Phi-4-mini-instruct-FP8")
     # Generate texts from the prompts.
     # The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
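For context, this hunk sits inside the README's vLLM quick-start snippet. A minimal end-to-end sketch of that snippet is below; the prompt list and the output loop are illustrative assumptions, only the `SamplingParams` and `LLM` lines appear in the file.

```Python
# Sketch of the full vLLM example around this hunk (prompts are placeholders).
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

if __name__ == '__main__':
    # Create an LLM.
    llm = LLM(model="pytorch/Phi-4-mini-instruct-FP8")
    # Generate texts from the prompts and print them.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```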
@@ -64,7 +64,7 @@ this is expected be resolved in pytorch 2.8.
 ## Serving
 Then we can serve with the following command:
 ```Shell
-vllm serve pytorch/Phi-4-mini-instruct-float8dq --tokenizer microsoft/Phi-4-mini-instruct -O3
+vllm serve pytorch/Phi-4-mini-instruct-FP8 --tokenizer microsoft/Phi-4-mini-instruct -O3
 ```
 
 # Inference with Transformers
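Once the server is up, it exposes vLLM's OpenAI-compatible API. A hedged client sketch, assuming the default endpoint `http://localhost:8000/v1` and no API key (this snippet is not part of the README):

```Python
# Query the vLLM server started above via its OpenAI-compatible API.
# Assumes the default host/port; adjust base_url if the server runs elsewhere.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="pytorch/Phi-4-mini-instruct-FP8",
    messages=[{"role": "user", "content": "Give a one-sentence summary of float8 quantization."}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```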
@@ -84,7 +84,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
 torch.random.manual_seed(0)
 
-model_path = "pytorch/Phi-4-mini-instruct-float8dq"
+model_path = "pytorch/Phi-4-mini-instruct-FP8"
 
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
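The Transformers example continues past this hunk; a sketch of how it presumably proceeds is below. The chat message and generation settings are placeholders, only `model_path` and the start of the `from_pretrained` call appear in the diff.

```Python
# Sketch: load the FP8 checkpoint with Transformers and run a chat-style generation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)

model_path = "pytorch/Phi-4-mini-instruct-FP8"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
messages = [{"role": "user", "content": "Explain float8 quantization in one paragraph."}]
print(pipe(messages, max_new_tokens=128)[0]["generated_text"])
```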
@@ -146,7 +146,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 # Push to hub
 USER_ID = "YOUR_USER_ID"
 MODEL_NAME = model_id.split("/")[-1]
-save_to = f"{USER_ID}/{MODEL_NAME}-float8dq"
+save_to = f"{USER_ID}/{MODEL_NAME}-FP8"
 quantized_model.push_to_hub(save_to, safe_serialization=False)
 tokenizer.push_to_hub(save_to)
 
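This hunk only renames the upload target; the quantization step earlier in that section presumably produces the checkpoint with torchao's float8 dynamic activation plus float8 weight config. A minimal sketch under that assumption (exact config class names depend on the installed torchao/transformers versions):

```Python
# Sketch of producing a float8 dynamic-activation / float8-weight checkpoint
# with torchao via Transformers' TorchAoConfig; class names are version-dependent.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

model_id = "microsoft/Phi-4-mini-instruct"
quant_config = TorchAoConfig(quant_type=Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```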
@@ -189,7 +189,7 @@ We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-h
 
 | Benchmark | | |
 |----------------------------------|----------------|-------------------------------|
-| | Phi-4-mini-ins | Phi-4-mini-instruct-float8dq |
+| | Phi-4-mini-ins | Phi-4-mini-instruct-FP8 |
 | **Popular aggregated benchmark** | | |
 | mmlu (0-shot) | 66.73 | 66.61 |
 | mmlu_pro (5-shot) | 46.43 | 44.58 |
@@ -221,9 +221,9 @@ https://github.com/EleutherAI/lm-evaluation-harness#install
 lm_eval --model hf --model_args pretrained=microsoft/Phi-4-mini-instruct --tasks hellaswag --device cuda:0 --batch_size 8
 ```
 
-## float8 dynamic activation and float8 weight quantization (float8dq)
+## float8 dynamic activation and float8 weight quantization (FP8)
 ```Shell
-lm_eval --model hf --model_args pretrained=pytorch/Phi-4-mini-instruct-float8dq --tasks hellaswag --device cuda:0 --batch_size 8
+lm_eval --model hf --model_args pretrained=pytorch/Phi-4-mini-instruct-FP8 --tasks hellaswag --device cuda:0 --batch_size 8
 ```
 </details>
 
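The same hellaswag check can also be driven from Python rather than the `lm_eval` CLI. A hedged sketch using lm-evaluation-harness's `simple_evaluate` entry point; the argument names are assumptions and worth verifying against the installed version:

```Python
# Run the hellaswag eval from Python via lm-evaluation-harness
# (equivalent in spirit to the CLI command above; argument names may differ by version).
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=pytorch/Phi-4-mini-instruct-FP8",
    tasks=["hellaswag"],
    batch_size=8,
    device="cuda:0",
)
print(results["results"]["hellaswag"])
```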
@@ -235,7 +235,7 @@ lm_eval --model hf --model_args pretrained=pytorch/Phi-4-mini-instruct-float8dq
 
 | Benchmark | | |
 |------------------|----------------|--------------------------------|
-| | Phi-4 mini-Ins | Phi-4-mini-instruct-float8dq |
+| | Phi-4 mini-Ins | Phi-4-mini-instruct-FP8 |
 | Peak Memory (GB) | 8.91 | 5.70 (36% reduction) |
 
 
@@ -249,8 +249,8 @@ We can use the following code to get a sense of peak memory usage during inferen
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
-# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
-model_id = "pytorch/Phi-4-mini-instruct-float8dq"
+# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-FP8"
+model_id = "pytorch/Phi-4-mini-instruct-FP8"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
|
|
| 292 |
## Results (H100 machine)
|
| 293 |
| Benchmark | | |
|
| 294 |
|----------------------------------|----------------|---------------------------------|
|
| 295 |
-
| | Phi-4 mini-Ins | Phi-4-mini-instruct-
|
| 296 |
| latency (batch_size=1) | 1.64s | 1.41s (1.16x speedup) |
|
| 297 |
| latency (batch_size=128) | 3.1s | 2.72s (1.14x speedup) |
|
| 298 |
| serving (num_prompts=1) | 1.35 req/s | 1.57 req/s (1.16x speedup) |
|
|
@@ -323,9 +323,9 @@ Run the benchmarks under `vllm` root folder:
 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model microsoft/Phi-4-mini-instruct --batch-size 1
 ```
 
-### float8dq
+### FP8
 ```Shell
-VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model pytorch/Phi-4-mini-instruct-float8dq --batch-size 1
+VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model pytorch/Phi-4-mini-instruct-FP8 --batch-size 1
 ```
 
 ## benchmark_serving
@@ -352,15 +352,15 @@ Client:
 python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer microsoft/Phi-4-mini-instruct --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model microsoft/Phi-4-mini-instruct --num-prompts 1
 ```
 
-### float8dq
+### FP8
 Server:
 ```Shell
-VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Phi-4-mini-instruct-float8dq --tokenizer microsoft/Phi-4-mini-instruct -O3
+VLLM_DISABLE_COMPILE_CACHE=1 vllm serve pytorch/Phi-4-mini-instruct-FP8 --tokenizer microsoft/Phi-4-mini-instruct -O3
 ```
 
 Client:
 ```Shell
-python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer microsoft/Phi-4-mini-instruct --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model pytorch/Phi-4-mini-instruct-float8dq --num-prompts 1
+python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer microsoft/Phi-4-mini-instruct --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model pytorch/Phi-4-mini-instruct-FP8 --num-prompts 1
 ```
 
 </details>