Instructions to use llmware/bling-phi-1_5-v0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use llmware/bling-phi-1_5-v0 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="llmware/bling-phi-1_5-v0", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("llmware/bling-phi-1_5-v0", trust_remote_code=True, dtype="auto")
```

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use llmware/bling-phi-1_5-v0 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "llmware/bling-phi-1_5-v0"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "llmware/bling-phi-1_5-v0",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Use Docker
docker model run hf.co/llmware/bling-phi-1_5-v0
- SGLang
How to use llmware/bling-phi-1_5-v0 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "llmware/bling-phi-1_5-v0" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "llmware/bling-phi-1_5-v0",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "llmware/bling-phi-1_5-v0" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "llmware/bling-phi-1_5-v0",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

- Docker Model Runner
How to use llmware/bling-phi-1_5-v0 with Docker Model Runner:
docker model run hf.co/llmware/bling-phi-1_5-v0
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
def load_rag_benchmark_tester_ds():
    """Load the LLMWare RAG benchmark test set and return its samples as a list.

    Pulls the 200-question "llmware/rag_instruct_benchmark_tester" dataset
    from the LLMWare HuggingFace repo, prints a one-line status message that
    includes the loaded dataset object, and materializes every sample of the
    "train" split.

    Returns:
        list: the samples of the "train" split, in dataset order.
    """
    # Imported inside the function, as in the original script, so the
    # `datasets` package is only needed when the benchmark is actually loaded.
    from datasets import load_dataset

    ds_name = "llmware/rag_instruct_benchmark_tester"
    dataset = load_dataset(ds_name)
    print("update: loading RAG Benchmark test dataset - ", dataset)

    # list() copies the split in one pass; the previous enumerate/append loop
    # never used its index. To view the samples, print the returned list.
    return list(dataset["train"])
def run_test(model_name, test_ds):
    """Run the RAG benchmark against a causal LM and print each answer.

    Loads the model and tokenizer named by ``model_name``, moves the model to
    CUDA when available (otherwise CPU), and for every entry of ``test_ds``
    builds the ``<human>: ... <bot>:`` prompt used during fine-tuning, samples
    a completion, and prints it next to the entry's gold answer.

    Args:
        model_name: model identifier passed to ``from_pretrained``.
        test_ds: iterable of entries indexable by the keys "context",
            "query" and "answer".

    Returns:
        Always 0.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    print("\nRAG Performance Test - 200 questions")
    print("update: model - ", model_name)
    print("update: device - ", device)

    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    eot_marker = "<|endoftext|>"
    bot_marker = "<bot>:"

    for idx, sample in enumerate(test_ds):
        # prompt packaging used in the fine-tuning process
        prompt_text = "".join(
            ["<human>: ", sample["context"], "\n", sample["query"], "\n", "<bot>:"]
        )

        encoded = tokenizer(prompt_text, return_tensors="pt")
        prompt_ids = encoded.input_ids
        # number of prompt tokens, used below to slice off the echoed prompt
        prompt_len = prompt_ids.shape[1]

        # temperature: set at 0.3 for consistency of output
        # max_new_tokens: set at 100 - may prematurely stop a few of the summaries
        generated = model.generate(
            prompt_ids.to(device),
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.3,
            max_new_tokens=100,
        )

        completion_ids = generated[0][prompt_len:]
        answer_text = tokenizer.decode(completion_ids, skip_special_tokens=True)

        # quick/optional clean-up of potential fine-tuning artifacts:
        # keep only the text before the first end-of-text marker ...
        answer_text = answer_text.partition(eot_marker)[0]
        # ... and, if the bot marker is present, only the text after it
        _, found_marker, after_marker = answer_text.partition(bot_marker)
        if found_marker:
            answer_text = after_marker

        print("\n")
        print(idx, "llm_response - ", answer_text)
        print(idx, "gold_answer - ", sample["answer"])

    return 0
if __name__ == "__main__":
    # Script entry point: load the benchmark samples, then run them through
    # the model and print each response next to its gold answer.
    test_ds = load_rag_benchmark_tester_ds()

    # Repo id corrected from "llmware/bling-phi-1_5b-v0" to the id used by
    # every other usage example for this model ("1_5", not "1_5b").
    model_name = "llmware/bling-phi-1_5-v0"

    output = run_test(model_name, test_ds)