Image-Text-to-Text
Transformers
Safetensors
PyTorch
mllama
meta
llama
llama-3
vision
conversational
text-generation-inference
Instructions to use mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision")
model = AutoModelForImageTextToText.from_pretrained("mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'

Use Docker
docker model run hf.co/mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision
- SGLang
How to use mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'

Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'

- Docker Model Runner
How to use mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision with Docker Model Runner:
docker model run hf.co/mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision
Lumimaid v0.2 8B + Llama3.2Vision Adapter
This model was created using the script below. It is compatible with:
- Llama 3.1 8B & 70B
paired respectively with
- Llama Vision 3.2 11B & 90B
Merge Script
import os

import torch
from transformers import MllamaForConditionalGeneration, MllamaProcessor, AutoModelForCausalLM

# Merge a text-only Llama checkpoint into the language tower of a Llama-Vision
# (Mllama) multimodal checkpoint, keeping the vision-specific cross-attention
# layers from the multimodal model untouched.
# NOTE: You need sufficient DRAM to load both models at once (otherwise you would
# need to process layer by layer, which is not shown here).
multimodal_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # Original Llama vision model (11B or 90B)
text_model_path = "NeverSleep/Lumimaid-v0.2-8B"  # Model to be merged (8B or 70B)
save_path = "models/merged_model"

multimodal_model = MllamaForConditionalGeneration.from_pretrained(multimodal_model_path, device_map="cpu", torch_dtype=torch.bfloat16)
multimodal_processor = MllamaProcessor.from_pretrained(multimodal_model_path)
text_model = AutoModelForCausalLM.from_pretrained(text_model_path, device_map="cpu", torch_dtype=torch.bfloat16)

state_dict_multimodal = multimodal_model.state_dict()
state_dict_text = text_model.state_dict()
num_decoder_layers_text = text_model.config.num_hidden_layers
num_decoder_layers_vision = multimodal_model.config.text_config.num_hidden_layers

# Find the list of inserted (cross-attention) decoder layers in multimodal Llama.
inserted_layers = set()
for key_multimodal in state_dict_multimodal.keys():
    if "language_model" in key_multimodal and "cross_attn" in key_multimodal and ".layers." in key_multimodal:
        # The ".layers." membership was already checked above, so the index is always parseable.
        layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0])
        inserted_layers.add(layer_num_multimodal)

# Hard-coded list of added layers (overrides the computed set above with the known values):
# inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83, 88, 93, 98} # For 90B
inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38}  # For 11B
assert len(inserted_layers) == num_decoder_layers_vision - num_decoder_layers_text, "# of added layers do not match"

# Build decoder layer map from multimodal layer# to text layer#, skipping layers listed in inserted_layers.
layer_map = dict()
layer_num_multimodal = 0
for layer_num_text in range(num_decoder_layers_text):
    while layer_num_multimodal in inserted_layers:
        layer_num_multimodal += 1  # Increment to skip mismatched (vision-inserted) layers
    layer_map[layer_num_multimodal] = layer_num_text
    layer_num_multimodal += 1

# Copy text-model weights into the matching language-tower slots, in place.
for key_multimodal in state_dict_multimodal.keys():
    if "language_model" not in key_multimodal: continue  # A multi-modal param
    if "cross_attn" in key_multimodal: continue  # A multi-modal param
    key_text = key_multimodal.replace("language_model.", "")
    if "embed_tokens.weight" in key_multimodal:  # Handle embed tokens separately
        # The multimodal vocab is larger (extra image tokens); keep those extra rows.
        assert key_text in state_dict_text, f"Key not found: {key_text}"
        extra_tokens = state_dict_multimodal[key_multimodal].shape[0] - state_dict_text[key_text].shape[0]
        state_dict_multimodal[key_multimodal][:state_dict_text[key_text].shape[0], :].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text} (preserving last {extra_tokens} tokens)")
        continue
    if "lm_head" in key_multimodal or "model.norm.weight" in key_multimodal:  # Handle other non-decoder layers separately
        assert key_text in state_dict_text, f"Key not found: {key_text}"
        state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text}")
        continue
    layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0]) if ".layers." in key_multimodal else None
    assert layer_num_multimodal is not None, f"Unknown non-decoder key encountered: {key_multimodal}"
    if layer_num_multimodal in inserted_layers: continue  # Skip mismatched layers
    assert layer_num_multimodal in layer_map, f"Layer not found in layer_map: {layer_num_multimodal}"
    layer_num_text = layer_map[layer_num_multimodal]
    key_text = key_text.replace(f".layers.{layer_num_multimodal}.", f".layers.{layer_num_text}.")
    assert key_text in state_dict_text, f"Key not found: {key_text}"
    state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
    print(f"Replaced {key_multimodal} with {key_text}")

print("Merged model successfully. Saving...")
# Apply the changes
multimodal_model.load_state_dict(state_dict_multimodal)
# Create save_path if it does not exist
os.makedirs(save_path, exist_ok=True)
multimodal_model.save_pretrained(save_path, safe_serialization=True, max_shard_size="8192MB")
multimodal_processor.save_pretrained(save_path)
print(f"Model saved to {save_path}")
Model Inference:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
# Load the merged vision-language checkpoint for inference.
model_id = "mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision"
model = MllamaForConditionalGeneration.from_pretrained(
# Checkpoint was saved in bfloat16 (see merge script), so load it the same way.
model_id,
torch_dtype=torch.bfloat16,
# Let accelerate place weights across available devices automatically.
device_map="auto",
)
# Processor bundles the image preprocessor and tokenizer/chat template.
processor = AutoProcessor.from_pretrained(model_id)
License
This project is licensed under the MIT License.
- Downloads last month
- 4
Model tree for mrcuddle/Lummimaid-v0.2-8B-Llama3.2Vision
Base model
NeverSleep/Lumimaid-v0.2-8B