TIGER-Lab/MMEB-train
Viewer • Updated • 2.14M • 4.77k • 18
How to use TianchengGu/UniME-V2-LLaVA-OneVision-8B with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="TianchengGu/UniME-V2-LLaVA-OneVision-8B")

# A single user turn made of one image (referenced by URL) and one text question.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

# The processor and the model are loaded from the same checkpoint id.
MODEL_ID = "TianchengGu/UniME-V2-LLaVA-OneVision-8B"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

# A single user turn made of one image (referenced by URL) and one text question.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]

# Render the chat template into tokenized PyTorch tensors, then move them
# to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

How to use TianchengGu/UniME-V2-LLaVA-OneVision-8B with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "TianchengGu/UniME-V2-LLaVA-OneVision-8B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "TianchengGu/UniME-V2-LLaVA-OneVision-8B",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'
# Use Docker:
docker model run hf.co/TianchengGu/UniME-V2-LLaVA-OneVision-8B
How to use TianchengGu/UniME-V2-LLaVA-OneVision-8B with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "TianchengGu/UniME-V2-LLaVA-OneVision-8B" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "TianchengGu/UniME-V2-LLaVA-OneVision-8B",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'
# Use the Docker image:
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "TianchengGu/UniME-V2-LLaVA-OneVision-8B" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "TianchengGu/UniME-V2-LLaVA-OneVision-8B",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'

How to use TianchengGu/UniME-V2-LLaVA-OneVision-8B with Docker Model Runner:
docker model run hf.co/TianchengGu/UniME-V2-LLaVA-OneVision-8B
Tiancheng Gu*,
Kaicheng Yang*,
Kaichen Zhang,
Xiang An,
Ziyong Feng,
Yueyi Zhang,
Weidong Cai,
Jiankang Deng,
Lidong Bing
git clone https://github.com/deepglint/UniME-v2.git
cd UniME-v2
conda create -n uniMEv2 python=3.10 -y
conda activate uniMEv2
pip install -r requirements.txt
# Optional: Install Flash Attention for acceleration
# wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# pip install flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Quick-start demo: score an image/text pair with an embedding model, or
# rerank candidate texts against an image with the rerank model.
import torch
from torch.nn import functional as F

from utils.utils import init_model_and_processor, prepare_stage_data, parse_answer_index

device = "cuda"
# True runs the embedding branch below; False runs the rerank branch.
embedding = False

if embedding:
    model_name = "models/UniME-V2_qwen2VL_2B"
    # model_name = "models/UniME-V2_qwen2VL_7B"
    # model_name = "models/UniME-V2_LLaVA_onevision_8B"
    text = "A man is crossing the street with a red car parked nearby."
    image_path = "Figures/demo.png"
else:
    model_name = "models/UniME-v2-rerank_qwen25VL_7B"
    text = [
        "A man is crossing the street with a red car parked nearby.",  #! Target text
        "A woman is walking her dog with a blue bicycle leaning nearby.",
        "A child is riding a scooter past a green truck stopped nearby.",
        "A couple is waiting for the bus beside a yellow taxi parked nearby.",
        "A jogger is running along the path with a black motorcycle parked nearby.",
    ]
    image_path = "Figures/demo.png"

model, processor = init_model_and_processor(model_name, device, embedding=embedding)

if embedding:
    inputs_image, inputs_txt = prepare_stage_data(model_name, processor, text, image_path, embedding=embedding)
    # Move tensor entries to the target device; leave non-tensor entries untouched.
    inputs_image = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs_image.items()}
    inputs_txt = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs_txt.items()}

    with torch.no_grad():
        # The embedding is the last layer's hidden state at the final sequence position.
        emb_text = model(**inputs_txt, output_hidden_states=True, return_dict=True).hidden_states[-1][:, -1, :]
        emb_image = model(**inputs_image, output_hidden_states=True, return_dict=True).hidden_states[-1][:, -1, :]
        # L2-normalize so the dot product below is a cosine similarity.
        emb_text = F.normalize(emb_text, dim=-1)
        emb_image = F.normalize(emb_image, dim=-1)
        score = emb_image @ emb_text.T

    print("Score: ", score.item())  # qwen2VL 2B : Score: 0.62109375
else:
    inputs = prepare_stage_data(model_name, processor, text, image_path, embedding=embedding)
    # Move tensor entries to the target device; leave non-tensor entries untouched.
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    with torch.no_grad():
        # Greedy decoding (do_sample=False); only the generated sequences are kept.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            output_scores=True,
            return_dict_in_generate=True,
            do_sample=False,
        ).sequences

    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # NOTE(review): parse_answer_index presumably extracts the index of the chosen
    # candidate from the generated text -- confirm against utils/utils.py.
    print("Rerank Answer: ", parse_answer_index(output_text[0]))  # qwen25VL 7B: Rerank Answer: 0
If you find this repository useful, please use the following BibTeX entry for citation.
@misc{gu2025unimev2mllmasajudgeuniversalmultimodal,
title={UniME-V2: MLLM-as-a-Judge for Universal Multimodal Embedding Learning},
author={Tiancheng Gu and Kaicheng Yang and Kaichen Zhang and Xiang An and Ziyong Feng and Yueyi Zhang and Weidong Cai and Jiankang Deng and Lidong Bing},
year={2025},
eprint={2510.13515},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2510.13515},
}
@inproceedings{unime,
title={Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs},
author={Gu, Tiancheng and Yang, Kaicheng and Feng, Ziyong and Wang, Xingjun and Zhang, Yanzhao and Long, Dingkun and Chen, Yingda and Cai, Weidong and Deng, Jiankang},
booktitle={ACM MM},
year={2025}
}
Base model
llava-hf/llava-onevision-qwen2-7b-ov-hf