binwang777
committed on
Commit
·
3136022
1
Parent(s):
e7e8b04
update model name
Browse files- README.md +9 -4
- config.json +1 -1
- rzen_embed_inference.py +2 -2
README.md
CHANGED
|
@@ -2,6 +2,11 @@
|
|
| 2 |
|
| 3 |
RzenEmbed-v2-7B is a multimodal embedding model developed and open-sourced by 360CVGroup. It achieves state-of-the-art (SOTA) results on the MMEB-V2, MMEB-Visdoc, and MMEB-Video benchmarks (as of September 29, 2025).
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
### MMEB-V2
|
| 6 |
|
| 7 |
| Model | Model Size (B) | Overall | Image-Overall | Video-Overall | Visdoc-Overall |
|
|
@@ -63,7 +68,7 @@ Retrieve images that match text captions.
|
|
| 63 |
```python
|
| 64 |
from rzen_embed_inference import RzenEmbed
|
| 65 |
|
| 66 |
-
rzen = RzenEmbed("
|
| 67 |
|
| 68 |
queries = [
|
| 69 |
"A curious kitten and a gentle puppy share a moment of connection on the grass.",
|
|
@@ -93,7 +98,7 @@ Find text captions that best match given images.
|
|
| 93 |
```python
|
| 94 |
from rzen_embed_inference import RzenEmbed
|
| 95 |
|
| 96 |
-
rzen = RzenEmbed("
|
| 97 |
|
| 98 |
queries = [
|
| 99 |
"assets/example1.jpg",
|
|
@@ -121,7 +126,7 @@ Match text queries with document images for information retrieval.
|
|
| 121 |
```python
|
| 122 |
from rzen_embed_inference import RzenEmbed
|
| 123 |
|
| 124 |
-
rzen = RzenEmbed("
|
| 125 |
|
| 126 |
queries = [
|
| 127 |
"What is the main variable being analyzed on the x-axis of these graphs?",
|
|
@@ -168,7 +173,7 @@ def extract_frames(video_path, num_frames):
|
|
| 168 |
cap.release()
|
| 169 |
return frames
|
| 170 |
|
| 171 |
-
rzen = RzenEmbed("
|
| 172 |
|
| 173 |
queries = [
|
| 174 |
"A traditional boat glides along a river lined with blooming cherry blossoms under an overcast sky in a modern cityscape.",
|
|
|
|
| 2 |
|
| 3 |
RzenEmbed-v2-7B is a multimodal embedding model developed and open-sourced by 360CVGroup. It achieves state-of-the-art (SOTA) results on the MMEB-V2, MMEB-Visdoc, and MMEB-Video benchmarks (as of September 29, 2025).
|
| 4 |
|
| 5 |
+
|
| 6 |
+
[](https://arxiv.org/abs/2510.27350)
|
| 7 |
+
[](https://github.com/360CVGroup/RzenEmbed)
|
| 8 |
+
[](https://huggingface.co/spaces/TIGER-Lab/MMEB-Leaderboard)
|
| 9 |
+
|
| 10 |
### MMEB-V2
|
| 11 |
|
| 12 |
| Model | Model Size (B) | Overall | Image-Overall | Video-Overall | Visdoc-Overall |
|
|
|
|
| 68 |
```python
|
| 69 |
from rzen_embed_inference import RzenEmbed
|
| 70 |
|
| 71 |
+
rzen = RzenEmbed("qihoo360/RzenEmbed")
|
| 72 |
|
| 73 |
queries = [
|
| 74 |
"A curious kitten and a gentle puppy share a moment of connection on the grass.",
|
|
|
|
| 98 |
```python
|
| 99 |
from rzen_embed_inference import RzenEmbed
|
| 100 |
|
| 101 |
+
rzen = RzenEmbed("qihoo360/RzenEmbed")
|
| 102 |
|
| 103 |
queries = [
|
| 104 |
"assets/example1.jpg",
|
|
|
|
| 126 |
```python
|
| 127 |
from rzen_embed_inference import RzenEmbed
|
| 128 |
|
| 129 |
+
rzen = RzenEmbed("qihoo360/RzenEmbed")
|
| 130 |
|
| 131 |
queries = [
|
| 132 |
"What is the main variable being analyzed on the x-axis of these graphs?",
|
|
|
|
| 173 |
cap.release()
|
| 174 |
return frames
|
| 175 |
|
| 176 |
+
rzen = RzenEmbed("qihoo360/RzenEmbed")
|
| 177 |
|
| 178 |
queries = [
|
| 179 |
"A traditional boat glides along a river lined with blooming cherry blossoms under an overcast sky in a modern cityscape.",
|
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "
|
| 3 |
"architectures": [
|
| 4 |
"Qwen2VLForConditionalGeneration"
|
| 5 |
],
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "qihoo360/RzenEmbed",
|
| 3 |
"architectures": [
|
| 4 |
"Qwen2VLForConditionalGeneration"
|
| 5 |
],
|
rzen_embed_inference.py
CHANGED
|
@@ -17,7 +17,7 @@ from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration
|
|
| 17 |
class RzenEmbed(nn.Module):
|
| 18 |
def __init__(
|
| 19 |
self,
|
| 20 |
-
model_name: str = "
|
| 21 |
model_path: Optional[str] = None,
|
| 22 |
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
| 23 |
min_image_tokens=256,
|
|
@@ -345,7 +345,7 @@ def fetch_image(image: str | Image.Image, size_factor: int = IMAGE_FACTOR) -> Im
|
|
| 345 |
|
| 346 |
|
| 347 |
if __name__ == '__main__':
|
| 348 |
-
rzen = RzenEmbed("
|
| 349 |
|
| 350 |
queries = [
|
| 351 |
"A curious kitten and a gentle puppy share a moment of connection on the grass.",
|
|
|
|
| 17 |
class RzenEmbed(nn.Module):
|
| 18 |
def __init__(
|
| 19 |
self,
|
| 20 |
+
model_name: str = "qihoo360/RzenEmbed",
|
| 21 |
model_path: Optional[str] = None,
|
| 22 |
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
| 23 |
min_image_tokens=256,
|
|
|
|
| 345 |
|
| 346 |
|
| 347 |
if __name__ == '__main__':
|
| 348 |
+
rzen = RzenEmbed("qihoo360/RzenEmbed")
|
| 349 |
|
| 350 |
queries = [
|
| 351 |
"A curious kitten and a gentle puppy share a moment of connection on the grass.",
|