Spaces:
Sleeping
Sleeping
Commit
·
ee49de0
1
Parent(s):
e674a8d
add first app.py
Browse files
- app.py +35 -0
- requirements.txt +16 -0
app.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
# Pick the compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Checkpoint to load (author's note: fits in 12GB with fp16).
model_name = "Salesforce/instructblip-flan-t5-xl"

# The processor comes from the same checkpoint as the model; it is used
# further down to build the model inputs and to decode the output.
processor = InstructBlipProcessor.from_pretrained(model_name)

# Weights are loaded in half precision, with placement left to
# device_map="auto".
model = InstructBlipForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
|
| 16 |
+
|
| 17 |
+
# Load your image.
# Image.open() is lazy and keeps the underlying file open, so open it in a
# context manager to release the handle deterministically. convert("RGB")
# returns a new image, which stays usable after the file is closed.
with Image.open("example.jpg") as _source_image:
    image = _source_image.convert("RGB")
|
| 19 |
+
|
| 20 |
+
# Prompt to force paragraph-level description
prompt = (
    "Describe this image in a detailed paragraph of 5-7 sentences. "
    "Mention setting, objects, colors, actions, background details, and possible context."
)

# Turn the image and the prompt into PyTorch tensors and move them to the
# device chosen at the top of the script.
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

out = model.generate(
    **inputs,
    max_new_tokens=250,  # enough for multi-sentence
    # temperature and top_p are sampling parameters: they only take effect
    # when do_sample=True. Without it generate() decodes greedily and
    # ignores both values.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

# Decode the generated token ids to text and print the first sequence.
print(processor.batch_decode(out, skip_special_tokens=True)[0])
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face ecosystem
|
| 2 |
+
transformers==4.41.2
|
| 3 |
+
accelerate==0.33.0
|
| 4 |
+
safetensors==0.4.3
|
| 5 |
+
sentencepiece==0.2.0
|
| 6 |
+
|
| 7 |
+
# Parameter-efficient fine-tuning (LoRA) and quantization
|
| 8 |
+
peft==0.11.1
|
| 9 |
+
bitsandbytes==0.43.1
|
| 10 |
+
|
| 11 |
+
# Utilities
|
| 12 |
+
pillow==10.3.0
|
| 13 |
+
numpy==1.26.4
|
| 14 |
+
|
| 15 |
+
# Demo UI
|
| 16 |
+
gradio==4.37.2
|