"""
🌍 Advanced Multilingual Image Describer
Using latest Vision-Language Models (VLMs) with native multilingual support
"""
import streamlit as st
import torch
from PIL import Image
import time
from datetime import datetime
import json
import warnings
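# Silence third-party deprecation warnings so they don't clutter the app logs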
warnings.filterwarnings("ignore")
# Set page config
st.set_page_config(
page_title="Multilingual Image Describer",
page_icon="🌍",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS
st.markdown("""
<style>
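/* NOTE: st-emotion-cache-* class names are generated by Streamlit's theming
   engine and can change between Streamlit releases; re-check this selector
   after upgrading. */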
.st-emotion-cache-16txtl3 {
padding-top: 3rem;
}
.header-title {
text-align: center;
color: #2C3E50;
margin-bottom: 1rem;
}
.model-badge {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-size: 12px;
display: inline-block;
margin: 5px;
}
.language-tag {
background: #E3F2FD;
color: #1976D2;
padding: 3px 10px;
border-radius: 15px;
font-size: 12px;
margin: 2px;
display: inline-block;
}
</style>
""", unsafe_allow_html=True)
# Initialize session state
if 'model' not in st.session_state:
st.session_state.model = None
if 'model_name' not in st.session_state:
st.session_state.model_name = None
if 'results' not in st.session_state:
    st.session_state.results = None
if 'current_image' not in st.session_state:
    st.session_state.current_image = None
# Model options (latest vision-language models)
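# Each entry maps a Hugging Face model ID to display metadata, the languages
# offered in the sidebar selector, and per-language prompt templates (English
# is used as the fallback when a language has no template).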
MODEL_OPTIONS = {
"llava-hf/llava-1.5-7b-hf": {
"name": "LLaVA 1.5 (7B)",
"multilingual": True,
"languages": ["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"],
"prompt_templates": {
"en": "Describe this image in detail:",
"zh": "详细描述这张图片:",
"es": "Describe esta imagen en detalle:",
"fr": "Décrivez cette image en détail :",
"de": "Beschreiben Sie dieses Bild im Detail:",
"am": "ይህንን ምስል በዝርዝር ይግለጹ፡"
}
},
"Qwen/Qwen-VL-Chat": {
"name": "Qwen-VL-Chat",
"multilingual": True,
"languages": ["en", "zh", "ja", "ko", "fr", "de", "es", "ru"],
"prompt_templates": {
"en": "Describe this image in English:",
"zh": "用中文描述这张图片:",
"am": "በአማርኛ ይህንን ምስል ይግለጹ፡"
}
},
"vikhyatk/moondream2": {
"name": "Moondream 2",
"multilingual": True,
"languages": ["en", "es", "fr", "de"],
"prompt_templates": {
"en": "Describe this image:",
"zh": "描述这张图片:",
"am": "ይህንን ምስል ይግለጹ፡"
}
}
}
# Language mapping
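# ISO 639-1 code → human-readable UI label (flag emoji are decorative only)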
LANGUAGE_NAMES = {
"en": "🇺🇸 English",
"zh": "🇨🇳 中文",
"am": "🇪🇹 አማርኛ",
"es": "🇪🇸 Español",
"fr": "🇫🇷 Français",
"de": "🇩🇪 Deutsch",
"ar": "🇸🇦 العربية",
"hi": "🇮🇳 हिन्दी",
"ru": "🇷🇺 Русский",
"ja": "🇯🇵 日本語",
"ko": "🇰🇷 한국어",
"it": "🇮🇹 Italiano",
"pt": "🇵🇹 Português",
"tr": "🇹🇷 Türkçe"
}
@st.cache_resource(show_spinner=True)
def load_model(model_id):
"""Load the selected vision-language model"""
try:
from transformers import AutoProcessor, AutoModelForVision2Seq
st.info(f"🚀 Loading {MODEL_OPTIONS[model_id]['name']}...")
        # Load processor and model; trust_remote_code is needed for models that
        # ship custom modeling code on the Hub (e.g. Qwen-VL-Chat, Moondream 2)
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForVision2Seq.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
return processor, model, model_id
except Exception as e:
st.error(f"❌ Failed to load model: {str(e)[:200]}")
return None, None, None
def generate_caption(image, model_tuple, language="en", max_new_tokens=200, temperature=0.7):
    """Generate a caption for the image in the requested language"""
    if model_tuple is None or None in model_tuple:
        return "Model not loaded"
processor, model, loaded_model_id = model_tuple
try:
# Get prompt template based on model and language
model_info = MODEL_OPTIONS.get(loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"])
prompt_template = model_info["prompt_templates"].get(
language,
model_info["prompt_templates"].get("en", "Describe this image:")
)
# Prepare inputs
if "llava" in loaded_model_id:
# LLaVA format
prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
inputs = processor(text=prompt, images=image, return_tensors="pt")
elif "qwen" in loaded_model_id.lower():
# Qwen-VL format
prompt = f"<img>Describe this image in {LANGUAGE_NAMES.get(language, 'English')}:</img>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
else:
# Default format
inputs = processor(text=prompt_template, images=image, return_tensors="pt")
# Move to device
if torch.cuda.is_available():
inputs = {k: v.to("cuda") for k, v in inputs.items()}
        # Generate, honoring the caller's sampling settings
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True
            )
# Decode
generated_text = processor.batch_decode(
generated_ids,
skip_special_tokens=True
)[0].strip()
        # Clean up response: strip the echoed prompt where the model returns it
        if "llava" in loaded_model_id:
            if "ASSISTANT:" in generated_text:
                generated_text = generated_text.split("ASSISTANT:")[-1].strip()
        elif generated_text.startswith(prompt_template):
            generated_text = generated_text[len(prompt_template):].strip()
return generated_text
except Exception as e:
return f"Error generating description: {str(e)[:100]}"
def main():
# Title
st.markdown("<h1 class='header-title'>🌍 Advanced Multilingual Image Describer</h1>", unsafe_allow_html=True)
# Model info
st.markdown("""
<div style="text-align: center; margin-bottom: 2rem;">
<span class='model-badge'>Latest Vision-Language Models</span>
<span class='model-badge'>Native Multilingual Support</span>
<span class='model-badge'>No Translation APIs Needed</span>
</div>
""", unsafe_allow_html=True)
# Sidebar
with st.sidebar:
st.markdown("### ⚙️ Configuration")
# Model selection
st.markdown("#### 🤖 Select Model")
model_choice = st.selectbox(
"Choose a vision-language model:",
options=list(MODEL_OPTIONS.keys()),
format_func=lambda x: MODEL_OPTIONS[x]["name"],
help="LLaVA supports most languages. Qwen-VL is faster."
)
# Show model info
model_info = MODEL_OPTIONS[model_choice]
st.caption(f"✅ Languages: {len(model_info['languages'])}")
st.caption(f"📊 Parameters: 7B+")
# Language selection
st.markdown("#### 🌐 Select Language")
available_langs = model_info["languages"]
selected_lang = st.selectbox(
"Output language:",
options=available_langs,
format_func=lambda x: LANGUAGE_NAMES.get(x, x),
index=0
)
# Show language tags
st.markdown("**Supported languages:**")
lang_tags = " ".join([
f'<span class="language-tag">{LANGUAGE_NAMES.get(lang, lang)}</span>'
for lang in available_langs[:8]
])
st.markdown(f'<div>{lang_tags}</div>', unsafe_allow_html=True)
# Image upload
st.markdown("---")
st.markdown("### 📸 Upload Image")
uploaded_file = st.file_uploader(
"Choose an image file",
type=["jpg", "jpeg", "png", "webp", "bmp"],
label_visibility="collapsed"
)
# Advanced options
with st.expander("⚡ Advanced Settings"):
max_tokens = st.slider("Max tokens", 50, 500, 200, 50)
temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1)
st.markdown("---")
# Action buttons
col1, col2 = st.columns(2)
with col1:
load_btn = st.button("🔄 Load Model", use_container_width=True)
with col2:
if st.button("🗑️ Clear", use_container_width=True):
st.session_state.results = None
st.rerun()
# Load model if requested
if load_btn or (st.session_state.model is None and uploaded_file):
with st.spinner(f"Loading {model_info['name']}..."):
processor, model, model_id = load_model(model_choice)
if processor and model:
st.session_state.model = (processor, model, model_id)
st.session_state.model_name = model_info["name"]
st.success(f"✅ {model_info['name']} loaded!")
else:
st.error("❌ Failed to load model")
# Quick stats
if st.session_state.results:
st.markdown("---")
st.markdown("### 📊 Quick Stats")
col1, col2 = st.columns(2)
with col1:
st.metric("Model", st.session_state.model_name or "N/A")
with col2:
st.metric("Language", LANGUAGE_NAMES.get(selected_lang, selected_lang))
# Main content
col1, col2 = st.columns([1, 1])
with col1:
st.markdown("### 📤 Input Image")
if uploaded_file:
try:
image = Image.open(uploaded_file).convert("RGB")
                st.image(image, use_container_width=True)
st.caption(f"📏 Size: {image.size[0]}×{image.size[1]} pixels")
# Store for processing
st.session_state.current_image = image
except Exception as e:
st.error(f"Error loading image: {e}")
else:
st.info("👈 Upload an image to get started")
# Show placeholder
st.image(
"https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format",
caption="Upload your own image for analysis",
                use_container_width=True
)
with col2:
st.markdown("### 📋 Results")
        # Process the image once a model is loaded and an image has been stored
        if (uploaded_file and st.session_state.model and
                st.session_state.get("current_image") is not None):
# Generate button
if st.button("🚀 Generate Description", type="primary", use_container_width=True):
with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."):
start_time = time.time()
                    # Generate caption with the sidebar's advanced settings
                    caption = generate_caption(
                        st.session_state.current_image,
                        st.session_state.model,
                        selected_lang,
                        max_new_tokens=max_tokens,
                        temperature=temperature
                    )
processing_time = time.time() - start_time
# Store results
st.session_state.results = {
"caption": caption,
"language": selected_lang,
"language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang),
"model": st.session_state.model_name,
"processing_time": f"{processing_time:.2f}s",
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}
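                    # Stored in session_state so the result survives the rerun
                    # Streamlit performs on the next widget interaction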
# Display results
if st.session_state.results:
results = st.session_state.results
st.success(f"✅ Generated in {results['processing_time']}")
# Display caption
st.markdown("#### Generated Description")
st.markdown(f"""
<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #667eea;">
<p style="font-size: 16px; line-height: 1.6;">{results['caption']}</p>
</div>
""", unsafe_allow_html=True)
# Metadata
st.markdown("#### 📊 Analysis Details")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Model", results['model'])
with col2:
st.metric("Language", results['language_name'])
with col3:
st.metric("Time", results['processing_time'])
# Export options
st.markdown("---")
st.markdown("#### 💾 Export Results")
col1, col2 = st.columns(2)
with col1:
                # JSON export (json is imported at the top of the file)
json_data = json.dumps(results, indent=2, ensure_ascii=False)
st.download_button(
"📥 Download JSON",
json_data,
f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
"application/json",
use_container_width=True
)
with col2:
# Text export
text_data = f"""Image Description
Generated: {results['timestamp']}
Model: {results['model']}
Language: {results['language_name']}
Processing Time: {results['processing_time']}
DESCRIPTION:
{results['caption']}
---
Generated by Multilingual Image Describer
Powered by {results['model']}
"""
st.download_button(
"📥 Download TXT",
text_data,
f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
"text/plain",
use_container_width=True
)
# Try another language
st.markdown("---")
st.markdown("#### 🔄 Try Another Language")
if st.button("🔄 Generate in Different Language", use_container_width=True):
st.session_state.results = None
st.rerun()
elif uploaded_file and not st.session_state.model:
st.warning("⚠️ Please load the model first!")
st.info("Click '🔄 Load Model' in the sidebar")
elif not uploaded_file:
st.info("👈 Upload an image to begin")
# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
<p>
<strong>Powered by Latest Vision-Language Models</strong> •
<a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf" target="_blank" style="color: #667eea;">LLaVA</a> •
<a href="https://huggingface.co/Qwen/Qwen-VL-Chat" target="_blank" style="color: #667eea;">Qwen-VL</a>
</p>
<p style="font-size: 0.8em;">
Native multilingual support • No translation APIs • Direct caption generation
</p>
<p style="font-size: 0.7em; color: #999; margin-top: 15px;">
UCAS @2025 • Built with Streamlit & Transformers
</p>
</div>
""", unsafe_allow_html=True)
if __name__ == "__main__":
main()