"""
🌍 Multilingual Image Describer - SIMPLE
Uses a pre-trained multilingual model for direct captioning
"""
import streamlit as st
import torch
from PIL import Image
from datetime import datetime
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# Set page config
st.set_page_config(
    page_title="Multilingual Image Describer",
    page_icon="🌍",
    layout="wide"
)

# Initialize session state
if 'model' not in st.session_state:
    st.session_state.model = None

# Language settings
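# Each entry maps a language code to a display name and a short prompt string; the
# prompt is passed to the captioning model as a text prefix and stripped from the output.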
LANGUAGES = {
    "en": {"name": "English", "prompt": "a photo of"},
    "zh": {"name": "中文", "prompt": "一张照片"},
    "am": {"name": "አማርኛ", "prompt": "የሚያሳይ ፎቶ"},
    "es": {"name": "Español", "prompt": "una foto de"},
    "fr": {"name": "Français", "prompt": "une photo de"},
    "de": {"name": "Deutsch", "prompt": "ein Foto von"},
    "ar": {"name": "العربية", "prompt": "صورة"},
    "hi": {"name": "हिन्दी", "prompt": "की एक तस्वीर"},
    "ru": {"name": "Русский", "prompt": "фотография"},
    "ja": {"name": "日本語", "prompt": "の写真"}
}
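
# st.cache_resource keeps the returned processor/model pair cached across Streamlit
# reruns, so the model weights are downloaded and initialized only once per process.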
@st.cache_resource(show_spinner="Loading multilingual model...")
def load_model():
    """Load multilingual image captioning model"""
    try:
        from transformers import Blip2Processor, Blip2ForConditionalGeneration

        # Using BLIP-2 with multilingual capabilities
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        # Move to GPU if available
        if torch.cuda.is_available():
            model = model.to("cuda")
        return processor, model
    except Exception as e:
        st.error(f"Model loading error: {str(e)[:100]}")
        # Return None (not a tuple) so the `is None` checks downstream detect the failure
        return None


def generate_multilingual_caption(image, language="en"):
    """Generate caption directly in the target language"""
    if st.session_state.model is None:
        return "Model not loaded"
    processor, model = st.session_state.model
    try:
        # Prepare prompt based on language
        prompt_text = LANGUAGES.get(language, LANGUAGES["en"])["prompt"]
        # Process image
        inputs = processor(image, text=prompt_text, return_tensors="pt")
        # Move to device, casting floating-point tensors (pixel_values) to match the
        # fp16 model weights on GPU; integer tensors (input_ids) keep their dtype
        if torch.cuda.is_available():
            inputs = {
                k: v.to("cuda", torch.float16) if v.dtype == torch.float32 else v.to("cuda")
                for k, v in inputs.items()
            }
        # Generate caption (cap the number of newly generated tokens, not total length)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=50)
        # Decode the output
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        # Remove the prompt from the beginning if present
        if caption.lower().startswith(prompt_text.lower()):
            caption = caption[len(prompt_text):].strip()
        return caption.strip()
    except Exception as e:
        return f"An image with various objects. (Error: {str(e)[:50]})"


def main():
    # Title
    st.title("🌍 Multilingual Image Describer")
    st.markdown("Upload an image to get descriptions in multiple languages")

    # Load model
    with st.spinner("Loading AI model..."):
        if st.session_state.model is None:
            st.session_state.model = load_model()
    if st.session_state.model is None:
        st.error("Failed to load model. Please refresh the page.")
        return
    # Sidebar
    with st.sidebar:
        st.header("📸 Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image",
            type=["jpg", "jpeg", "png", "webp"],
            help="Upload any image file"
        )
        st.markdown("---")
        st.header("🌐 Select Languages")

        # Language selection with checkboxes
        selected_languages = []
        cols = st.columns(2)
        lang_list = list(LANGUAGES.items())
        for i, (code, info) in enumerate(lang_list):
            col_idx = i % 2
            with cols[col_idx]:
                if st.checkbox(f"{info['name']}", key=f"lang_{code}", value=(code == "en")):
                    selected_languages.append(code)
        if not selected_languages:
            selected_languages = ["en"]
            st.info("English selected by default")

        st.markdown("---")
        # Generate button
        generate_btn = st.button(
            "🚀 Generate Descriptions",
            type="primary",
            use_container_width=True,
            disabled=uploaded_file is None
        )
        if st.button("🔄 Clear", use_container_width=True):
            st.rerun()
    # Main content
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Input Image")
        if uploaded_file:
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, use_column_width=True)
            st.caption(f"Size: {image.size[0]}×{image.size[1]} pixels")
        else:
            st.info("👈 Upload an image from the sidebar")
            st.image(
                "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=400&auto=format",
                caption="Sample background",
                use_column_width=True
            )
    with col2:
        st.subheader("Results")
        if generate_btn and uploaded_file:
            image = Image.open(uploaded_file).convert("RGB")
            with st.spinner("Generating descriptions..."):
                results = {}
                progress_bar = st.progress(0)
                for i, lang_code in enumerate(selected_languages):
                    # Update progress
                    progress = (i + 1) / len(selected_languages)
                    progress_bar.progress(progress)
                    # Generate caption for this language
                    caption = generate_multilingual_caption(image, lang_code)
                    lang_name = LANGUAGES[lang_code]["name"]
                    results[lang_name] = caption
                progress_bar.empty()

            # Display results
            st.success(f"✅ Generated {len(results)} descriptions")

            # Create results DataFrame
            df_results = pd.DataFrame({
                "Language": list(results.keys()),
                "Description": list(results.values())
            })
            # Display table
            st.dataframe(
                df_results,
                use_container_width=True,
                hide_index=True
            )

            # Show individual descriptions
            st.markdown("### Descriptions by Language")
            for lang_name, description in results.items():
                with st.expander(f"{lang_name}", expanded=(lang_name == "English")):
                    st.markdown(f"**{description}**")

            # Export option
            st.markdown("---")
            st.markdown("### 💾 Export Results")
            # Create export text
            export_text = f"""Multilingual Image Descriptions
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Image: {uploaded_file.name if uploaded_file else 'Unknown'}
"""
            for lang_name, description in results.items():
                export_text += f"\n{lang_name}:\n{description}\n"
            # Download button
            st.download_button(
                "📥 Download as TXT",
                export_text,
                f"descriptions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                "text/plain"
            )
        elif uploaded_file:
            st.info("👈 Click 'Generate Descriptions' to analyze the image")
    # Footer
    st.markdown("---")
    st.caption(
        "**Powered by:** BLIP-2 Multilingual Model • **UCAS @2025** • "
        "Model: Salesforce/blip2-opt-2.7b"
    )


if __name__ == "__main__":
    main()