| """ | |
| 🌍 Advanced Multilingual Image Describer | |
| Using latest Vision-Language Models (VLMs) with native multilingual support | |
| """ | |
| import streamlit as st | |
| import torch | |
| from PIL import Image | |
| import time | |
| from datetime import datetime | |
| import pandas as pd | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
# Set page config
st.set_page_config(
    page_title="Multilingual Image Describer",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS
st.markdown("""
<style>
.st-emotion-cache-16txtl3 {
    padding-top: 3rem;
}
.header-title {
    text-align: center;
    color: #2C3E50;
    margin-bottom: 1rem;
}
.model-badge {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 5px 15px;
    border-radius: 20px;
    font-size: 12px;
    display: inline-block;
    margin: 5px;
}
.language-tag {
    background: #E3F2FD;
    color: #1976D2;
    padding: 3px 10px;
    border-radius: 15px;
    font-size: 12px;
    margin: 2px;
    display: inline-block;
}
</style>
""", unsafe_allow_html=True)
# Initialize session state
if 'model' not in st.session_state:
    st.session_state.model = None
if 'model_name' not in st.session_state:
    st.session_state.model_name = None
if 'results' not in st.session_state:
    st.session_state.results = None
if 'current_image' not in st.session_state:
    st.session_state.current_image = None  # set once an upload succeeds
# Model options (latest vision-language models)
MODEL_OPTIONS = {
    "llava-hf/llava-1.5-7b-hf": {
        "name": "LLaVA 1.5 (7B)",
        "multilingual": True,
        "languages": ["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"],
        "prompt_templates": {
            "en": "Describe this image in detail:",
            "zh": "详细描述这张图片:",
            "es": "Describe esta imagen en detalle:",
            "fr": "Décrivez cette image en détail :",
            "de": "Beschreiben Sie dieses Bild im Detail:",
            "am": "ይህንን ምስል በዝርዝር ይግለጹ፡"
        }
    },
    "Qwen/Qwen-VL-Chat": {
        "name": "Qwen-VL-Chat",
        "multilingual": True,
        "languages": ["en", "zh", "ja", "ko", "fr", "de", "es", "ru"],
        "prompt_templates": {
            "en": "Describe this image in English:",
            "zh": "用中文描述这张图片:",
            "am": "በአማርኛ ይህንን ምስል ይግለጹ፡"
        }
    },
    "vikhyatk/moondream2": {
        "name": "Moondream 2",
        "multilingual": True,
        "languages": ["en", "es", "fr", "de"],
        "prompt_templates": {
            "en": "Describe this image:",
            "zh": "描述这张图片:",
            "am": "ይህንን ምስል ይግለጹ፡"
        }
    }
}
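
# Illustrative helper (a sketch; the app itself performs this lookup inline in
# generate_caption below): prompt templates fall back to English, then to a
# generic English prompt, when a model lacks the requested language.
def _resolve_prompt_template(model_id, language):
    """Return the prompt template for `language`, falling back to English."""
    templates = MODEL_OPTIONS[model_id]["prompt_templates"]
    return templates.get(language, templates.get("en", "Describe this image:"))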
# Language mapping
LANGUAGE_NAMES = {
    "en": "🇺🇸 English",
    "zh": "🇨🇳 中文",
    "am": "🇪🇹 አማርኛ",
    "es": "🇪🇸 Español",
    "fr": "🇫🇷 Français",
    "de": "🇩🇪 Deutsch",
    "ar": "🇸🇦 العربية",
    "hi": "🇮🇳 हिन्दी",
    "ru": "🇷🇺 Русский",
    "ja": "🇯🇵 日本語",
    "ko": "🇰🇷 한국어",
    "it": "🇮🇹 Italiano",
    "pt": "🇵🇹 Português",
    "tr": "🇹🇷 Türkçe"
}
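
# Sanity-check sketch (illustrative, not called by the UI): every language code
# a model advertises should have a display name above. Codes that appear only
# in prompt_templates (e.g. "am" where Amharic is not advertised) are ignored,
# since the selectbox never offers them.
def _unlabelled_languages():
    """Return advertised language codes missing from LANGUAGE_NAMES."""
    advertised = {lang for info in MODEL_OPTIONS.values() for lang in info["languages"]}
    return sorted(advertised - LANGUAGE_NAMES.keys())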
def load_model(model_id):
    """Load the selected vision-language model."""
    try:
        from transformers import AutoProcessor, AutoModelForVision2Seq

        st.info(f"🚀 Loading {MODEL_OPTIONS[model_id]['name']}...")
        # Load processor and model. trust_remote_code is needed for repos that
        # ship custom modeling code (Qwen-VL-Chat, Moondream 2); fp16 only
        # helps on GPU, so fall back to fp32 on CPU.
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForVision2Seq.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True,
        )
        return processor, model, model_id
    except Exception as e:
        st.error(f"❌ Failed to load model: {str(e)[:200]}")
        return None, None, None
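
# Optional alternative loader (a sketch, assuming Streamlit >= 1.18 where
# st.cache_resource is available): caching keeps the heavyweight model in
# memory across reruns and sessions instead of stashing it in
# st.session_state. The app keeps its original approach; this wrapper is
# illustrative and unused.
@st.cache_resource(show_spinner=False)
def load_model_cached(model_id):
    """Cached wrapper around load_model(); returns (processor, model, model_id)."""
    return load_model(model_id)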
def generate_caption(image, model_tuple, language="en", max_new_tokens=200, temperature=0.7):
    """Generate a caption in the requested language using the loaded VLM."""
    if model_tuple is None or None in model_tuple:
        return "Model not loaded"
    processor, model, loaded_model_id = model_tuple
    try:
        # Get prompt template based on model and language
        model_info = MODEL_OPTIONS.get(loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"])
        prompt_template = model_info["prompt_templates"].get(
            language,
            model_info["prompt_templates"].get("en", "Describe this image:")
        )
        # Prepare inputs
        if "llava" in loaded_model_id:
            # LLaVA 1.5 chat format
            prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
            inputs = processor(text=prompt, images=image, return_tensors="pt")
        elif "qwen" in loaded_model_id.lower():
            # Qwen-VL image-tag format (Qwen-VL-Chat normally expects its own
            # chat API, so this generic processor path may need adjustment)
            prompt = f"<img>Describe this image in {LANGUAGE_NAMES.get(language, 'English')}:</img>"
            inputs = processor(text=prompt, images=image, return_tensors="pt")
        else:
            # Default format
            inputs = processor(text=prompt_template, images=image, return_tensors="pt")
        # Move to device
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        # Generate
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
            )
        # Decode
        generated_text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0].strip()
        # Strip the echoed prompt from LLaVA output
        if "llava" in loaded_model_id and "ASSISTANT:" in generated_text:
            generated_text = generated_text.split("ASSISTANT:")[-1].strip()
        return generated_text
    except Exception as e:
        return f"Error generating description: {str(e)[:100]}"

def main():
    # Title
    st.markdown("<h1 class='header-title'>🌍 Advanced Multilingual Image Describer</h1>", unsafe_allow_html=True)
    # Model info badges
    st.markdown("""
    <div style="text-align: center; margin-bottom: 2rem;">
        <span class='model-badge'>Latest Vision-Language Models</span>
        <span class='model-badge'>Native Multilingual Support</span>
        <span class='model-badge'>No Translation APIs Needed</span>
    </div>
    """, unsafe_allow_html=True)
    # Sidebar
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")
        # Model selection
        st.markdown("#### 🤖 Select Model")
        model_choice = st.selectbox(
            "Choose a vision-language model:",
            options=list(MODEL_OPTIONS.keys()),
            format_func=lambda x: MODEL_OPTIONS[x]["name"],
            help="LLaVA supports most languages. Qwen-VL is faster.",
        )
        # Show model info
        model_info = MODEL_OPTIONS[model_choice]
        st.caption(f"✅ Languages: {len(model_info['languages'])}")
        st.caption("📊 Parameters: ~2B-10B depending on the model")
        # Language selection
        st.markdown("#### 🌐 Select Language")
        available_langs = model_info["languages"]
        selected_lang = st.selectbox(
            "Output language:",
            options=available_langs,
            format_func=lambda x: LANGUAGE_NAMES.get(x, x),
            index=0,
        )
        # Show language tags
        st.markdown("**Supported languages:**")
        lang_tags = " ".join(
            f'<span class="language-tag">{LANGUAGE_NAMES.get(lang, lang)}</span>'
            for lang in available_langs[:8]
        )
        st.markdown(f'<div>{lang_tags}</div>', unsafe_allow_html=True)
        # Image upload
        st.markdown("---")
        st.markdown("### 📸 Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image file",
            type=["jpg", "jpeg", "png", "webp", "bmp"],
            label_visibility="collapsed",
        )
        # Advanced options (these values are passed to generate_caption below)
        with st.expander("⚡ Advanced Settings"):
            max_tokens = st.slider("Max tokens", 50, 500, 200, 50)
            temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1)
        st.markdown("---")
        # Action buttons
        col1, col2 = st.columns(2)
        with col1:
            load_btn = st.button("🔄 Load Model", use_container_width=True)
        with col2:
            if st.button("🗑️ Clear", use_container_width=True):
                st.session_state.results = None
                st.rerun()
        # Load model if requested (or automatically on first upload)
        if load_btn or (st.session_state.model is None and uploaded_file):
            with st.spinner(f"Loading {model_info['name']}..."):
                processor, model, model_id = load_model(model_choice)
                if processor is not None and model is not None:
                    st.session_state.model = (processor, model, model_id)
                    st.session_state.model_name = model_info["name"]
                    st.success(f"✅ {model_info['name']} loaded!")
                else:
                    st.error("❌ Failed to load model")
        # Quick stats
        if st.session_state.results:
            st.markdown("---")
            st.markdown("### 📊 Quick Stats")
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Model", st.session_state.model_name or "N/A")
            with col2:
                st.metric("Language", LANGUAGE_NAMES.get(selected_lang, selected_lang))
    # Main content
    col1, col2 = st.columns([1, 1])
    with col1:
        st.markdown("### 📤 Input Image")
        if uploaded_file:
            try:
                image = Image.open(uploaded_file).convert("RGB")
                st.image(image, use_container_width=True)
                st.caption(f"📏 Size: {image.size[0]}×{image.size[1]} pixels")
                # Store for processing
                st.session_state.current_image = image
            except Exception as e:
                st.error(f"Error loading image: {e}")
        else:
            st.info("👈 Upload an image to get started")
            # Show placeholder
            st.image(
                "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format",
                caption="Upload your own image for analysis",
                use_container_width=True,
            )
    with col2:
        st.markdown("### 📋 Results")
        # Process image if model is loaded
        if (uploaded_file and st.session_state.model
                and st.session_state.get("current_image") is not None):
            # Generate button
            if st.button("🚀 Generate Description", type="primary", use_container_width=True):
                with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."):
                    start_time = time.time()
                    # Generate caption with the sidebar's sampling settings
                    caption = generate_caption(
                        st.session_state.current_image,
                        st.session_state.model,
                        selected_lang,
                        max_new_tokens=max_tokens,
                        temperature=temperature,
                    )
                    processing_time = time.time() - start_time
                    # Store results
                    st.session_state.results = {
                        "caption": caption,
                        "language": selected_lang,
                        "language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang),
                        "model": st.session_state.model_name,
                        "processing_time": f"{processing_time:.2f}s",
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    }
            # Display results
            if st.session_state.results:
                results = st.session_state.results
                st.success(f"✅ Generated in {results['processing_time']}")
                # Display caption
                st.markdown("#### Generated Description")
                st.markdown(f"""
                <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #667eea;">
                    <p style="font-size: 16px; line-height: 1.6;">{results['caption']}</p>
                </div>
                """, unsafe_allow_html=True)
                # Metadata
                st.markdown("#### 📊 Analysis Details")
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Model", results['model'])
                with col2:
                    st.metric("Language", results['language_name'])
                with col3:
                    st.metric("Time", results['processing_time'])
                # Export options
                st.markdown("---")
                st.markdown("#### 💾 Export Results")
                col1, col2 = st.columns(2)
                with col1:
                    # JSON export
                    json_data = json.dumps(results, indent=2, ensure_ascii=False)
                    st.download_button(
                        "📥 Download JSON",
                        json_data,
                        f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                        "application/json",
                        use_container_width=True,
                    )
                with col2:
                    # Text export
                    text_data = f"""Image Description
Generated: {results['timestamp']}
Model: {results['model']}
Language: {results['language_name']}
Processing Time: {results['processing_time']}

DESCRIPTION:
{results['caption']}

---
Generated by Multilingual Image Describer
Powered by {results['model']}
"""
                    st.download_button(
                        "📥 Download TXT",
                        text_data,
                        f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                        "text/plain",
                        use_container_width=True,
                    )
                # Try another language
                st.markdown("---")
                st.markdown("#### 🔄 Try Another Language")
                if st.button("🔄 Generate in Different Language", use_container_width=True):
                    st.session_state.results = None
                    st.rerun()
        elif uploaded_file and not st.session_state.model:
            st.warning("⚠️ Please load the model first!")
            st.info("Click '🔄 Load Model' in the sidebar")
        elif not uploaded_file:
            st.info("👈 Upload an image to begin")
    # Footer
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
        <p>
            <strong>Powered by Latest Vision-Language Models</strong> •
            <a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf" target="_blank" style="color: #667eea;">LLaVA</a> •
            <a href="https://huggingface.co/Qwen/Qwen-VL-Chat" target="_blank" style="color: #667eea;">Qwen-VL</a>
        </p>
        <p style="font-size: 0.8em;">
            Native multilingual support • No translation APIs • Direct caption generation
        </p>
        <p style="font-size: 0.7em; color: #999; margin-top: 15px;">
            UCAS @2025 • Built with Streamlit & Transformers
        </p>
    </div>
    """, unsafe_allow_html=True)

if __name__ == "__main__":
    main()
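
# Assumed dependencies (sketch of a requirements.txt; versions unpinned and
# untested here). accelerate is required by transformers for device_map="auto".
#   streamlit
#   torch
#   transformers
#   pillow
#   accelerate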