""" 🌍 Advanced Multilingual Image Describer Using latest Vision-Language Models (VLMs) with native multilingual support """ import streamlit as st import torch from PIL import Image import time from datetime import datetime import pandas as pd import warnings warnings.filterwarnings("ignore") # Set page config st.set_page_config( page_title="Multilingual Image Describer", page_icon="🌍", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS st.markdown(""" """, unsafe_allow_html=True) # Initialize session state if 'model' not in st.session_state: st.session_state.model = None if 'model_name' not in st.session_state: st.session_state.model_name = None if 'results' not in st.session_state: st.session_state.results = None # Model options (latest vision-language models) MODEL_OPTIONS = { "llava-hf/llava-1.5-7b-hf": { "name": "LLaVA 1.5 (7B)", "multilingual": True, "languages": ["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"], "prompt_templates": { "en": "Describe this image in detail:", "zh": "详细描述这张图片：", "es": "Describe esta imagen en detalle:", "fr": "Décrivez cette image en détail :", "de": "Beschreiben Sie dieses Bild im Detail:", "am": "ይህንን ምስል በዝርዝር ይግለጹ፡" } }, "Qwen/Qwen-VL-Chat": { "name": "Qwen-VL-Chat", "multilingual": True, "languages": ["en", "zh", "ja", "ko", "fr", "de", "es", "ru"], "prompt_templates": { "en": "Describe this image in English:", "zh": "用中文描述这张图片：", "am": "በአማርኛ ይህንን ምስል ይግለጹ፡" } }, "vikhyatk/moondream2": { "name": "Moondream 2", "multilingual": True, "languages": ["en", "es", "fr", "de"], "prompt_templates": { "en": "Describe this image:", "zh": "描述这张图片：", "am": "ይህንን ምስል ይግለጹ፡" } } } # Language mapping LANGUAGE_NAMES = { "en": "🇺🇸 English", "zh": "🇨🇳 中文", "am": "🇪🇹 አማርኛ", "es": "🇪🇸 Español", "fr": "🇫🇷 Français", "de": "🇩🇪 Deutsch", "ar": "🇸🇦 العربية", "hi": "🇮🇳 हिन्दी", "ru": "🇷🇺 Русский", "ja": "🇯🇵 日本語", "ko": "🇰🇷 한국어", "it": "🇮🇹 Italiano", "pt": "🇵🇹 Português", "tr": "🇹🇷 Türkçe" } @st.cache_resource(show_spinner=True) def load_model(model_id): """Load the selected vision-language model""" try: from transformers import AutoProcessor, AutoModelForVision2Seq st.info(f"🚀 Loading {MODEL_OPTIONS[model_id]['name']}...") # Load processor and model processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForVision2Seq.from_pretrained( model_id, torch_dtype=torch.float16, device_map="auto" if torch.cuda.is_available() else None ) return processor, model, model_id except Exception as e: st.error(f"❌ Failed to load model: {str(e)[:200]}") return None, None, None def generate_caption(image, model_tuple, language="en", model_id=None): """Generate caption using the vision-language model""" if None in model_tuple: return "Model not loaded" processor, model, loaded_model_id = model_tuple try: # Get prompt template based on model and language model_info = MODEL_OPTIONS.get(loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"]) prompt_template = model_info["prompt_templates"].get( language, model_info["prompt_templates"].get("en", "Describe this image:") ) # Prepare inputs if "llava" in loaded_model_id: # LLaVA format prompt = f"USER: \n{prompt_template}\nASSISTANT:" inputs = processor(text=prompt, images=image, return_tensors="pt") elif "qwen" in loaded_model_id.lower(): # Qwen-VL format prompt = f"Describe this image in {LANGUAGE_NAMES.get(language, 'English')}:" inputs = processor(text=prompt, images=image, return_tensors="pt") else: # Default format inputs = processor(text=prompt_template, images=image, return_tensors="pt") # Move to device if torch.cuda.is_available(): inputs = {k: v.to("cuda") for k, v in inputs.items()} # Generate with torch.no_grad(): generated_ids = model.generate( **inputs, max_new_tokens=200, temperature=0.7, do_sample=True ) # Decode generated_text = processor.batch_decode( generated_ids, skip_special_tokens=True )[0].strip() # Clean up response if "llava" in loaded_model_id: # Remove the prompt part if "ASSISTANT:" in generated_text: generated_text = generated_text.split("ASSISTANT:")[-1].strip() return generated_text except Exception as e: return f"Error generating description: {str(e)[:100]}" def main(): # Title st.markdown("

🌍 Advanced Multilingual Image Describer

", unsafe_allow_html=True) # Model info st.markdown("""

Latest Vision-Language Models Native Multilingual Support No Translation APIs Needed

""", unsafe_allow_html=True) # Sidebar with st.sidebar: st.markdown("### ⚙️ Configuration") # Model selection st.markdown("#### 🤖 Select Model") model_choice = st.selectbox( "Choose a vision-language model:", options=list(MODEL_OPTIONS.keys()), format_func=lambda x: MODEL_OPTIONS[x]["name"], help="LLaVA supports most languages. Qwen-VL is faster." ) # Show model info model_info = MODEL_OPTIONS[model_choice] st.caption(f"✅ Languages: {len(model_info['languages'])}") st.caption(f"📊 Parameters: 7B+") # Language selection st.markdown("#### 🌐 Select Language") available_langs = model_info["languages"] selected_lang = st.selectbox( "Output language:", options=available_langs, format_func=lambda x: LANGUAGE_NAMES.get(x, x), index=0 ) # Show language tags st.markdown("**Supported languages:**") lang_tags = " ".join([ f'{LANGUAGE_NAMES.get(lang, lang)}' for lang in available_langs[:8] ]) st.markdown(f'

{lang_tags}

', unsafe_allow_html=True) # Image upload st.markdown("---") st.markdown("### 📸 Upload Image") uploaded_file = st.file_uploader( "Choose an image file", type=["jpg", "jpeg", "png", "webp", "bmp"], label_visibility="collapsed" ) # Advanced options with st.expander("⚡ Advanced Settings"): max_tokens = st.slider("Max tokens", 50, 500, 200, 50) temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1) st.markdown("---") # Action buttons col1, col2 = st.columns(2) with col1: load_btn = st.button("🔄 Load Model", use_container_width=True) with col2: if st.button("🗑️ Clear", use_container_width=True): st.session_state.results = None st.rerun() # Load model if requested if load_btn or (st.session_state.model is None and uploaded_file): with st.spinner(f"Loading {model_info['name']}..."): processor, model, model_id = load_model(model_choice) if processor and model: st.session_state.model = (processor, model, model_id) st.session_state.model_name = model_info["name"] st.success(f"✅ {model_info['name']} loaded!") else: st.error("❌ Failed to load model") # Quick stats if st.session_state.results: st.markdown("---") st.markdown("### 📊 Quick Stats") col1, col2 = st.columns(2) with col1: st.metric("Model", st.session_state.model_name or "N/A") with col2: st.metric("Language", LANGUAGE_NAMES.get(selected_lang, selected_lang)) # Main content col1, col2 = st.columns([1, 1]) with col1: st.markdown("### 📤 Input Image") if uploaded_file: try: image = Image.open(uploaded_file).convert("RGB") st.image(image, use_column_width=True) st.caption(f"📏 Size: {image.size[0]}×{image.size[1]} pixels") # Store for processing st.session_state.current_image = image except Exception as e: st.error(f"Error loading image: {e}") else: st.info("👈 Upload an image to get started") # Show placeholder st.image( "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format", caption="Upload your own image for analysis", use_column_width=True ) with col2: st.markdown("### 📋 Results") # Process image if model is loaded if (uploaded_file and st.session_state.model and st.session_state.current_image and 'current_image' in st.session_state): # Generate button if st.button("🚀 Generate Description", type="primary", use_container_width=True): with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."): start_time = time.time() # Generate caption caption = generate_caption( st.session_state.current_image, st.session_state.model, selected_lang, model_choice ) processing_time = time.time() - start_time # Store results st.session_state.results = { "caption": caption, "language": selected_lang, "language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang), "model": st.session_state.model_name, "processing_time": f"{processing_time:.2f}s", "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") } # Display results if st.session_state.results: results = st.session_state.results st.success(f"✅ Generated in {results['processing_time']}") # Display caption st.markdown("#### Generated Description") st.markdown(f"""

{results['caption']}

""", unsafe_allow_html=True) # Metadata st.markdown("#### 📊 Analysis Details") col1, col2, col3 = st.columns(3) with col1: st.metric("Model", results['model']) with col2: st.metric("Language", results['language_name']) with col3: st.metric("Time", results['processing_time']) # Export options st.markdown("---") st.markdown("#### 💾 Export Results") col1, col2 = st.columns(2) with col1: # JSON export import json json_data = json.dumps(results, indent=2, ensure_ascii=False) st.download_button( "📥 Download JSON", json_data, f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", "application/json", use_container_width=True ) with col2: # Text export text_data = f"""Image Description Generated: {results['timestamp']} Model: {results['model']} Language: {results['language_name']} Processing Time: {results['processing_time']} DESCRIPTION: {results['caption']} --- Generated by Multilingual Image Describer Powered by {results['model']} """ st.download_button( "📥 Download TXT", text_data, f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt", "text/plain", use_container_width=True ) # Try another language st.markdown("---") st.markdown("#### 🔄 Try Another Language") if st.button("🔄 Generate in Different Language", use_container_width=True): st.session_state.results = None st.rerun() elif uploaded_file and not st.session_state.model: st.warning("⚠️ Please load the model first!") st.info("Click '🔄 Load Model' in the sidebar") elif not uploaded_file: st.info("👈 Upload an image to begin") # Footer st.markdown("---") st.markdown("""

Powered by Latest Vision-Language Models • LLaVA • Qwen-VL

Native multilingual support • No translation APIs • Direct caption generation

UCAS @2025 • Built with Streamlit & Transformers

""", unsafe_allow_html=True) if __name__ == "__main__": main()