""" ๐ŸŒ Advanced Multilingual Image Describer Using latest Vision-Language Models (VLMs) with native multilingual support """ import streamlit as st import torch from PIL import Image import time from datetime import datetime import pandas as pd import warnings warnings.filterwarnings("ignore") # Set page config st.set_page_config( page_title="Multilingual Image Describer", page_icon="๐ŸŒ", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS st.markdown(""" """, unsafe_allow_html=True) # Initialize session state if 'model' not in st.session_state: st.session_state.model = None if 'model_name' not in st.session_state: st.session_state.model_name = None if 'results' not in st.session_state: st.session_state.results = None # Model options (latest vision-language models) MODEL_OPTIONS = { "llava-hf/llava-1.5-7b-hf": { "name": "LLaVA 1.5 (7B)", "multilingual": True, "languages": ["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"], "prompt_templates": { "en": "Describe this image in detail:", "zh": "่ฏฆ็ป†ๆ่ฟฐ่ฟ™ๅผ ๅ›พ็‰‡๏ผš", "es": "Describe esta imagen en detalle:", "fr": "Dรฉcrivez cette image en dรฉtail :", "de": "Beschreiben Sie dieses Bild im Detail:", "am": "แ‹ญแˆ…แŠ•แŠ• แˆแˆตแˆ แ‰ แ‹แˆญแ‹แˆญ แ‹ญแŒแˆˆแŒนแก" } }, "Qwen/Qwen-VL-Chat": { "name": "Qwen-VL-Chat", "multilingual": True, "languages": ["en", "zh", "ja", "ko", "fr", "de", "es", "ru"], "prompt_templates": { "en": "Describe this image in English:", "zh": "็”จไธญๆ–‡ๆ่ฟฐ่ฟ™ๅผ ๅ›พ็‰‡๏ผš", "am": "แ‰ แŠ แˆ›แˆญแŠ› แ‹ญแˆ…แŠ•แŠ• แˆแˆตแˆ แ‹ญแŒแˆˆแŒนแก" } }, "vikhyatk/moondream2": { "name": "Moondream 2", "multilingual": True, "languages": ["en", "es", "fr", "de"], "prompt_templates": { "en": "Describe this image:", "zh": "ๆ่ฟฐ่ฟ™ๅผ ๅ›พ็‰‡๏ผš", "am": "แ‹ญแˆ…แŠ•แŠ• แˆแˆตแˆ แ‹ญแŒแˆˆแŒนแก" } } } # Language mapping LANGUAGE_NAMES = { "en": "๐Ÿ‡บ๐Ÿ‡ธ English", "zh": "๐Ÿ‡จ๐Ÿ‡ณ ไธญๆ–‡", "am": "๐Ÿ‡ช๐Ÿ‡น แŠ แˆ›แˆญแŠ›", "es": "๐Ÿ‡ช๐Ÿ‡ธ Espaรฑol", "fr": "๐Ÿ‡ซ๐Ÿ‡ท Franรงais", "de": "๐Ÿ‡ฉ๐Ÿ‡ช Deutsch", "ar": "๐Ÿ‡ธ๐Ÿ‡ฆ ุงู„ุนุฑุจูŠุฉ", "hi": "๐Ÿ‡ฎ๐Ÿ‡ณ เคนเคฟเคจเฅเคฆเฅ€", "ru": "๐Ÿ‡ท๐Ÿ‡บ ะ ัƒััะบะธะน", "ja": "๐Ÿ‡ฏ๐Ÿ‡ต ๆ—ฅๆœฌ่ชž", "ko": "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ด", "it": "๐Ÿ‡ฎ๐Ÿ‡น Italiano", "pt": "๐Ÿ‡ต๐Ÿ‡น Portuguรชs", "tr": "๐Ÿ‡น๐Ÿ‡ท Tรผrkรงe" } @st.cache_resource(show_spinner=True) def load_model(model_id): """Load the selected vision-language model""" try: from transformers import AutoProcessor, AutoModelForVision2Seq st.info(f"๐Ÿš€ Loading {MODEL_OPTIONS[model_id]['name']}...") # Load processor and model processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForVision2Seq.from_pretrained( model_id, torch_dtype=torch.float16, device_map="auto" if torch.cuda.is_available() else None ) return processor, model, model_id except Exception as e: st.error(f"โŒ Failed to load model: {str(e)[:200]}") return None, None, None def generate_caption(image, model_tuple, language="en", model_id=None): """Generate caption using the vision-language model""" if None in model_tuple: return "Model not loaded" processor, model, loaded_model_id = model_tuple try: # Get prompt template based on model and language model_info = MODEL_OPTIONS.get(loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"]) prompt_template = model_info["prompt_templates"].get( language, model_info["prompt_templates"].get("en", "Describe this image:") ) # Prepare inputs if "llava" in loaded_model_id: # LLaVA format prompt = f"USER: \n{prompt_template}\nASSISTANT:" inputs = processor(text=prompt, images=image, return_tensors="pt") elif "qwen" in loaded_model_id.lower(): # Qwen-VL format prompt = f"Describe this image in {LANGUAGE_NAMES.get(language, 'English')}:" inputs = processor(text=prompt, images=image, return_tensors="pt") else: # Default format inputs = processor(text=prompt_template, images=image, return_tensors="pt") # Move to device if torch.cuda.is_available(): inputs = {k: v.to("cuda") for k, v in inputs.items()} # Generate with torch.no_grad(): generated_ids = model.generate( **inputs, max_new_tokens=200, temperature=0.7, do_sample=True ) # Decode generated_text = processor.batch_decode( generated_ids, skip_special_tokens=True )[0].strip() # Clean up response if "llava" in loaded_model_id: # Remove the prompt part if "ASSISTANT:" in generated_text: generated_text = generated_text.split("ASSISTANT:")[-1].strip() return generated_text except Exception as e: return f"Error generating description: {str(e)[:100]}" def main(): # Title st.markdown("

๐ŸŒ Advanced Multilingual Image Describer

", unsafe_allow_html=True) # Model info st.markdown("""
Latest Vision-Language Models Native Multilingual Support No Translation APIs Needed
""", unsafe_allow_html=True) # Sidebar with st.sidebar: st.markdown("### โš™๏ธ Configuration") # Model selection st.markdown("#### ๐Ÿค– Select Model") model_choice = st.selectbox( "Choose a vision-language model:", options=list(MODEL_OPTIONS.keys()), format_func=lambda x: MODEL_OPTIONS[x]["name"], help="LLaVA supports most languages. Qwen-VL is faster." ) # Show model info model_info = MODEL_OPTIONS[model_choice] st.caption(f"โœ… Languages: {len(model_info['languages'])}") st.caption(f"๐Ÿ“Š Parameters: 7B+") # Language selection st.markdown("#### ๐ŸŒ Select Language") available_langs = model_info["languages"] selected_lang = st.selectbox( "Output language:", options=available_langs, format_func=lambda x: LANGUAGE_NAMES.get(x, x), index=0 ) # Show language tags st.markdown("**Supported languages:**") lang_tags = " ".join([ f'{LANGUAGE_NAMES.get(lang, lang)}' for lang in available_langs[:8] ]) st.markdown(f'
{lang_tags}
', unsafe_allow_html=True) # Image upload st.markdown("---") st.markdown("### ๐Ÿ“ธ Upload Image") uploaded_file = st.file_uploader( "Choose an image file", type=["jpg", "jpeg", "png", "webp", "bmp"], label_visibility="collapsed" ) # Advanced options with st.expander("โšก Advanced Settings"): max_tokens = st.slider("Max tokens", 50, 500, 200, 50) temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1) st.markdown("---") # Action buttons col1, col2 = st.columns(2) with col1: load_btn = st.button("๐Ÿ”„ Load Model", use_container_width=True) with col2: if st.button("๐Ÿ—‘๏ธ Clear", use_container_width=True): st.session_state.results = None st.rerun() # Load model if requested if load_btn or (st.session_state.model is None and uploaded_file): with st.spinner(f"Loading {model_info['name']}..."): processor, model, model_id = load_model(model_choice) if processor and model: st.session_state.model = (processor, model, model_id) st.session_state.model_name = model_info["name"] st.success(f"โœ… {model_info['name']} loaded!") else: st.error("โŒ Failed to load model") # Quick stats if st.session_state.results: st.markdown("---") st.markdown("### ๐Ÿ“Š Quick Stats") col1, col2 = st.columns(2) with col1: st.metric("Model", st.session_state.model_name or "N/A") with col2: st.metric("Language", LANGUAGE_NAMES.get(selected_lang, selected_lang)) # Main content col1, col2 = st.columns([1, 1]) with col1: st.markdown("### ๐Ÿ“ค Input Image") if uploaded_file: try: image = Image.open(uploaded_file).convert("RGB") st.image(image, use_column_width=True) st.caption(f"๐Ÿ“ Size: {image.size[0]}ร—{image.size[1]} pixels") # Store for processing st.session_state.current_image = image except Exception as e: st.error(f"Error loading image: {e}") else: st.info("๐Ÿ‘ˆ Upload an image to get started") # Show placeholder st.image( "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format", caption="Upload your own image for analysis", use_column_width=True ) with col2: st.markdown("### ๐Ÿ“‹ Results") # Process image if model is loaded if (uploaded_file and st.session_state.model and st.session_state.current_image and 'current_image' in st.session_state): # Generate button if st.button("๐Ÿš€ Generate Description", type="primary", use_container_width=True): with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."): start_time = time.time() # Generate caption caption = generate_caption( st.session_state.current_image, st.session_state.model, selected_lang, model_choice ) processing_time = time.time() - start_time # Store results st.session_state.results = { "caption": caption, "language": selected_lang, "language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang), "model": st.session_state.model_name, "processing_time": f"{processing_time:.2f}s", "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") } # Display results if st.session_state.results: results = st.session_state.results st.success(f"โœ… Generated in {results['processing_time']}") # Display caption st.markdown("#### Generated Description") st.markdown(f"""

{results['caption']}

""", unsafe_allow_html=True) # Metadata st.markdown("#### ๐Ÿ“Š Analysis Details") col1, col2, col3 = st.columns(3) with col1: st.metric("Model", results['model']) with col2: st.metric("Language", results['language_name']) with col3: st.metric("Time", results['processing_time']) # Export options st.markdown("---") st.markdown("#### ๐Ÿ’พ Export Results") col1, col2 = st.columns(2) with col1: # JSON export import json json_data = json.dumps(results, indent=2, ensure_ascii=False) st.download_button( "๐Ÿ“ฅ Download JSON", json_data, f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", "application/json", use_container_width=True ) with col2: # Text export text_data = f"""Image Description Generated: {results['timestamp']} Model: {results['model']} Language: {results['language_name']} Processing Time: {results['processing_time']} DESCRIPTION: {results['caption']} --- Generated by Multilingual Image Describer Powered by {results['model']} """ st.download_button( "๐Ÿ“ฅ Download TXT", text_data, f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt", "text/plain", use_container_width=True ) # Try another language st.markdown("---") st.markdown("#### ๐Ÿ”„ Try Another Language") if st.button("๐Ÿ”„ Generate in Different Language", use_container_width=True): st.session_state.results = None st.rerun() elif uploaded_file and not st.session_state.model: st.warning("โš ๏ธ Please load the model first!") st.info("Click '๐Ÿ”„ Load Model' in the sidebar") elif not uploaded_file: st.info("๐Ÿ‘ˆ Upload an image to begin") # Footer st.markdown("---") st.markdown("""

Powered by Latest Vision-Language Models โ€ข LLaVA โ€ข Qwen-VL

Native multilingual support โ€ข No translation APIs โ€ข Direct caption generation

UCAS @2025 โ€ข Built with Streamlit & Transformers

""", unsafe_allow_html=True) if __name__ == "__main__": main()