# Spaces:

# amogneandualem
# /

# amogne-vlm-LLM

# Running

# File size: 15,971 Bytes

"""
🌍 Advanced Multilingual Image Describer
Using latest Vision-Language Models (VLMs) with native multilingual support
"""

import streamlit as st
import torch
from PIL import Image
import time
from datetime import datetime
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Set page config: browser tab title/icon, wide layout, sidebar open on load.
# NOTE(review): Streamlit expects set_page_config to run before other st.*
# calls — keep this statement ahead of the CSS injection below (confirm
# against the Streamlit version in use).
st.set_page_config(
    page_title="Multilingual Image Describer",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS injected as raw HTML. It defines the `header-title`,
# `model-badge` and `language-tag` classes that main() references in its
# own HTML snippets.
# NOTE(review): `.st-emotion-cache-16txtl3` looks like an auto-generated
# Streamlit class name; presumably it can change between Streamlit
# releases — verify that the selector still matches.
st.markdown("""
<style>
    .st-emotion-cache-16txtl3 {
        padding-top: 3rem;
    }
    .header-title {
        text-align: center;
        color: #2C3E50;
        margin-bottom: 1rem;
    }
    .model-badge {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 5px 15px;
        border-radius: 20px;
        font-size: 12px;
        display: inline-block;
        margin: 5px;
    }
    .language-tag {
        background: #E3F2FD;
        color: #1976D2;
        padding: 3px 10px;
        border-radius: 15px;
        font-size: 12px;
        margin: 2px;
        display: inline-block;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state.
#   model         -> (processor, model, model_id) tuple once a model is loaded
#   model_name    -> display name of the loaded model
#   results       -> dict describing the most recent generated caption
#   current_image -> the decoded upload; main() reads this key while deciding
#                    whether to offer generation, so it needs a default or the
#                    attribute access raises when no image was decoded yet.
if 'model' not in st.session_state:
    st.session_state.model = None
if 'model_name' not in st.session_state:
    st.session_state.model_name = None
if 'results' not in st.session_state:
    st.session_state.results = None
if 'current_image' not in st.session_state:
    st.session_state.current_image = None

# Selectable vision-language checkpoints, keyed by Hugging Face repo id.
# Each entry carries:
#   name             -> label shown in the model selectbox
#   multilingual     -> flag (True for every entry here)
#   languages        -> language codes offered in the language selectbox
#   prompt_templates -> instruction text per language code; a code may have a
#                       template without being listed in `languages`
#                       (e.g. "am"), in which case the UI never offers it.
MODEL_OPTIONS = {
    "llava-hf/llava-1.5-7b-hf": dict(
        name="LLaVA 1.5 (7B)",
        multilingual=True,
        languages=["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"],
        prompt_templates=dict(
            en="Describe this image in detail:",
            zh="详细描述这张图片：",
            es="Describe esta imagen en detalle:",
            fr="Décrivez cette image en détail :",
            de="Beschreiben Sie dieses Bild im Detail:",
            am="ይህንን ምስል በዝርዝር ይግለጹ፡",
        ),
    ),
    "Qwen/Qwen-VL-Chat": dict(
        name="Qwen-VL-Chat",
        multilingual=True,
        languages=["en", "zh", "ja", "ko", "fr", "de", "es", "ru"],
        prompt_templates=dict(
            en="Describe this image in English:",
            zh="用中文描述这张图片：",
            am="በአማርኛ ይህንን ምስል ይግለጹ፡",
        ),
    ),
    "vikhyatk/moondream2": dict(
        name="Moondream 2",
        multilingual=True,
        languages=["en", "es", "fr", "de"],
        prompt_templates=dict(
            en="Describe this image:",
            zh="描述这张图片：",
            am="ይህንን ምስል ይግለጹ፡",
        ),
    ),
}

# Display label (flag emoji + native name) for each language code.
LANGUAGE_NAMES = dict(
    en="🇺🇸 English",
    zh="🇨🇳 中文",
    am="🇪🇹 አማርኛ",
    es="🇪🇸 Español",
    fr="🇫🇷 Français",
    de="🇩🇪 Deutsch",
    ar="🇸🇦 العربية",
    hi="🇮🇳 हिन्दी",
    ru="🇷🇺 Русский",
    ja="🇯🇵 日本語",
    ko="🇰🇷 한국어",
    it="🇮🇹 Italiano",
    pt="🇵🇹 Português",
    tr="🇹🇷 Türkçe",
)

@st.cache_resource(show_spinner=True)
def _load_model_cached(model_id):
    """Load and cache the processor/model pair for ``model_id``.

    Any loading error propagates to the caller instead of being turned into
    a return value, so a failed attempt is never stored in the resource
    cache and a later call can retry.
    """
    from transformers import AutoProcessor, AutoModelForVision2Seq

    use_cuda = torch.cuda.is_available()

    # NOTE(review): checkpoints that ship custom modelling code may not be
    # loadable through AutoModelForVision2Seq — confirm for every entry in
    # MODEL_OPTIONS.
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        # Half precision only on GPU; CPU inference stays in float32.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device_map="auto" if use_cuda else None,
    )
    return processor, model, model_id


def load_model(model_id):
    """Load the selected vision-language model.

    Args:
        model_id: Hugging Face repo id of the checkpoint to load.

    Returns:
        ``(processor, model, model_id)`` on success, or
        ``(None, None, None)`` when loading fails. Failures are reported
        with ``st.error`` and are not cached, so calling again retries.
    """
    display_name = MODEL_OPTIONS.get(model_id, {}).get("name", model_id)
    st.info(f"🚀 Loading {display_name}...")

    try:
        return _load_model_cached(model_id)
    except Exception as e:  # UI boundary: report the failure, do not crash the app
        st.error(f"❌ Failed to load model: {str(e)[:200]}")
        return None, None, None

def _move_to_model(value, model):
    """Return ``value`` placed on the model's device, floats cast to its dtype."""
    if not torch.is_tensor(value):
        return value

    device = getattr(model, "device", None)
    if device is None or getattr(device, "type", "") == "meta":
        # No usable device reported by the model: fall back to the original
        # "GPU when present" placement.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    kwargs = {"device": device}
    dtype = getattr(model, "dtype", None)
    if dtype is not None and value.is_floating_point():
        # Pixel tensors come out of the processor as float32; a half-precision
        # model needs them in its own dtype. Integer token ids stay untouched.
        kwargs["dtype"] = dtype
    return value.to(**kwargs)


def generate_caption(image, model_tuple, language="en", model_id=None,
                     max_new_tokens=200, temperature=0.7):
    """Generate an image description with a loaded vision-language model.

    Args:
        image: Image handed to the processor as ``images=``.
        model_tuple: ``(processor, model, model_id)``. ``None``, an empty
            tuple, or a tuple containing ``None`` means "no model loaded".
        language: Language code used to choose the prompt template.
        model_id: Unused; kept so existing positional callers keep working.
            The id stored inside ``model_tuple`` decides prompt formatting.
        max_new_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature; a value <= 0 selects greedy
            decoding instead of sampling.

    Returns:
        The generated text. On problems a message string is returned rather
        than raising: ``"Model not loaded"`` or a string starting with
        ``"Error generating description:"``.
    """
    if not model_tuple or any(part is None for part in model_tuple):
        return "Model not loaded"

    processor, model, loaded_model_id = model_tuple

    try:
        # Pick the prompt for this model/language, falling back to English.
        model_info = MODEL_OPTIONS.get(
            loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"]
        )
        templates = model_info["prompt_templates"]
        prompt_template = templates.get(
            language, templates.get("en", "Describe this image:")
        )

        model_key = loaded_model_id.lower()
        is_llava = "llava" in model_key

        if is_llava:
            # LLaVA chat format
            prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
        elif "qwen" in model_key:
            if language in templates:
                instruction = templates[language]
            else:
                # LANGUAGE_NAMES values are "<flag> <name>"; keep only the
                # name so no emoji ends up inside the prompt.
                language_label = LANGUAGE_NAMES.get(language, "English").split(" ", 1)[-1]
                instruction = f"Describe this image in {language_label}:"
            # NOTE(review): the <img>…</img> wrapper is kept from the previous
            # implementation — confirm it against the Qwen-VL prompt format.
            prompt = f"<img>{instruction}</img>"
        else:
            # Default: the bare template
            prompt = prompt_template

        inputs = processor(text=prompt, images=image, return_tensors="pt")
        inputs = {key: _move_to_model(value, model) for key, value in inputs.items()}

        sampling = temperature is not None and temperature > 0
        generation_kwargs = {
            "max_new_tokens": int(max_new_tokens),
            "do_sample": sampling,
        }
        if sampling:
            generation_kwargs["temperature"] = float(temperature)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, **generation_kwargs)

        generated_text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0].strip()

        # The decoded LLaVA output still contains the prompt; keep the answer.
        if is_llava and "ASSISTANT:" in generated_text:
            generated_text = generated_text.split("ASSISTANT:")[-1].strip()

        return generated_text

    except Exception as e:  # UI boundary: surface the failure as text
        return f"Error generating description: {str(e)[:100]}"


def main():
    """Render the Streamlit app: header, sidebar controls, preview and results.

    Uses the ``st.session_state`` keys ``model``, ``model_name``, ``results``,
    ``current_image`` and ``failed_model_id``.
    """
    import html
    import json

    # Title
    st.markdown("<h1 class='header-title'>🌍 Advanced Multilingual Image Describer</h1>", unsafe_allow_html=True)

    # Model info
    st.markdown("""
    <div style="text-align: center; margin-bottom: 2rem;">
        <span class='model-badge'>Latest Vision-Language Models</span>
        <span class='model-badge'>Native Multilingual Support</span>
        <span class='model-badge'>No Translation APIs Needed</span>
    </div>
    """, unsafe_allow_html=True)

    # Sidebar
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")

        # Model selection
        st.markdown("#### 🤖 Select Model")
        model_choice = st.selectbox(
            "Choose a vision-language model:",
            options=list(MODEL_OPTIONS.keys()),
            format_func=lambda x: MODEL_OPTIONS[x]["name"],
            help="LLaVA supports most languages. Qwen-VL is faster."
        )

        # Show model info
        model_info = MODEL_OPTIONS[model_choice]
        st.caption(f"✅ Languages: {len(model_info['languages'])}")
        st.caption("📊 Parameters: 7B+")

        # Language selection
        st.markdown("#### 🌐 Select Language")
        available_langs = model_info["languages"]
        selected_lang = st.selectbox(
            "Output language:",
            options=available_langs,
            format_func=lambda x: LANGUAGE_NAMES.get(x, x),
            index=0
        )

        # Show language tags
        st.markdown("**Supported languages:**")
        lang_tags = " ".join([
            f'<span class="language-tag">{LANGUAGE_NAMES.get(lang, lang)}</span>'
            for lang in available_langs[:8]
        ])
        st.markdown(f'<div>{lang_tags}</div>', unsafe_allow_html=True)

        # Image upload
        st.markdown("---")
        st.markdown("### 📸 Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image file",
            type=["jpg", "jpeg", "png", "webp", "bmp"],
            label_visibility="collapsed"
        )

        # Advanced options (forwarded to generate_caption below)
        with st.expander("⚡ Advanced Settings"):
            max_tokens = st.slider("Max tokens", 50, 500, 200, 50)
            temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1)

        st.markdown("---")

        # Action buttons
        col1, col2 = st.columns(2)
        with col1:
            load_btn = st.button("🔄 Load Model", use_container_width=True)
        with col2:
            if st.button("🗑️ Clear", use_container_width=True):
                st.session_state.results = None
                st.rerun()

        # Load on request, or automatically once an image is uploaded and no
        # model is present. An automatic load that already failed for this
        # model is not repeated on every rerun; the button always retries.
        auto_load = (
            st.session_state.model is None
            and uploaded_file is not None
            and st.session_state.get("failed_model_id") != model_choice
        )
        if load_btn or auto_load:
            with st.spinner(f"Loading {model_info['name']}..."):
                processor, model, model_id = load_model(model_choice)
                if processor is not None and model is not None:
                    st.session_state.model = (processor, model, model_id)
                    st.session_state.model_name = model_info["name"]
                    st.session_state.failed_model_id = None
                    st.success(f"✅ {model_info['name']} loaded!")
                else:
                    st.session_state.failed_model_id = model_choice
                    st.error("❌ Failed to load model")

        # The language list follows the selectbox while generation uses the
        # model held in session state; tell the user when the two differ.
        loaded_state = st.session_state.model
        if loaded_state is not None and loaded_state[2] != model_choice:
            st.warning(
                f"⚠️ {st.session_state.model_name} is still loaded. "
                "Click '🔄 Load Model' to switch."
            )

        # Quick stats (describe the stored result, not the current selection)
        if st.session_state.results:
            st.markdown("---")
            st.markdown("### 📊 Quick Stats")
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Model", st.session_state.model_name or "N/A")
            with col2:
                st.metric("Language", st.session_state.results["language_name"])

    # Main content
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("### 📤 Input Image")

        if uploaded_file:
            try:
                image = Image.open(uploaded_file).convert("RGB")
                st.image(image, use_column_width=True)
                st.caption(f"📏 Size: {image.size[0]}×{image.size[1]} pixels")

                # Store for processing
                st.session_state.current_image = image

            except Exception as e:  # UI boundary: any decode/display failure
                # Drop any earlier image so a stale upload is never captioned.
                st.session_state.current_image = None
                st.error(f"Error loading image: {e}")
        else:
            st.info("👈 Upload an image to get started")
            # Show placeholder
            st.image(
                "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format",
                caption="Upload your own image for analysis",
                use_column_width=True
            )

    with col2:
        st.markdown("### 📋 Results")

        # Offer generation only when a model and a decoded image are present.
        current_image = st.session_state.get("current_image")
        if uploaded_file and st.session_state.model and current_image is not None:

            # Generate button
            if st.button("🚀 Generate Description", type="primary", use_container_width=True):
                with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."):
                    start_time = time.time()

                    # Generate caption
                    caption = generate_caption(
                        current_image,
                        st.session_state.model,
                        selected_lang,
                        model_choice,
                        max_new_tokens=max_tokens,
                        temperature=temperature,
                    )

                    processing_time = time.time() - start_time

                # generate_caption reports problems as message strings; show
                # those as errors instead of storing them as a result.
                failed = (
                    caption == "Model not loaded"
                    or caption.startswith("Error generating description:")
                )
                if failed:
                    st.session_state.results = None
                    st.error(f"❌ {caption}")
                else:
                    # Store results
                    st.session_state.results = {
                        "caption": caption,
                        "language": selected_lang,
                        "language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang),
                        "model": st.session_state.model_name,
                        "processing_time": f"{processing_time:.2f}s",
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }

        # Display results
        if st.session_state.results:
            results = st.session_state.results

            st.success(f"✅ Generated in {results['processing_time']}")

            # Display caption. The text comes from the model, so escape it
            # before placing it inside raw HTML.
            safe_caption = html.escape(results['caption'])
            st.markdown("#### Generated Description")
            st.markdown(f"""
            <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #667eea;">
                <p style="font-size: 16px; line-height: 1.6;">{safe_caption}</p>
            </div>
            """, unsafe_allow_html=True)

            # Metadata
            st.markdown("#### 📊 Analysis Details")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Model", results['model'])
            with col2:
                st.metric("Language", results['language_name'])
            with col3:
                st.metric("Time", results['processing_time'])

            # Export options
            st.markdown("---")
            st.markdown("#### 💾 Export Results")

            col1, col2 = st.columns(2)
            with col1:
                # JSON export
                json_data = json.dumps(results, indent=2, ensure_ascii=False)
                st.download_button(
                    "📥 Download JSON",
                    json_data,
                    f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                    "application/json",
                    use_container_width=True
                )

            with col2:
                # Text export
                text_data = f"""Image Description
Generated: {results['timestamp']}
Model: {results['model']}
Language: {results['language_name']}
Processing Time: {results['processing_time']}

DESCRIPTION:
{results['caption']}

---
Generated by Multilingual Image Describer
Powered by {results['model']}
"""
                st.download_button(
                    "📥 Download TXT",
                    text_data,
                    f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                    "text/plain",
                    use_container_width=True
                )

            # Try another language
            st.markdown("---")
            st.markdown("#### 🔄 Try Another Language")
            if st.button("🔄 Generate in Different Language", use_container_width=True):
                st.session_state.results = None
                st.rerun()

        elif uploaded_file and not st.session_state.model:
            st.warning("⚠️ Please load the model first!")
            st.info("Click '🔄 Load Model' in the sidebar")
        elif not uploaded_file:
            st.info("👈 Upload an image to begin")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
        <p>
            <strong>Powered by Latest Vision-Language Models</strong> •
            <a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf" target="_blank" style="color: #667eea;">LLaVA</a> •
            <a href="https://huggingface.co/Qwen/Qwen-VL-Chat" target="_blank" style="color: #667eea;">Qwen-VL</a>
        </p>
        <p style="font-size: 0.8em;">
            Native multilingual support • No translation APIs • Direct caption generation
        </p>
        <p style="font-size: 0.7em; color: #999; margin-top: 15px;">
            UCAS @2025 • Built with Streamlit & Transformers
        </p>
    </div>
    """, unsafe_allow_html=True)

# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()