amogneandualem committed
Commit 9646139 · verified · 1 Parent(s): 78def46

Update app.py

Files changed (1)
  1. app.py +155 -692
app.py CHANGED
@@ -1,23 +1,16 @@
  """
- Multilingual Image Describer Designed by bAmogne A. @UCAS-2025
  """

  import streamlit as st
  import torch
  from PIL import Image
- import cv2
- import numpy as np
- from transformers import BlipProcessor, BlipForConditionalGeneration
- from ultralytics import YOLO
- import json
  import time
  from datetime import datetime
  import pandas as pd
- import plotly.graph_objects as go
- import plotly.express as px
- import os
- import requests
- from io import BytesIO
  import warnings
  warnings.filterwarnings("ignore")

@@ -25,759 +18,229 @@ warnings.filterwarnings("ignore")
  st.set_page_config(
      page_title="Multilingual Image Describer",
      page_icon="🌍",
-     layout="wide",
-     initial_sidebar_state="expanded"
  )

- # Custom CSS
- st.markdown("""
- <style>
- .main {
-     padding: 0rem 1rem;
- }
-
- .header {
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     padding: 2rem;
-     border-radius: 10px;
-     color: white;
-     margin-bottom: 2rem;
- }
-
- .card {
-     background: white;
-     padding: 1.5rem;
-     border-radius: 10px;
-     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-     margin-bottom: 1rem;
-     border: 1px solid #e0e0e0;
- }
-
- .object-tag {
-     display: inline-block;
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     color: white;
-     padding: 5px 10px;
-     margin: 3px;
-     border-radius: 15px;
-     font-size: 12px;
-     font-weight: 500;
- }
-
- .stat-card {
-     background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-     padding: 15px;
-     border-radius: 10px;
-     text-align: center;
-     margin: 5px;
- }
-
- .stat-value {
-     font-size: 24px;
-     font-weight: bold;
-     color: #2B6CB0;
- }
-
- .stat-label {
-     font-size: 12px;
-     color: #718096;
- }
-
- .stProgress > div > div > div > div {
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
- }
-
- .stButton > button {
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     color: white;
-     border: none;
-     padding: 10px 20px;
-     border-radius: 5px;
-     font-weight: 500;
- }
-
- .stButton > button:hover {
-     transform: translateY(-2px);
-     box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
- }
- </style>
- """, unsafe_allow_html=True)
-
  # Initialize session state
  if 'model' not in st.session_state:
      st.session_state.model = None
- if 'detection_model' not in st.session_state:
-     st.session_state.detection_model = None
- if 'results' not in st.session_state:
-     st.session_state.results = None
- if 'image' not in st.session_state:
-     st.session_state.image = None
- if 'hf_token' not in st.session_state:
-     st.session_state.hf_token = None

- # Language configuration with real translation support
  LANGUAGES = {
-     "en": {"name": "English", "emoji": "🇺🇸", "code": "eng_Latn"},
-     "es": {"name": "Spanish", "emoji": "🇪🇸", "code": "spa_Latn"},
-     "fr": {"name": "French", "emoji": "🇫🇷", "code": "fra_Latn"},
-     "de": {"name": "German", "emoji": "🇩🇪", "code": "deu_Latn"},
-     "zh": {"name": "Chinese", "emoji": "🇨🇳", "code": "zho_Hans"},
-     "hi": {"name": "Hindi", "emoji": "🇮🇳", "code": "hin_Deva"},
-     "ar": {"name": "Arabic", "emoji": "🇸🇦", "code": "arb_Arab"},
-     "ru": {"name": "Russian", "emoji": "🇷🇺", "code": "rus_Cyrl"},
-     "ja": {"name": "Japanese", "emoji": "🇯🇵", "code": "jpn_Jpan"},
-     "ko": {"name": "Korean", "emoji": "🇰🇷", "code": "kor_Hang"},
-     "pt": {"name": "Portuguese", "emoji": "🇵🇹", "code": "por_Latn"},
-     "it": {"name": "Italian", "emoji": "🇮🇹", "code": "ita_Latn"},
-     "am": {"name": "Amharic", "emoji": "🇪🇹", "code": "amh_Ethi"},
-     "tr": {"name": "Turkish", "emoji": "🇹🇷", "code": "tur_Latn"},
  }

- # Hugging Face Translation Function
- def translate_with_huggingface(text, target_lang="en", api_token=None):
-     """
-     Translate text using Hugging Face Inference API with NLLB model
-     """
-     if target_lang == "en" or not text.strip():
-         return text
-
-     # Get target language code
-     lang_info = LANGUAGES.get(target_lang)
-     if not lang_info or 'code' not in lang_info:
-         return f"[{target_lang.upper()}] {text}"
-
-     target_code = lang_info['code']
-
-     # Hugging Face Inference API endpoint
-     API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"
-
-     # Prepare headers
-     headers = {}
-     if api_token:
-         headers["Authorization"] = f"Bearer {api_token}"
-
-     payload = {
-         "inputs": text,
-         "parameters": {
-             "src_lang": "eng_Latn",
-             "tgt_lang": target_code
-         }
-     }
-
      try:
-         # Make API request
-         response = requests.post(
-             API_URL,
-             headers=headers,
-             json=payload,
-             timeout=30
          )

-         if response.status_code == 200:
-             result = response.json()
-
-             # Parse response
-             if isinstance(result, list) and len(result) > 0:
-                 translated_text = result[0].get('translation_text', text)
-                 return translated_text
-             elif isinstance(result, dict) and 'translation_text' in result:
-                 return result['translation_text']
-             else:
-                 st.warning(f"Unexpected API response format. Using original text.")
-                 return text
-         elif response.status_code == 503:
-             # Model is loading
-             st.warning(f"Translation model is loading. Please try again in 30 seconds.")
-             return f"[{target_lang.upper()}] {text}"
-         else:
-             st.warning(f"Translation API error {response.status_code}. Using original text.")
-             return text

-     except requests.exceptions.Timeout:
-         st.warning("Translation request timed out. Using original text.")
-         return text
-     except Exception as e:
-         st.warning(f"Translation error: {str(e)[:100]}... Using original text.")
-         return text
-
- def translate_object_list(objects, target_lang="en", api_token=None):
-     """
-     Translate a list of object names
-     """
-     if target_lang == "en" or not objects:
-         return objects
-
-     translated_objects = []
-     for obj in objects:
-         translated_obj = translate_with_huggingface(obj, target_lang, api_token)
-         translated_objects.append(translated_obj)
-
-     return translated_objects
-
- @st.cache_resource(show_spinner="Loading BLIP model...")
- def load_caption_model():
-     """Load BLIP model for image captioning"""
-     try:
-         processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-         model = BlipForConditionalGeneration.from_pretrained(
-             "Salesforce/blip-image-captioning-base"
-         )
          return processor, model
      except Exception as e:
-         st.error(f"Error loading BLIP model: {e}")
          return None, None

- @st.cache_resource(show_spinner="Loading YOLO model...")
- def load_detection_model():
-     """Load YOLO model for object detection"""
-     try:
-         model = YOLO('yolov8n.pt')
-         return model
-     except Exception as e:
-         st.error(f"Error loading YOLO model: {e}")
-         return None
-
- def detect_objects(image, model, confidence_threshold=0.25):
-     """Detect objects in image using YOLO"""
-     if model is None:
-         return [], []

-     try:
-         # Run detection
-         results = model(image, conf=confidence_threshold, verbose=False)
-
-         detected_objects = []
-         detection_details = []
-
-         for result in results:
-             if result.boxes is not None:
-                 boxes = result.boxes.cpu().numpy()
-                 for box in boxes:
-                     x1, y1, x2, y2 = box.xyxy[0]
-                     conf = box.conf[0]
-                     cls = int(box.cls[0])
-
-                     obj_name = result.names[cls]
-
-                     detected_objects.append({
-                         "object": obj_name,
-                         "confidence": float(conf),
-                         "bbox": {
-                             "x1": float(x1),
-                             "y1": float(y1),
-                             "x2": float(x2),
-                             "y2": float(y2)
-                         }
-                     })
-
-         # Get unique object names for summary
-         unique_objects = list(set([obj["object"] for obj in detected_objects]))
-
-         return unique_objects, detected_objects
-     except Exception as e:
-         st.error(f"Detection error: {e}")
-         return [], []
-
- def generate_caption(image, model_tuple):
-     """Generate caption for image using BLIP"""
-     if model_tuple is None:
-         return "Models not loaded"

      try:
-         processor, model = model_tuple
-
-         # Use CPU for inference on Hugging Face
-         device = torch.device("cpu")
-         model = model.to(device)
-
-         inputs = processor(image, return_tensors="pt").to(device)

          with torch.no_grad():
-             out = model.generate(**inputs, max_length=50, num_beams=3)

-         caption = processor.decode(out[0], skip_special_tokens=True)
-         return caption
      except Exception as e:
-         return "An image containing various objects and scenes."
-
- def load_sample_image():
-     """Load a default sample image"""
-     try:
-         # Use a simple local sample or a reliable URL
-         sample_url = "https://images.unsplash.com/photo-1546182990-dffeafbe841d?w=800&auto=format&fit=crop"
-         response = requests.get(sample_url, timeout=10)
-         if response.status_code == 200:
-             return BytesIO(response.content)
-     except:
-         pass
-     return None

  def main():
-     # Header
-     st.markdown("""
-     <div class="header">
-         <h1 style="margin: 0; font-size: 2.5em;">🌍 Multilingual Image Describer</h1>
-         <p style="margin: 0; opacity: 0.9; font-size: 1.1em;">
-             Upload or capture an image to get object detection and descriptions
-         </p>
-         <p style="margin: 10px 0 0 0; font-size: 0.9em; opacity: 0.7;">
-             Powered by BLIP + YOLOv8 • UCAS @2025 • Real Translation Enabled
-         </p>
-     </div>
-     """, unsafe_allow_html=True)

-     # Initialize models
-     with st.spinner("🚀 Loading AI models..."):
          if st.session_state.model is None:
-             st.session_state.model = load_caption_model()
-         if st.session_state.detection_model is None:
-             st.session_state.detection_model = load_detection_model()

-     if st.session_state.model is None or st.session_state.detection_model is None:
-         st.error("Failed to load AI models. Please refresh the page.")
          return

      # Sidebar
      with st.sidebar:
-         st.markdown("### 📸 Image Input")
-
-         # Input method
-         input_method = st.radio(
-             "Select input method:",
-             ["Upload", "Camera", "Sample"],
-             horizontal=True,
-             label_visibility="collapsed"
          )

-         uploaded_image = None
-
-         if input_method == "Upload":
-             uploaded_image = st.file_uploader(
-                 "Choose an image file",
-                 type=["jpg", "jpeg", "png", "webp", "bmp"],
-                 label_visibility="collapsed"
-             )
-
-         elif input_method == "Camera":
-             camera_image = st.camera_input("Take a picture", label_visibility="collapsed")
-             if camera_image:
-                 uploaded_image = camera_image
-
-         else:  # Sample
-             if st.button("Load Sample Image", use_container_width=True):
-                 sample_bytes = load_sample_image()
-                 if sample_bytes:
-                     uploaded_image = sample_bytes
-                     st.success("Sample image loaded!")
-
          st.markdown("---")

-         # Language selection
-         st.markdown("### 🌐 Language Settings")
-
-         # API Token input (optional but recommended)
-         st.markdown("#### 🔑 Translation API")
-         api_token = st.text_input(
-             "Hugging Face Token (optional)",
-             type="password",
-             help="Get free token from huggingface.co/settings/tokens",
-             placeholder="hf_xxxxxxxxxxxxxxxxxxx"
-         )
-
-         if api_token:
-             st.session_state.hf_token = api_token
-             st.success("✅ API token saved for translation")
-         else:
-             st.info("ℹ️ Without token, translation may be limited")
-
-         st.markdown("#### 🗣️ Select Language")
-         language_options = [(code, f"{info['emoji']} {info['name']}")
-                             for code, info in LANGUAGES.items()]
-         selected_lang = st.selectbox(
-             "Choose language for description:",
-             options=[code for code, _ in language_options],
-             format_func=lambda x: f"{LANGUAGES[x]['emoji']} {LANGUAGES[x]['name']}",
-             index=0,
-             label_visibility="collapsed"
-         )
-
-         # Show language info
-         if selected_lang in LANGUAGES:
-             lang_info = LANGUAGES[selected_lang]
-             st.caption(f"Selected: {lang_info['name']} ({lang_info['code']})")

-         st.markdown("---")

-         # Settings
-         with st.expander("⚙️ Advanced Settings"):
-             confidence = st.slider(
-                 "Detection Confidence",
-                 min_value=0.1,
-                 max_value=0.9,
-                 value=0.25,
-                 step=0.05,
-                 help="Higher values = more confident detections"
-             )
-
-             enable_translation = st.checkbox(
-                 "Enable real-time translation",
-                 value=True,
-                 help="Uses Hugging Face NLLB model for translation"
-             )
-
-             translation_mode = st.radio(
-                 "Translation Mode",
-                 ["Full translation", "Keywords only", "Disabled"],
-                 index=0,
-                 help="Full: Translate everything, Keywords: Only translate object names"
-             )

          st.markdown("---")

-         # Process buttons
-         col1, col2 = st.columns(2)
-         with col1:
-             process_btn = st.button(
-                 "🚀 Analyze Image",
-                 type="primary",
-                 use_container_width=True,
-                 disabled=uploaded_image is None,
-                 help="Process image and generate description"
-             )
-         with col2:
-             if st.button("🗑️ Clear All", use_container_width=True):
-                 st.session_state.results = None
-                 st.session_state.image = None
-                 st.rerun()

-         # Quick stats if results exist
-         if st.session_state.results:
-             st.markdown("---")
-             st.markdown("### 📊 Quick Stats")
-             col1, col2, col3 = st.columns(3)
-             with col1:
-                 st.metric("Objects", st.session_state.results["detection_count"])
-             with col2:
-                 st.metric("Unique", st.session_state.results["unique_count"])
-             with col3:
-                 st.metric("Time", st.session_state.results["processing_time"])

      # Main content
      col1, col2 = st.columns([1, 1])

      with col1:
-         st.markdown("### 📤 Input Image")
-
-         if uploaded_image:
-             try:
-                 image = Image.open(uploaded_image).convert("RGB")
-                 st.session_state.image = image
-
-                 # Display image
-                 st.image(
-                     image,
-                     caption=f"Image • {image.size[0]}×{image.size[1]} pixels",
-                     use_column_width=True
-                 )
-
-                 # Show image info
-                 with st.expander("📋 Image Details"):
-                     st.write(f"**Format:** {image.format if hasattr(image, 'format') else 'Unknown'}")
-                     st.write(f"**Mode:** {image.mode}")
-                     st.write(f"**Size:** {image.size[0]} × {image.size[1]} pixels")
-
-             except Exception as e:
-                 st.error(f"Error loading image: {e}")
          else:
-             # Placeholder
-             st.info("👈 Please upload an image, use camera, or load sample")
-
-             # Show sample preview
              st.image(
-                 "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=800&auto=format&fit=crop",
-                 caption="Sample: Colorful gradient background",
                  use_column_width=True
              )
-
-             st.caption("Try uploading your own image for best results!")

      with col2:
-         st.markdown("### 📋 Analysis Results")

-         if process_btn and st.session_state.image:
-             with st.spinner("🔄 Processing image..."):
-                 # Create progress indicators
                  progress_bar = st.progress(0)
-                 status_text = st.empty()
-
-                 # Step 1: Generate caption
-                 status_text.text("📝 Generating image description...")
-                 progress_bar.progress(25)
-                 caption = generate_caption(st.session_state.image, st.session_state.model)
-
-                 # Step 2: Detect objects
-                 status_text.text("🔍 Detecting objects...")
-                 progress_bar.progress(50)
-                 unique_objects, detection_details = detect_objects(
-                     st.session_state.image,
-                     st.session_state.detection_model,
-                     confidence
-                 )
-
-                 # Step 3: Apply translation if enabled
-                 status_text.text("🌍 Translating content...")
-                 progress_bar.progress(75)
-
-                 translated_caption = caption
-                 translated_objects = unique_objects

-                 if enable_translation and selected_lang != "en":
-                     # Get API token
-                     api_token = st.session_state.hf_token

-                     # Translate based on mode
-                     if translation_mode == "Full translation":
-                         translated_caption = translate_with_huggingface(
-                             caption, selected_lang, api_token
-                         )
-                         translated_objects = translate_object_list(
-                             unique_objects, selected_lang, api_token
-                         )
-                     elif translation_mode == "Keywords only":
-                         translated_objects = translate_object_list(
-                             unique_objects, selected_lang, api_token
-                         )
-                         translated_caption = caption
-                     # else: "Disabled" - keep original
-                 else:
-                     # Add language prefix if translation is disabled
-                     if selected_lang != "en":
-                         translated_caption = f"[{selected_lang.upper()}] {caption}"
-
-                 # Step 4: Complete
-                 status_text.text("✅ Processing complete!")
-                 progress_bar.progress(100)
-                 time.sleep(0.5)
-
-                 processing_time = time.time() - st.session_state.get('process_start_time', time.time())
-
-                 # Prepare results
-                 results = {
-                     "original_caption": caption,
-                     "caption": translated_caption,
-                     "original_objects": unique_objects,
-                     "detected_objects": translated_objects,
-                     "detection_details": detection_details,
-                     "detection_count": len(detection_details),
-                     "unique_count": len(unique_objects),
-                     "language": selected_lang,
-                     "language_name": LANGUAGES[selected_lang]["name"],
-                     "translation_enabled": enable_translation,
-                     "translation_mode": translation_mode,
-                     "processing_time": f"{processing_time:.2f}s",
-                     "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                 }

-                 st.session_state.results = results
-                 st.session_state.process_start_time = None

-             # Clear progress indicators
-             progress_bar.empty()
-             status_text.empty()
-
-             # Display results in tabs
-             tab1, tab2, tab3 = st.tabs(["📝 Description", "🔍 Objects", "💾 Export"])
-
-             with tab1:
-                 st.markdown("#### Image Description")
-
-                 # Display caption
-                 st.markdown(f'<div class="card">{results["caption"]}</div>', unsafe_allow_html=True)
-
-                 # Show translation note if applicable
-                 if results["translation_enabled"] and selected_lang != "en":
-                     st.success(f"✅ Translated to {results['language_name']}")
-
-                 st.markdown("#### Analysis Summary")
-
-                 # Stats in columns
-                 cols = st.columns(4)
-                 with cols[0]:
-                     st.metric("Objects", results["detection_count"])
-                 with cols[1]:
-                     st.metric("Unique", results["unique_count"])
-                 with cols[2]:
-                     st.metric("Time", results["processing_time"])
-                 with cols[3]:
-                     st.metric("Language", results["language_name"])
-
-                 # Show original if translated
-                 if results["translation_enabled"] and selected_lang != "en" and results["original_caption"] != results["caption"]:
-                     with st.expander("🔤 View Original English"):
-                         st.write(results["original_caption"])

-             with tab2:
-                 if results["detected_objects"]:
-                     # Display object tags
-                     st.markdown("#### Detected Objects")
-
-                     tags_html = " ".join(
-                         [f'<span class="object-tag">{obj}</span>'
-                          for obj in results["detected_objects"][:20]]  # Limit to 20 for display
-                     )
-                     st.markdown(f'<div style="margin: 10px 0;">{tags_html}</div>', unsafe_allow_html=True)
-
-                     if len(results["detected_objects"]) > 20:
-                         st.caption(f"Showing 20 of {len(results['detected_objects'])} objects")
-
-                     # Detailed table
-                     if results["detection_details"]:
-                         st.markdown("#### Detailed Results")
-
-                         df = pd.DataFrame(results["detection_details"])
-                         st.dataframe(
-                             df[['object', 'confidence']].sort_values('confidence', ascending=False),
-                             use_container_width=True,
-                             height=300
-                         )
-
-                         # Confidence chart
-                         if len(df) > 0:
-                             fig = px.histogram(
-                                 df,
-                                 x='confidence',
-                                 nbins=10,
-                                 title='Confidence Distribution',
-                                 labels={'confidence': 'Confidence Score'},
-                                 color_discrete_sequence=['#667eea']
-                             )
-                             st.plotly_chart(fig, use_container_width=True)
-                 else:
-                     st.info("🔍 No objects detected in this image")
-                     st.markdown("Try adjusting the confidence threshold in settings")

-             with tab3:
-                 st.markdown("#### Export Results")
-
-                 # JSON export
-                 json_data = json.dumps(results, indent=2, ensure_ascii=False)
-
-                 col1, col2, col3 = st.columns(3)
-
-                 with col1:
-                     st.download_button(
-                         "📥 Download JSON",
-                         json_data,
-                         f"image_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
-                         "application/json",
-                         use_container_width=True,
-                         help="Download complete analysis as JSON"
-                     )
-
-                 with col2:
-                     # Text export
-                     text_data = f"""IMAGE ANALYSIS REPORT
- ================================
- Generated: {results['timestamp']}
- Language: {results['language_name']}
- Translation: {'Enabled' if results['translation_enabled'] else 'Disabled'}
-
- DESCRIPTION:
- {results['caption']}
-
- DETECTED OBJECTS:
- Total Objects: {results['detection_count']}
- Unique Objects: {results['unique_count']}
- Object List: {', '.join(results['detected_objects']) if results['detected_objects'] else 'None'}
-
- PROCESSING INFO:
- Processing Time: {results['processing_time']}
- Detection Confidence: {confidence}
-
- ---
- Multilingual Image Describer • UCAS @2025
- Powered by BLIP + YOLOv8
- """
-
-                     st.download_button(
-                         "📥 Download TXT",
-                         text_data,
-                         f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
-                         "text/plain",
-                         use_container_width=True,
-                         help="Download summary as text file"
-                     )
-
-                 with col3:
-                     if st.button("🔄 Analyze Another", use_container_width=True):
-                         st.session_state.results = None
-                         st.rerun()
-
-             # View JSON
-             with st.expander("📄 View Complete JSON Data"):
-                 st.code(json_data, language="json")
-
-         elif st.session_state.results:
-             # Show cached results
-             results = st.session_state.results

-             st.success(f"✅ Analysis complete ({results['processing_time']})")

-             # Quick summary
-             st.markdown(f"**Description:** {results['caption']}")

-             if results["detected_objects"]:
-                 st.markdown(f"**Objects detected:** {len(results['detected_objects'])} items")
-
-                 # Show first few objects
-                 preview_objects = results["detected_objects"][:5]
-                 preview_text = ", ".join(preview_objects)
-                 if len(results["detected_objects"]) > 5:
-                     preview_text += f" (+{len(results['detected_objects']) - 5} more)"
-
-                 st.markdown(f"**Sample:** {preview_text}")

-             # Action buttons
-             col1, col2 = st.columns(2)
-             with col1:
-                 if st.button("🔄 Analyze New Image", use_container_width=True):
-                     st.session_state.results = None
-                     st.session_state.image = None
-                     st.rerun()
-             with col2:
-                 if st.button("📊 View Full Report", use_container_width=True):
-                     # This will refresh and show tabs
-                     st.rerun()

-         elif process_btn and st.session_state.image is None:
-             st.warning("⚠️ Please upload an image first!")

      # Footer
      st.markdown("---")
-     st.markdown("""
-     <div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
-         <p>
-             🌍 <strong>Real Translation Enabled</strong> •
-             <a href="https://huggingface.co/docs/hub/spaces" target="_blank" style="color: #667eea; text-decoration: none;">
-                 Hugging Face Spaces
-             </a> •
-             <a href="https://huggingface.co/facebook/nllb-200-distilled-600M" target="_blank" style="color: #667eea; text-decoration: none;">
-                 NLLB Translation Model
-             </a>
-         </p>
-         <p style="font-size: 0.8em; margin-top: 10px;">
-             AI Models: BLIP (Image Captioning) • YOLOv8 (Object Detection) • NLLB (Translation)<br>
-             Supports: English, Spanish, French, German, Chinese, Hindi, Arabic, Russian, Japanese, Korean, Portuguese, Italian, Amharic, Turkish
-         </p>
-         <p style="font-size: 0.7em; margin-top: 15px; color: #999;">
-             Built with ❤️ by UCAS @2025 • For educational and research purposes
-         </p>
-     </div>
-     """, unsafe_allow_html=True)

  if __name__ == "__main__":
-     # Set process start time
-     if 'process_start_time' not in st.session_state:
-         st.session_state.process_start_time = time.time()
-
      main()
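
The translation path removed above boils down to a single HTTP POST against the Hugging Face Inference API. For reference, here it is reduced to a standalone sketch: it mirrors the endpoint and payload shape of the deleted translate_with_huggingface(), while the HF_TOKEN environment variable and the example sentence are assumptions of this sketch, not part of the app.

import os
import requests

# Same NLLB endpoint the removed code called.
API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"

# HF_TOKEN is a hypothetical env var for this sketch; the app read the token
# from a sidebar text input instead.
token = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {token}"} if token else {}

payload = {
    "inputs": "a dog sitting on a wooden bench",  # example caption to translate
    "parameters": {"src_lang": "eng_Latn", "tgt_lang": "spa_Latn"},
}
response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
response.raise_for_status()
# Successful responses look like [{"translation_text": "..."}].
print(response.json()[0]["translation_text"])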
 
app.py (updated version):

  """
+ 🌍 Multilingual Image Describer - SIMPLE
+ Using pre-trained multilingual model for direct captioning
  """

  import streamlit as st
  import torch
  from PIL import Image
+ import requests
+ from io import BytesIO
  import time
  from datetime import datetime
  import pandas as pd
  import warnings
  warnings.filterwarnings("ignore")

  st.set_page_config(
      page_title="Multilingual Image Describer",
      page_icon="🌍",
+     layout="wide"
  )

  # Initialize session state
  if 'model' not in st.session_state:
      st.session_state.model = None

+ # Language settings
  LANGUAGES = {
+     "en": {"name": "English", "prompt": "a photo of"},
+     "zh": {"name": "中文", "prompt": "一张照片"},
+     "am": {"name": "አማርኛ", "prompt": "የሚያሳይ ፎቶ"},
+     "es": {"name": "Español", "prompt": "una foto de"},
+     "fr": {"name": "Français", "prompt": "une photo de"},
+     "de": {"name": "Deutsch", "prompt": "ein Foto von"},
+     "ar": {"name": "العربية", "prompt": "صورة"},
+     "hi": {"name": "हिन्दी", "prompt": "की एक तस्वीर"},
+     "ru": {"name": "Русский", "prompt": "фотография"},
+     "ja": {"name": "日本語", "prompt": "の写真"}
  }

+ @st.cache_resource(show_spinner="Loading multilingual model...")
+ def load_model():
+     """Load multilingual image captioning model"""
      try:
+         from transformers import Blip2Processor, Blip2ForConditionalGeneration
+
+         # Using BLIP-2 with multilingual capabilities
+         processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+         model = Blip2ForConditionalGeneration.from_pretrained(
+             "Salesforce/blip2-opt-2.7b",
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
          )

+         # Move to GPU if available
+         if torch.cuda.is_available():
+             model = model.to("cuda")

          return processor, model
      except Exception as e:
+         st.error(f"Model loading error: {str(e)[:100]}")
          return None, None

+ def generate_multilingual_caption(image, language="en"):
+     """Generate caption directly in the target language"""
+     if st.session_state.model is None:
+         return "Model not loaded"

+     processor, model = st.session_state.model

      try:
+         # Prepare prompt based on language
+         prompt_text = LANGUAGES.get(language, LANGUAGES["en"])["prompt"]

+         # Process image
+         inputs = processor(image, text=prompt_text, return_tensors="pt")

+         # Move to device
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}

+         # Generate caption
          with torch.no_grad():
+             outputs = model.generate(**inputs, max_length=50)
+
+         # Decode the output
+         caption = processor.decode(outputs[0], skip_special_tokens=True)
+
+         # Remove the prompt from the beginning if present
+         if caption.lower().startswith(prompt_text.lower()):
+             caption = caption[len(prompt_text):].strip()

+         return caption.strip()
      except Exception as e:
+         return f"An image with various objects. (Error: {str(e)[:50]})"

  def main():
+     # Title
+     st.title("🌍 Multilingual Image Describer")
+     st.markdown("Upload an image to get descriptions in multiple languages")

+     # Load model
+     with st.spinner("Loading AI model..."):
          if st.session_state.model is None:
+             st.session_state.model = load_model()

+     if st.session_state.model is None:
+         st.error("Failed to load model. Please refresh the page.")
          return

      # Sidebar
      with st.sidebar:
+         st.header("📸 Upload Image")
+         uploaded_file = st.file_uploader(
+             "Choose an image",
+             type=["jpg", "jpeg", "png", "webp"],
+             help="Upload any image file"
          )

          st.markdown("---")
+         st.header("🌐 Select Languages")

+         # Language selection with checkboxes
+         selected_languages = []
+         cols = st.columns(2)

+         lang_list = list(LANGUAGES.items())
+         for i, (code, info) in enumerate(lang_list):
+             col_idx = i % 2
+             with cols[col_idx]:
+                 if st.checkbox(f"{info['name']}", key=f"lang_{code}", value=(code == "en")):
+                     selected_languages.append(code)

+         if not selected_languages:
+             selected_languages = ["en"]
+             st.info("English selected by default")

          st.markdown("---")

+         # Generate button
+         generate_btn = st.button(
+             "🚀 Generate Descriptions",
+             type="primary",
+             use_container_width=True,
+             disabled=uploaded_file is None
+         )

+         if st.button("🔄 Clear", use_container_width=True):
+             st.rerun()

      # Main content
      col1, col2 = st.columns([1, 1])

      with col1:
+         st.subheader("Input Image")
+         if uploaded_file:
+             image = Image.open(uploaded_file).convert("RGB")
+             st.image(image, use_column_width=True)
+             st.caption(f"Size: {image.size[0]}×{image.size[1]} pixels")
          else:
+             st.info("👈 Upload an image from the sidebar")
              st.image(
+                 "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=400&auto=format",
+                 caption="Sample background",
                  use_column_width=True
              )

      with col2:
+         st.subheader("Results")

+         if generate_btn and uploaded_file:
+             image = Image.open(uploaded_file).convert("RGB")
+
+             with st.spinner("Generating descriptions..."):
+                 results = {}
                  progress_bar = st.progress(0)

+                 for i, lang_code in enumerate(selected_languages):
+                     # Update progress
+                     progress = (i + 1) / len(selected_languages)
+                     progress_bar.progress(progress)

+                     # Generate caption for this language
+                     caption = generate_multilingual_caption(image, lang_code)
+                     lang_name = LANGUAGES[lang_code]["name"]
+
+                     results[lang_name] = caption

+                 progress_bar.empty()

+             # Display results
+             st.success(f"✅ Generated {len(results)} descriptions")

+             # Create results DataFrame
+             df_results = pd.DataFrame({
+                 "Language": list(results.keys()),
+                 "Description": list(results.values())
+             })

+             # Display table
+             st.dataframe(
+                 df_results,
+                 use_container_width=True,
+                 hide_index=True
+             )

+             # Show individual descriptions
+             st.markdown("### Descriptions by Language")

+             for lang_name, description in results.items():
+                 with st.expander(f"{lang_name}", expanded=(lang_name == "English")):
+                     st.markdown(f"**{description}**")

+             # Export option
+             st.markdown("---")
+             st.markdown("### 💾 Export Results")

+             # Create export text
+             export_text = f"""Multilingual Image Descriptions
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+ Image: {uploaded_file.name if uploaded_file else 'Unknown'}
+
+ """
+             for lang_name, description in results.items():
+                 export_text += f"\n{lang_name}:\n{description}\n"
+
+             # Download button
+             st.download_button(
+                 "📥 Download as TXT",
+                 export_text,
+                 f"descriptions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+                 "text/plain"
+             )

+         elif uploaded_file:
+             st.info("👈 Click 'Generate Descriptions' to analyze the image")

      # Footer
      st.markdown("---")
+     st.caption("""
+     **Powered by:** BLIP-2 Multilingual Model **UCAS @2025**
+     Model: Salesforce/blip2-opt-2.7b
+     """)

  if __name__ == "__main__":
      main()
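
The new pipeline's core is compact enough to sanity-check outside Streamlit. Below is a minimal sketch of that captioning step, assuming the same Salesforce/blip2-opt-2.7b checkpoint, a local image named test.jpg (a placeholder path, not part of the repo), and the "a photo of" English prompt from the LANGUAGES table.

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
).to(device)

# test.jpg is a placeholder path for this sketch.
image = Image.open("test.jpg").convert("RGB")
prompt = "a photo of"  # the app's English prompt

inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, dtype)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=40)

caption = processor.decode(out[0], skip_special_tokens=True)
# Strip the prompt prefix, as the app does.
if caption.lower().startswith(prompt):
    caption = caption[len(prompt):].strip()
print(caption)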