File size: 15,971 Bytes
58fc959
3bb2461
 
58fc959
 
 
229cdc2
e136df8
229cdc2
 
58fc959
229cdc2
 
047f62b
58fc959
 
 
 
3bb2461
 
58fc959
 
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58fc959
 
 
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
047f62b
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c105bfb
 
3bb2461
 
 
e601754
3bb2461
9646139
3bb2461
 
 
 
 
 
 
 
e601754
 
3bb2461
 
c105bfb
3bb2461
 
c105bfb
3bb2461
 
 
9646139
58fc959
3bb2461
229cdc2
58fc959
3bb2461
 
 
 
 
 
c105bfb
3bb2461
 
 
 
 
 
 
 
 
 
 
 
c105bfb
9646139
 
 
229cdc2
3bb2461
229cdc2
3bb2461
 
 
 
 
 
9646139
3bb2461
 
 
 
 
9646139
3bb2461
 
 
 
 
 
 
229cdc2
58fc959
3bb2461
229cdc2
58fc959
9646139
3bb2461
c105bfb
3bb2461
 
 
 
 
 
 
 
58fc959
 
 
3bb2461
 
 
 
 
 
 
 
 
58fc959
 
3bb2461
 
 
 
58fc959
3bb2461
 
 
 
 
 
 
 
 
e601754
3bb2461
 
 
 
 
 
 
58fc959
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
58fc959
 
 
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58fc959
3bb2461
 
 
 
 
 
 
 
 
58fc959
c105bfb
58fc959
 
 
3bb2461
 
9646139
3bb2461
 
 
 
 
 
 
 
 
 
58fc959
3bb2461
 
c105bfb
3bb2461
 
c105bfb
 
58fc959
 
3bb2461
58fc959
3bb2461
 
 
 
9646139
3bb2461
 
 
 
e601754
3bb2461
 
 
 
 
 
 
9646139
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58fc959
3bb2461
229cdc2
3bb2461
 
 
 
 
 
 
e601754
3bb2461
 
 
 
 
 
 
 
 
58fc959
3bb2461
9646139
3bb2461
c105bfb
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9646139
3bb2461
 
 
9646139
3bb2461
 
 
 
 
 
 
9646139
3bb2461
 
 
 
 
 
e601754
3bb2461
 
 
 
 
229cdc2
58fc959
 
3bb2461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08f2d92
229cdc2
58fc959
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
"""
๐ŸŒ Advanced Multilingual Image Describer
Using latest Vision-Language Models (VLMs) with native multilingual support
"""

import streamlit as st
import torch
from PIL import Image
import time
from datetime import datetime
import pandas as pd
import warnings
# Install a blanket "ignore" filter for all Python warnings.
warnings.filterwarnings("ignore")

# Browser-tab metadata and overall layout of the Streamlit page.
_PAGE_SETTINGS = dict(
    page_title="Multilingual Image Describer",
    page_icon="๐ŸŒ",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet for the centered header, the gradient "model badge" pills and
# the per-language tags rendered later with raw HTML.
_CUSTOM_CSS = """
<style>
    .st-emotion-cache-16txtl3 {
        padding-top: 3rem;
    }
    .header-title {
        text-align: center;
        color: #2C3E50;
        margin-bottom: 1rem;
    }
    .model-badge {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 5px 15px;
        border-radius: 20px;
        font-size: 12px;
        display: inline-block;
        margin: 5px;
    }
    .language-tag {
        background: #E3F2FD;
        color: #1976D2;
        padding: 3px 10px;
        border-radius: 15px;
        font-size: 12px;
        margin: 2px;
        display: inline-block;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Initialize session state
# Every key that the app reads with attribute access gets a None default, so
# a read can never raise before the key's first write. ``current_image`` is
# included because the results column reads it while deciding whether to show
# the Generate button, which can happen before any upload has been decoded
# successfully.
for _state_key in ("model", "model_name", "results", "current_image"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None

# Model options (latest vision-language models)
#
# Registry of the selectable models, keyed by the model ID that is passed to
# ``from_pretrained`` in load_model(). Each entry holds:
#   "name"             - label shown in the sidebar model selector
#   "multilingual"     - boolean flag (not read by any code visible in this file)
#   "languages"        - language codes offered in the output-language selector
#   "prompt_templates" - instruction text per language code; generate_caption()
#                        looks up the selected language and falls back to "en"
#
# NOTE(review): "languages" and "prompt_templates" are not in sync. Several
# offered languages (e.g. "it", "ru", "ja", "ko", "ar" for LLaVA; "es", "fr",
# "de" for Moondream) have no template and therefore get the English prompt,
# while "am" (and "zh" for Moondream) has a template but is never offered.
# The Qwen branch of generate_caption() builds its own prompt and does not use
# these templates at all.
MODEL_OPTIONS = {
    "llava-hf/llava-1.5-7b-hf": {
        "name": "LLaVA 1.5 (7B)",
        "multilingual": True,
        "languages": ["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"],
        "prompt_templates": {
            "en": "Describe this image in detail:",
            "zh": "่ฏฆ็ป†ๆ่ฟฐ่ฟ™ๅผ ๅ›พ็‰‡๏ผš",
            "es": "Describe esta imagen en detalle:",
            "fr": "Dรฉcrivez cette image en dรฉtail :",
            "de": "Beschreiben Sie dieses Bild im Detail:",
            "am": "แ‹ญแˆ…แŠ•แŠ• แˆแˆตแˆ แ‰ แ‹แˆญแ‹แˆญ แ‹ญแŒแˆˆแŒนแก"
        }
    },
    "Qwen/Qwen-VL-Chat": {
        "name": "Qwen-VL-Chat",
        "multilingual": True,
        "languages": ["en", "zh", "ja", "ko", "fr", "de", "es", "ru"],
        "prompt_templates": {
            "en": "Describe this image in English:",
            "zh": "็”จไธญๆ–‡ๆ่ฟฐ่ฟ™ๅผ ๅ›พ็‰‡๏ผš",
            "am": "แ‰ แŠ แˆ›แˆญแŠ› แ‹ญแˆ…แŠ•แŠ• แˆแˆตแˆ แ‹ญแŒแˆˆแŒนแก"
        }
    },
    "vikhyatk/moondream2": {
        "name": "Moondream 2",
        "multilingual": True,
        "languages": ["en", "es", "fr", "de"],
        "prompt_templates": {
            "en": "Describe this image:",
            "zh": "ๆ่ฟฐ่ฟ™ๅผ ๅ›พ็‰‡๏ผš",
            "am": "แ‹ญแˆ…แŠ•แŠ• แˆแˆตแˆ แ‹ญแŒแˆˆแŒนแก"
        }
    }
}

# Language mapping
#
# Language code -> display label (flag followed by the language's own name).
# The UI looks labels up with ``LANGUAGE_NAMES.get(code, code)``, so a code
# missing from this table is shown as the bare code. The Qwen branch of
# generate_caption() also interpolates the label into its prompt.
# "am", "hi", "pt" and "tr" are listed here but are not offered by any entry
# in MODEL_OPTIONS["..."]["languages"].
LANGUAGE_NAMES = {
    "en": "๐Ÿ‡บ๐Ÿ‡ธ English",
    "zh": "๐Ÿ‡จ๐Ÿ‡ณ ไธญๆ–‡",
    "am": "๐Ÿ‡ช๐Ÿ‡น แŠ แˆ›แˆญแŠ›",
    "es": "๐Ÿ‡ช๐Ÿ‡ธ Espaรฑol",
    "fr": "๐Ÿ‡ซ๐Ÿ‡ท Franรงais",
    "de": "๐Ÿ‡ฉ๐Ÿ‡ช Deutsch",
    "ar": "๐Ÿ‡ธ๐Ÿ‡ฆ ุงู„ุนุฑุจูŠุฉ",
    "hi": "๐Ÿ‡ฎ๐Ÿ‡ณ เคนเคฟเคจเฅเคฆเฅ€",
    "ru": "๐Ÿ‡ท๐Ÿ‡บ ะ ัƒััะบะธะน",
    "ja": "๐Ÿ‡ฏ๐Ÿ‡ต ๆ—ฅๆœฌ่ชž",
    "ko": "๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ด",
    "it": "๐Ÿ‡ฎ๐Ÿ‡น Italiano",
    "pt": "๐Ÿ‡ต๐Ÿ‡น Portuguรชs",
    "tr": "๐Ÿ‡น๐Ÿ‡ท Tรผrkรงe"
}

@st.cache_resource(show_spinner=True)
def _load_model_cached(model_id):
    """Load the processor and model for ``model_id`` and cache the pair.

    Any loading error propagates out of this function on purpose: Streamlit
    only stores values that are returned, so a failed attempt is not cached
    and the next call retries instead of replaying the failure.
    """
    from transformers import AutoProcessor, AutoModelForVision2Seq

    # Fall back to the raw ID so an ID missing from MODEL_OPTIONS still gets
    # a readable status message instead of a KeyError.
    display_name = MODEL_OPTIONS.get(model_id, {}).get("name", model_id)
    st.info(f"๐Ÿš€ Loading {display_name}...")

    use_cuda = torch.cuda.is_available()

    # Load processor and model
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        # Half precision is only requested on GPU; on CPU it is slow and many
        # operators have limited or no float16 support, so use float32 there.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device_map="auto" if use_cuda else None,
    )

    return processor, model, model_id


def load_model(model_id):
    """Load the selected vision-language model.

    Args:
        model_id: Model identifier passed to ``from_pretrained``; normally a
            key of ``MODEL_OPTIONS``.

    Returns:
        ``(processor, model, model_id)`` on success. On any failure an error
        box is shown in the app and ``(None, None, None)`` is returned; the
        failure itself is not cached, so a later call tries again.
    """
    try:
        return _load_model_cached(model_id)
    except Exception as e:
        st.error(f"โŒ Failed to load model: {str(e)[:200]}")
        return None, None, None


# Keep the cache-clearing hook that the decorated function used to expose.
load_model.clear = _load_model_cached.clear

def generate_caption(image, model_tuple, language="en", model_id=None,
                     max_new_tokens=200, temperature=0.7):
    """Generate a description of ``image`` with a loaded vision-language model.

    Args:
        image: Image handed to the processor (the app passes an RGB PIL image).
        model_tuple: ``(processor, model, model_id)`` as returned by
            ``load_model``. ``None`` or a tuple containing ``None`` means no
            model is available.
        language: Language code used to pick the prompt; falls back to the
            model's English template when the code has no template.
        model_id: Accepted for backward compatibility and ignored; the ID
            stored in ``model_tuple`` is the one that is used.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of ``None`` or ``<= 0``
            switches to deterministic (non-sampling) decoding.

    Returns:
        The generated text, ``"Model not loaded"`` when no model is available,
        or a string starting with ``"Error generating description:"`` when
        anything raises during preprocessing, generation or decoding.
    """
    # Identity checks: ``None in model_tuple`` would raise on a None tuple and
    # compares every element with ``==``.
    if model_tuple is None or any(part is None for part in model_tuple):
        return "Model not loaded"

    processor, model, loaded_model_id = model_tuple

    try:
        # Get prompt template based on model and language
        model_info = MODEL_OPTIONS.get(loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"])
        templates = model_info["prompt_templates"]
        prompt_template = templates.get(
            language,
            templates.get("en", "Describe this image:")
        )

        # Match model families case-insensitively for both branches.
        model_key = loaded_model_id.lower()
        is_llava = "llava" in model_key

        # Prepare inputs
        if is_llava:
            # LLaVA format
            prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
        elif "qwen" in model_key:
            # Qwen-VL format
            # LANGUAGE_NAMES labels start with a flag followed by a space;
            # only the language name itself belongs in the prompt.
            language_label = LANGUAGE_NAMES.get(language, 'English').split(" ", 1)[-1]
            # NOTE(review): the <img>...</img> wrapper is kept from the
            # original; confirm it matches the prompt format Qwen-VL expects.
            prompt = f"<img>Describe this image in {language_label}:</img>"
        else:
            # Default format
            prompt = prompt_template
        inputs = processor(text=prompt, images=image, return_tensors="pt")

        # Move to the device the model actually lives on, and cast floating
        # point tensors (pixel values) to the model's dtype so float32 inputs
        # are not fed into float16 weights.
        target_device = getattr(model, "device", None)
        if target_device is None or getattr(target_device, "type", None) == "meta":
            target_device = "cuda" if torch.cuda.is_available() else "cpu"
        target_dtype = getattr(model, "dtype", None)

        prepared = {}
        for key, value in inputs.items():
            if torch.is_tensor(value):
                if target_dtype is not None and value.is_floating_point():
                    value = value.to(device=target_device, dtype=target_dtype)
                else:
                    value = value.to(target_device)
            prepared[key] = value

        # Generate
        generation_kwargs = {"max_new_tokens": max_new_tokens}
        if temperature is not None and temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature)
        else:
            generation_kwargs["do_sample"] = False

        with torch.no_grad():
            generated_ids = model.generate(**prepared, **generation_kwargs)

        # Decode
        generated_text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0].strip()

        # Clean up response: the decoded LLaVA output still contains the
        # prompt, so keep only what follows the last "ASSISTANT:" marker.
        if is_llava and "ASSISTANT:" in generated_text:
            generated_text = generated_text.split("ASSISTANT:")[-1].strip()

        return generated_text

    except Exception as e:
        return f"Error generating description: {str(e)[:100]}"

def _render_results(results):
    """Show a stored result: caption, metadata, downloads and the reset button.

    Args:
        results: Dict written by ``main`` with the keys ``caption``,
            ``language``, ``language_name``, ``model``, ``processing_time``
            and ``timestamp``.
    """
    import html
    import json

    st.success(f"โœ… Generated in {results['processing_time']}")

    # Display caption
    # The caption is model output (or an error message) and is placed inside
    # raw HTML, so it is escaped first. Newlines become <br> so that a blank
    # line in the caption cannot end the surrounding HTML block in Markdown.
    safe_caption = html.escape(str(results['caption'])).replace("\n", "<br>")
    st.markdown("#### Generated Description")
    st.markdown(f"""
    <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #667eea;">
        <p style="font-size: 16px; line-height: 1.6;">{safe_caption}</p>
    </div>
    """, unsafe_allow_html=True)

    # Metadata
    st.markdown("#### ๐Ÿ“Š Analysis Details")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Model", results['model'])
    with col2:
        st.metric("Language", results['language_name'])
    with col3:
        st.metric("Time", results['processing_time'])

    # Export options
    st.markdown("---")
    st.markdown("#### ๐Ÿ’พ Export Results")

    col1, col2 = st.columns(2)
    with col1:
        # JSON export (the raw, unescaped caption is exported)
        json_data = json.dumps(results, indent=2, ensure_ascii=False)
        st.download_button(
            "๐Ÿ“ฅ Download JSON",
            json_data,
            f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            "application/json",
            use_container_width=True
        )

    with col2:
        # Text export
        text_data = f"""Image Description
Generated: {results['timestamp']}
Model: {results['model']}
Language: {results['language_name']}
Processing Time: {results['processing_time']}

DESCRIPTION:
{results['caption']}

---
Generated by Multilingual Image Describer
Powered by {results['model']}
"""
        st.download_button(
            "๐Ÿ“ฅ Download TXT",
            text_data,
            f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
            "text/plain",
            use_container_width=True
        )

    # Try another language: clearing the stored result and rerunning brings
    # the page back to the state where only the Generate button is shown.
    st.markdown("---")
    st.markdown("#### ๐Ÿ”„ Try Another Language")
    if st.button("๐Ÿ”„ Generate in Different Language", use_container_width=True):
        st.session_state.results = None
        st.rerun()


def _render_footer():
    """Render the static credits footer at the bottom of the page."""
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
        <p>
            <strong>Powered by Latest Vision-Language Models</strong> โ€ข
            <a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf" target="_blank" style="color: #667eea;">LLaVA</a> โ€ข
            <a href="https://huggingface.co/Qwen/Qwen-VL-Chat" target="_blank" style="color: #667eea;">Qwen-VL</a>
        </p>
        <p style="font-size: 0.8em;">
            Native multilingual support โ€ข No translation APIs โ€ข Direct caption generation
        </p>
        <p style="font-size: 0.7em; color: #999; margin-top: 15px;">
            UCAS @2025 โ€ข Built with Streamlit & Transformers
        </p>
    </div>
    """, unsafe_allow_html=True)


def main():
    """Build the page: header, sidebar controls, image preview and results.

    The sidebar selects the model and output language, accepts the upload and
    triggers model loading. The left column previews the uploaded image; the
    right column runs caption generation and shows the stored result.
    """
    # Title
    st.markdown("<h1 class='header-title'>๐ŸŒ Advanced Multilingual Image Describer</h1>", unsafe_allow_html=True)

    # Model info
    st.markdown("""
    <div style="text-align: center; margin-bottom: 2rem;">
        <span class='model-badge'>Latest Vision-Language Models</span>
        <span class='model-badge'>Native Multilingual Support</span>
        <span class='model-badge'>No Translation APIs Needed</span>
    </div>
    """, unsafe_allow_html=True)

    # Sidebar
    with st.sidebar:
        st.markdown("### โš™๏ธ Configuration")

        # Model selection
        st.markdown("#### ๐Ÿค– Select Model")
        model_choice = st.selectbox(
            "Choose a vision-language model:",
            options=list(MODEL_OPTIONS.keys()),
            format_func=lambda x: MODEL_OPTIONS[x]["name"],
            help="LLaVA supports most languages. Qwen-VL is faster."
        )

        # Show model info
        model_info = MODEL_OPTIONS[model_choice]
        st.caption(f"โœ… Languages: {len(model_info['languages'])}")
        st.caption("๐Ÿ“Š Parameters: 7B+")

        # Language selection
        st.markdown("#### ๐ŸŒ Select Language")
        available_langs = model_info["languages"]
        selected_lang = st.selectbox(
            "Output language:",
            options=available_langs,
            format_func=lambda x: LANGUAGE_NAMES.get(x, x),
            index=0
        )

        # Show language tags (at most the first eight)
        st.markdown("**Supported languages:**")
        lang_tags = " ".join([
            f'<span class="language-tag">{LANGUAGE_NAMES.get(lang, lang)}</span>'
            for lang in available_langs[:8]
        ])
        st.markdown(f'<div>{lang_tags}</div>', unsafe_allow_html=True)

        # Image upload
        st.markdown("---")
        st.markdown("### ๐Ÿ“ธ Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image file",
            type=["jpg", "jpeg", "png", "webp", "bmp"],
            label_visibility="collapsed"
        )

        # Advanced options
        # NOTE(review): these two slider values are collected but are not
        # passed on to generate_caption(), so they currently have no effect.
        with st.expander("โšก Advanced Settings"):
            max_tokens = st.slider("Max tokens", 50, 500, 200, 50)
            temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1)

        st.markdown("---")

        # Action buttons
        col1, col2 = st.columns(2)
        with col1:
            load_btn = st.button("๐Ÿ”„ Load Model", use_container_width=True)
        with col2:
            if st.button("๐Ÿ—‘๏ธ Clear", use_container_width=True):
                st.session_state.results = None
                st.rerun()

        # Load model if requested, or automatically once an image has been
        # uploaded and no model is held in the session yet.
        if load_btn or (st.session_state.model is None and uploaded_file):
            with st.spinner(f"Loading {model_info['name']}..."):
                processor, model, model_id = load_model(model_choice)
                if processor is not None and model is not None:
                    st.session_state.model = (processor, model, model_id)
                    st.session_state.model_name = model_info["name"]
                    st.success(f"โœ… {model_info['name']} loaded!")
                else:
                    st.error("โŒ Failed to load model")

        # Quick stats
        if st.session_state.results:
            st.markdown("---")
            st.markdown("### ๐Ÿ“Š Quick Stats")
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Model", st.session_state.model_name or "N/A")
            with col2:
                st.metric("Language", LANGUAGE_NAMES.get(selected_lang, selected_lang))

    # Main content
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("### ๐Ÿ“ค Input Image")

        if uploaded_file:
            try:
                image = Image.open(uploaded_file).convert("RGB")
                st.image(image, use_column_width=True)
                st.caption(f"๐Ÿ“ Size: {image.size[0]}ร—{image.size[1]} pixels")

                # Store for processing
                st.session_state.current_image = image

            except Exception as e:
                # Drop any previously stored image so a file that failed to
                # decode is never captioned using the last good upload.
                st.session_state.current_image = None
                st.error(f"Error loading image: {e}")
        else:
            st.info("๐Ÿ‘ˆ Upload an image to get started")
            # Show placeholder
            st.image(
                "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format",
                caption="Upload your own image for analysis",
                use_column_width=True
            )

    with col2:
        st.markdown("### ๐Ÿ“‹ Results")

        # ``get`` returns None when the key was never written, so a failed
        # first upload cannot raise here.
        current_image = st.session_state.get("current_image")

        # Process image if model is loaded
        if uploaded_file and st.session_state.model and current_image is not None:

            # Generate button
            if st.button("๐Ÿš€ Generate Description", type="primary", use_container_width=True):
                with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."):
                    start_time = time.time()

                    # Generate caption
                    caption = generate_caption(
                        current_image,
                        st.session_state.model,
                        selected_lang,
                        model_choice
                    )

                    processing_time = time.time() - start_time

                    # Store results so they survive the reruns triggered by
                    # later widget interaction.
                    st.session_state.results = {
                        "caption": caption,
                        "language": selected_lang,
                        "language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang),
                        "model": st.session_state.model_name,
                        "processing_time": f"{processing_time:.2f}s",
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }

        # Display results
        if st.session_state.results:
            _render_results(st.session_state.results)
        elif uploaded_file and not st.session_state.model:
            st.warning("โš ๏ธ Please load the model first!")
            st.info("Click '๐Ÿ”„ Load Model' in the sidebar")
        elif not uploaded_file:
            st.info("๐Ÿ‘ˆ Upload an image to begin")

    # Footer
    _render_footer()

# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()