amogneandualem committed
Commit 9646139 · verified · 1 Parent(s): 78def46

Update app.py

Files changed (1)
  1. app.py +155 -692
app.py CHANGED
@@ -1,23 +1,16 @@
  """
- Multilingual Image Describer Designed by bAmogne A. @UCAS-2025
  """

  import streamlit as st
  import torch
  from PIL import Image
- import cv2
- import numpy as np
- from transformers import BlipProcessor, BlipForConditionalGeneration
- from ultralytics import YOLO
- import json
  import time
  from datetime import datetime
  import pandas as pd
- import plotly.graph_objects as go
- import plotly.express as px
- import os
- import requests
- from io import BytesIO
  import warnings
  warnings.filterwarnings("ignore")

@@ -25,759 +18,229 @@ warnings.filterwarnings("ignore")
  st.set_page_config(
      page_title="Multilingual Image Describer",
      page_icon="🌍",
-     layout="wide",
-     initial_sidebar_state="expanded"
  )

- # Custom CSS
- st.markdown("""
- <style>
- .main {
-     padding: 0rem 1rem;
- }
-
- .header {
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     padding: 2rem;
-     border-radius: 10px;
-     color: white;
-     margin-bottom: 2rem;
- }
-
- .card {
-     background: white;
-     padding: 1.5rem;
-     border-radius: 10px;
-     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-     margin-bottom: 1rem;
-     border: 1px solid #e0e0e0;
- }
-
- .object-tag {
-     display: inline-block;
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     color: white;
-     padding: 5px 10px;
-     margin: 3px;
-     border-radius: 15px;
-     font-size: 12px;
-     font-weight: 500;
- }
-
- .stat-card {
-     background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-     padding: 15px;
-     border-radius: 10px;
-     text-align: center;
-     margin: 5px;
- }
-
- .stat-value {
-     font-size: 24px;
-     font-weight: bold;
-     color: #2B6CB0;
- }
-
- .stat-label {
-     font-size: 12px;
-     color: #718096;
- }
-
- .stProgress > div > div > div > div {
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
- }
-
- .stButton > button {
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     color: white;
-     border: none;
-     padding: 10px 20px;
-     border-radius: 5px;
-     font-weight: 500;
- }
-
- .stButton > button:hover {
-     transform: translateY(-2px);
-     box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
- }
- </style>
- """, unsafe_allow_html=True)
-
  # Initialize session state
  if 'model' not in st.session_state:
      st.session_state.model = None
- if 'detection_model' not in st.session_state:
-     st.session_state.detection_model = None
- if 'results' not in st.session_state:
-     st.session_state.results = None
- if 'image' not in st.session_state:
-     st.session_state.image = None
- if 'hf_token' not in st.session_state:
-     st.session_state.hf_token = None

- # Language configuration with real translation support
  LANGUAGES = {
-     "en": {"name": "English", "emoji": "🇺🇸", "code": "eng_Latn"},
-     "es": {"name": "Spanish", "emoji": "🇪🇸", "code": "spa_Latn"},
-     "fr": {"name": "French", "emoji": "🇫🇷", "code": "fra_Latn"},
-     "de": {"name": "German", "emoji": "🇩🇪", "code": "deu_Latn"},
-     "zh": {"name": "Chinese", "emoji": "🇨🇳", "code": "zho_Hans"},
-     "hi": {"name": "Hindi", "emoji": "🇮🇳", "code": "hin_Deva"},
-     "ar": {"name": "Arabic", "emoji": "🇸🇦", "code": "arb_Arab"},
-     "ru": {"name": "Russian", "emoji": "🇷🇺", "code": "rus_Cyrl"},
-     "ja": {"name": "Japanese", "emoji": "🇯🇵", "code": "jpn_Jpan"},
-     "ko": {"name": "Korean", "emoji": "🇰🇷", "code": "kor_Hang"},
-     "pt": {"name": "Portuguese", "emoji": "🇵🇹", "code": "por_Latn"},
-     "it": {"name": "Italian", "emoji": "🇮🇹", "code": "ita_Latn"},
-     "am": {"name": "Amharic", "emoji": "🇪🇹", "code": "amh_Ethi"},
-     "tr": {"name": "Turkish", "emoji": "🇹🇷", "code": "tur_Latn"},
  }

- # Hugging Face Translation Function
- def translate_with_huggingface(text, target_lang="en", api_token=None):
-     """
-     Translate text using Hugging Face Inference API with NLLB model
-     """
-     if target_lang == "en" or not text.strip():
-         return text
-
-     # Get target language code
-     lang_info = LANGUAGES.get(target_lang)
-     if not lang_info or 'code' not in lang_info:
-         return f"[{target_lang.upper()}] {text}"
-
-     target_code = lang_info['code']
-
-     # Hugging Face Inference API endpoint
-     API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"
-
-     # Prepare headers
-     headers = {}
-     if api_token:
-         headers["Authorization"] = f"Bearer {api_token}"
-
-     payload = {
-         "inputs": text,
-         "parameters": {
-             "src_lang": "eng_Latn",
-             "tgt_lang": target_code
-         }
-     }
-
      try:
-         # Make API request
-         response = requests.post(
-             API_URL,
-             headers=headers,
-             json=payload,
-             timeout=30
          )

-         if response.status_code == 200:
-             result = response.json()
-
-             # Parse response
-             if isinstance(result, list) and len(result) > 0:
-                 translated_text = result[0].get('translation_text', text)
-                 return translated_text
-             elif isinstance(result, dict) and 'translation_text' in result:
-                 return result['translation_text']
-             else:
-                 st.warning(f"Unexpected API response format. Using original text.")
-                 return text
-         elif response.status_code == 503:
-             # Model is loading
-             st.warning(f"Translation model is loading. Please try again in 30 seconds.")
-             return f"[{target_lang.upper()}] {text}"
-         else:
-             st.warning(f"Translation API error {response.status_code}. Using original text.")
-             return text

-     except requests.exceptions.Timeout:
-         st.warning("Translation request timed out. Using original text.")
-         return text
-     except Exception as e:
-         st.warning(f"Translation error: {str(e)[:100]}... Using original text.")
-         return text
-
- def translate_object_list(objects, target_lang="en", api_token=None):
-     """
-     Translate a list of object names
-     """
-     if target_lang == "en" or not objects:
-         return objects
-
-     translated_objects = []
-     for obj in objects:
-         translated_obj = translate_with_huggingface(obj, target_lang, api_token)
-         translated_objects.append(translated_obj)
-
-     return translated_objects
-
- @st.cache_resource(show_spinner="Loading BLIP model...")
- def load_caption_model():
-     """Load BLIP model for image captioning"""
-     try:
-         processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-         model = BlipForConditionalGeneration.from_pretrained(
-             "Salesforce/blip-image-captioning-base"
-         )
          return processor, model
      except Exception as e:
-         st.error(f"Error loading BLIP model: {e}")
          return None, None

- @st.cache_resource(show_spinner="Loading YOLO model...")
- def load_detection_model():
-     """Load YOLO model for object detection"""
-     try:
-         model = YOLO('yolov8n.pt')
-         return model
-     except Exception as e:
-         st.error(f"Error loading YOLO model: {e}")
-         return None
-
- def detect_objects(image, model, confidence_threshold=0.25):
-     """Detect objects in image using YOLO"""
-     if model is None:
-         return [], []

-     try:
-         # Run detection
-         results = model(image, conf=confidence_threshold, verbose=False)
-
-         detected_objects = []
-         detection_details = []
-
-         for result in results:
-             if result.boxes is not None:
-                 boxes = result.boxes.cpu().numpy()
-                 for box in boxes:
-                     x1, y1, x2, y2 = box.xyxy[0]
-                     conf = box.conf[0]
-                     cls = int(box.cls[0])
-
-                     obj_name = result.names[cls]
-
-                     detected_objects.append({
-                         "object": obj_name,
-                         "confidence": float(conf),
-                         "bbox": {
-                             "x1": float(x1),
-                             "y1": float(y1),
-                             "x2": float(x2),
-                             "y2": float(y2)
-                         }
-                     })
-
-         # Get unique object names for summary
-         unique_objects = list(set([obj["object"] for obj in detected_objects]))
-
-         return unique_objects, detected_objects
-     except Exception as e:
-         st.error(f"Detection error: {e}")
-         return [], []
-
- def generate_caption(image, model_tuple):
-     """Generate caption for image using BLIP"""
-     if model_tuple is None:
-         return "Models not loaded"

      try:
-         processor, model = model_tuple
-
-         # Use CPU for inference on Hugging Face
-         device = torch.device("cpu")
-         model = model.to(device)
-
-         inputs = processor(image, return_tensors="pt").to(device)

          with torch.no_grad():
-             out = model.generate(**inputs, max_length=50, num_beams=3)

-         caption = processor.decode(out[0], skip_special_tokens=True)
-         return caption
      except Exception as e:
-         return "An image containing various objects and scenes."
-
- def load_sample_image():
-     """Load a default sample image"""
-     try:
-         # Use a simple local sample or a reliable URL
-         sample_url = "https://images.unsplash.com/photo-1546182990-dffeafbe841d?w=800&auto=format&fit=crop"
-         response = requests.get(sample_url, timeout=10)
-         if response.status_code == 200:
-             return BytesIO(response.content)
-     except:
-         pass
-     return None

  def main():
-     # Header
-     st.markdown("""
-     <div class="header">
-         <h1 style="margin: 0; font-size: 2.5em;">🌍 Multilingual Image Describer</h1>
-         <p style="margin: 0; opacity: 0.9; font-size: 1.1em;">
-             Upload or capture an image to get object detection and descriptions
-         </p>
-         <p style="margin: 10px 0 0 0; font-size: 0.9em; opacity: 0.7;">
-             Powered by BLIP + YOLOv8 • UCAS @2025 • Real Translation Enabled
-         </p>
-     </div>
-     """, unsafe_allow_html=True)

-     # Initialize models
-     with st.spinner("🚀 Loading AI models..."):
          if st.session_state.model is None:
-             st.session_state.model = load_caption_model()
-         if st.session_state.detection_model is None:
-             st.session_state.detection_model = load_detection_model()

-     if st.session_state.model is None or st.session_state.detection_model is None:
-         st.error("Failed to load AI models. Please refresh the page.")
          return

      # Sidebar
      with st.sidebar:
-         st.markdown("### 📸 Image Input")
-
-         # Input method
-         input_method = st.radio(
-             "Select input method:",
-             ["Upload", "Camera", "Sample"],
-             horizontal=True,
-             label_visibility="collapsed"
          )

-         uploaded_image = None
-
-         if input_method == "Upload":
-             uploaded_image = st.file_uploader(
-                 "Choose an image file",
-                 type=["jpg", "jpeg", "png", "webp", "bmp"],
-                 label_visibility="collapsed"
-             )
-
-         elif input_method == "Camera":
-             camera_image = st.camera_input("Take a picture", label_visibility="collapsed")
-             if camera_image:
-                 uploaded_image = camera_image
-
-         else:  # Sample
-             if st.button("Load Sample Image", use_container_width=True):
-                 sample_bytes = load_sample_image()
-                 if sample_bytes:
-                     uploaded_image = sample_bytes
-                     st.success("Sample image loaded!")
-
          st.markdown("---")

-         # Language selection
-         st.markdown("### 🌐 Language Settings")
-
-         # API Token input (optional but recommended)
-         st.markdown("#### 🔑 Translation API")
-         api_token = st.text_input(
-             "Hugging Face Token (optional)",
-             type="password",
-             help="Get free token from huggingface.co/settings/tokens",
-             placeholder="hf_xxxxxxxxxxxxxxxxxxx"
-         )
-
-         if api_token:
-             st.session_state.hf_token = api_token
-             st.success("✅ API token saved for translation")
-         else:
-             st.info("ℹ️ Without token, translation may be limited")
-
-         st.markdown("#### 🗣️ Select Language")
-         language_options = [(code, f"{info['emoji']} {info['name']}")
-                             for code, info in LANGUAGES.items()]
-         selected_lang = st.selectbox(
-             "Choose language for description:",
-             options=[code for code, _ in language_options],
-             format_func=lambda x: f"{LANGUAGES[x]['emoji']} {LANGUAGES[x]['name']}",
-             index=0,
-             label_visibility="collapsed"
-         )
-
-         # Show language info
-         if selected_lang in LANGUAGES:
-             lang_info = LANGUAGES[selected_lang]
-             st.caption(f"Selected: {lang_info['name']} ({lang_info['code']})")

-         st.markdown("---")

-         # Settings
-         with st.expander("⚙️ Advanced Settings"):
-             confidence = st.slider(
-                 "Detection Confidence",
-                 min_value=0.1,
-                 max_value=0.9,
-                 value=0.25,
-                 step=0.05,
-                 help="Higher values = more confident detections"
-             )
-
-             enable_translation = st.checkbox(
-                 "Enable real-time translation",
-                 value=True,
-                 help="Uses Hugging Face NLLB model for translation"
-             )
-
-             translation_mode = st.radio(
-                 "Translation Mode",
-                 ["Full translation", "Keywords only", "Disabled"],
-                 index=0,
-                 help="Full: Translate everything, Keywords: Only translate object names"
-             )

          st.markdown("---")

-         # Process buttons
-         col1, col2 = st.columns(2)
-         with col1:
-             process_btn = st.button(
-                 "🚀 Analyze Image",
-                 type="primary",
-                 use_container_width=True,
-                 disabled=uploaded_image is None,
-                 help="Process image and generate description"
-             )
-         with col2:
-             if st.button("🗑️ Clear All", use_container_width=True):
-                 st.session_state.results = None
-                 st.session_state.image = None
-                 st.rerun()

-         # Quick stats if results exist
-         if st.session_state.results:
-             st.markdown("---")
-             st.markdown("### 📊 Quick Stats")
-             col1, col2, col3 = st.columns(3)
-             with col1:
-                 st.metric("Objects", st.session_state.results["detection_count"])
-             with col2:
-                 st.metric("Unique", st.session_state.results["unique_count"])
-             with col3:
-                 st.metric("Time", st.session_state.results["processing_time"])

      # Main content
      col1, col2 = st.columns([1, 1])

      with col1:
-         st.markdown("### 📤 Input Image")
-
-         if uploaded_image:
-             try:
-                 image = Image.open(uploaded_image).convert("RGB")
-                 st.session_state.image = image
-
-                 # Display image
-                 st.image(
-                     image,
-                     caption=f"Image • {image.size[0]}×{image.size[1]} pixels",
-                     use_column_width=True
-                 )
-
-                 # Show image info
-                 with st.expander("📋 Image Details"):
-                     st.write(f"**Format:** {image.format if hasattr(image, 'format') else 'Unknown'}")
-                     st.write(f"**Mode:** {image.mode}")
-                     st.write(f"**Size:** {image.size[0]} × {image.size[1]} pixels")
-
-             except Exception as e:
-                 st.error(f"Error loading image: {e}")
          else:
-             # Placeholder
-             st.info("👈 Please upload an image, use camera, or load sample")
-
-             # Show sample preview
              st.image(
-                 "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=800&auto=format&fit=crop",
-                 caption="Sample: Colorful gradient background",
                  use_column_width=True
              )
-
-             st.caption("Try uploading your own image for best results!")

      with col2:
-         st.markdown("### 📋 Analysis Results")

-         if process_btn and st.session_state.image:
-             with st.spinner("🔄 Processing image..."):
-                 # Create progress indicators
                  progress_bar = st.progress(0)
-                 status_text = st.empty()
-
-                 # Step 1: Generate caption
-                 status_text.text("📝 Generating image description...")
-                 progress_bar.progress(25)
-                 caption = generate_caption(st.session_state.image, st.session_state.model)
-
-                 # Step 2: Detect objects
-                 status_text.text("🔍 Detecting objects...")
-                 progress_bar.progress(50)
-                 unique_objects, detection_details = detect_objects(
-                     st.session_state.image,
-                     st.session_state.detection_model,
-                     confidence
-                 )
-
-                 # Step 3: Apply translation if enabled
-                 status_text.text("🌍 Translating content...")
-                 progress_bar.progress(75)
-
-                 translated_caption = caption
-                 translated_objects = unique_objects

-                 if enable_translation and selected_lang != "en":
-                     # Get API token
-                     api_token = st.session_state.hf_token

-                     # Translate based on mode
-                     if translation_mode == "Full translation":
-                         translated_caption = translate_with_huggingface(
-                             caption, selected_lang, api_token
-                         )
-                         translated_objects = translate_object_list(
-                             unique_objects, selected_lang, api_token
-                         )
-                     elif translation_mode == "Keywords only":
-                         translated_objects = translate_object_list(
-                             unique_objects, selected_lang, api_token
-                         )
-                         translated_caption = caption
-                     # else: "Disabled" - keep original
-                 else:
-                     # Add language prefix if translation is disabled
-                     if selected_lang != "en":
-                         translated_caption = f"[{selected_lang.upper()}] {caption}"
-
-                 # Step 4: Complete
-                 status_text.text("✅ Processing complete!")
-                 progress_bar.progress(100)
-                 time.sleep(0.5)
-
-                 processing_time = time.time() - st.session_state.get('process_start_time', time.time())
-
-                 # Prepare results
-                 results = {
-                     "original_caption": caption,
-                     "caption": translated_caption,
-                     "original_objects": unique_objects,
-                     "detected_objects": translated_objects,
-                     "detection_details": detection_details,
-                     "detection_count": len(detection_details),
-                     "unique_count": len(unique_objects),
-                     "language": selected_lang,
-                     "language_name": LANGUAGES[selected_lang]["name"],
-                     "translation_enabled": enable_translation,
-                     "translation_mode": translation_mode,
-                     "processing_time": f"{processing_time:.2f}s",
-                     "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                 }

-                 st.session_state.results = results
-                 st.session_state.process_start_time = None

-             # Clear progress indicators
-             progress_bar.empty()
-             status_text.empty()
-
-             # Display results in tabs
-             tab1, tab2, tab3 = st.tabs(["📝 Description", "🔍 Objects", "💾 Export"])
-
-             with tab1:
-                 st.markdown("#### Image Description")
-
-                 # Display caption
-                 st.markdown(f'<div class="card">{results["caption"]}</div>', unsafe_allow_html=True)
-
-                 # Show translation note if applicable
-                 if results["translation_enabled"] and selected_lang != "en":
-                     st.success(f"✅ Translated to {results['language_name']}")
-
-                 st.markdown("#### Analysis Summary")
-
-                 # Stats in columns
-                 cols = st.columns(4)
-                 with cols[0]:
-                     st.metric("Objects", results["detection_count"])
-                 with cols[1]:
-                     st.metric("Unique", results["unique_count"])
-                 with cols[2]:
-                     st.metric("Time", results["processing_time"])
-                 with cols[3]:
-                     st.metric("Language", results["language_name"])
-
-                 # Show original if translated
-                 if results["translation_enabled"] and selected_lang != "en" and results["original_caption"] != results["caption"]:
-                     with st.expander("🔤 View Original English"):
-                         st.write(results["original_caption"])

-             with tab2:
-                 if results["detected_objects"]:
-                     # Display object tags
-                     st.markdown("#### Detected Objects")
-
-                     tags_html = " ".join(
-                         [f'<span class="object-tag">{obj}</span>'
-                          for obj in results["detected_objects"][:20]]  # Limit to 20 for display
-                     )
-                     st.markdown(f'<div style="margin: 10px 0;">{tags_html}</div>', unsafe_allow_html=True)
-
-                     if len(results["detected_objects"]) > 20:
-                         st.caption(f"Showing 20 of {len(results['detected_objects'])} objects")
-
-                     # Detailed table
-                     if results["detection_details"]:
-                         st.markdown("#### Detailed Results")
-
-                         df = pd.DataFrame(results["detection_details"])
-                         st.dataframe(
-                             df[['object', 'confidence']].sort_values('confidence', ascending=False),
-                             use_container_width=True,
-                             height=300
-                         )
-
-                         # Confidence chart
-                         if len(df) > 0:
-                             fig = px.histogram(
-                                 df,
-                                 x='confidence',
-                                 nbins=10,
-                                 title='Confidence Distribution',
-                                 labels={'confidence': 'Confidence Score'},
-                                 color_discrete_sequence=['#667eea']
-                             )
-                             st.plotly_chart(fig, use_container_width=True)
-                 else:
-                     st.info("🔍 No objects detected in this image")
-                     st.markdown("Try adjusting the confidence threshold in settings")

-             with tab3:
-                 st.markdown("#### Export Results")
-
-                 # JSON export
-                 json_data = json.dumps(results, indent=2, ensure_ascii=False)
-
-                 col1, col2, col3 = st.columns(3)
-
-                 with col1:
-                     st.download_button(
-                         "📥 Download JSON",
-                         json_data,
-                         f"image_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
-                         "application/json",
-                         use_container_width=True,
-                         help="Download complete analysis as JSON"
-                     )
-
-                 with col2:
-                     # Text export
-                     text_data = f"""IMAGE ANALYSIS REPORT
- ================================
- Generated: {results['timestamp']}
- Language: {results['language_name']}
- Translation: {'Enabled' if results['translation_enabled'] else 'Disabled'}
-
- DESCRIPTION:
- {results['caption']}
-
- DETECTED OBJECTS:
- Total Objects: {results['detection_count']}
- Unique Objects: {results['unique_count']}
- Object List: {', '.join(results['detected_objects']) if results['detected_objects'] else 'None'}
-
- PROCESSING INFO:
- Processing Time: {results['processing_time']}
- Detection Confidence: {confidence}
-
- ---
- Multilingual Image Describer • UCAS @2025
- Powered by BLIP + YOLOv8
- """
-
-                     st.download_button(
-                         "📥 Download TXT",
-                         text_data,
-                         f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
-                         "text/plain",
-                         use_container_width=True,
-                         help="Download summary as text file"
-                     )
-
-                 with col3:
-                     if st.button("🔄 Analyze Another", use_container_width=True):
-                         st.session_state.results = None
-                         st.rerun()
-
-             # View JSON
-             with st.expander("📄 View Complete JSON Data"):
-                 st.code(json_data, language="json")
-
-         elif st.session_state.results:
-             # Show cached results
-             results = st.session_state.results

-             st.success(f"✅ Analysis complete ({results['processing_time']})")

-             # Quick summary
-             st.markdown(f"**Description:** {results['caption']}")

-             if results["detected_objects"]:
-                 st.markdown(f"**Objects detected:** {len(results['detected_objects'])} items")
-
-                 # Show first few objects
-                 preview_objects = results["detected_objects"][:5]
-                 preview_text = ", ".join(preview_objects)
-                 if len(results["detected_objects"]) > 5:
-                     preview_text += f" (+{len(results['detected_objects']) - 5} more)"
-
-                 st.markdown(f"**Sample:** {preview_text}")

-             # Action buttons
-             col1, col2 = st.columns(2)
-             with col1:
-                 if st.button("🔄 Analyze New Image", use_container_width=True):
-                     st.session_state.results = None
-                     st.session_state.image = None
-                     st.rerun()
-             with col2:
-                 if st.button("📊 View Full Report", use_container_width=True):
-                     # This will refresh and show tabs
-                     st.rerun()

-         elif process_btn and st.session_state.image is None:
-             st.warning("⚠️ Please upload an image first!")

      # Footer
      st.markdown("---")
-     st.markdown("""
-     <div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
-         <p>
-             🌍 <strong>Real Translation Enabled</strong> •
-             <a href="https://huggingface.co/docs/hub/spaces" target="_blank" style="color: #667eea; text-decoration: none;">
-                 Hugging Face Spaces
-             </a> •
-             <a href="https://huggingface.co/facebook/nllb-200-distilled-600M" target="_blank" style="color: #667eea; text-decoration: none;">
-                 NLLB Translation Model
-             </a>
-         </p>
-         <p style="font-size: 0.8em; margin-top: 10px;">
-             AI Models: BLIP (Image Captioning) • YOLOv8 (Object Detection) • NLLB (Translation)<br>
-             Supports: English, Spanish, French, German, Chinese, Hindi, Arabic, Russian, Japanese, Korean, Portuguese, Italian, Amharic, Turkish
-         </p>
-         <p style="font-size: 0.7em; margin-top: 15px; color: #999;">
-             Built with ❤️ by UCAS @2025 • For educational and research purposes
-         </p>
-     </div>
-     """, unsafe_allow_html=True)

  if __name__ == "__main__":
-     # Set process start time
-     if 'process_start_time' not in st.session_state:
-         st.session_state.process_start_time = time.time()
-
      main()
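
The translation path removed above boils down to a single HTTP POST against the Hugging Face Inference API. For reference, here it is reduced to a standalone sketch: it mirrors the endpoint and payload shape of the deleted translate_with_huggingface(), while the HF_TOKEN environment variable and the example sentence are assumptions of this sketch, not part of the app.

import os
import requests

# Same NLLB endpoint the removed code called.
API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"

# HF_TOKEN is a hypothetical env var for this sketch; the app read the token
# from a sidebar text input instead.
token = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {token}"} if token else {}

payload = {
    "inputs": "a dog sitting on a wooden bench",  # example caption to translate
    "parameters": {"src_lang": "eng_Latn", "tgt_lang": "spa_Latn"},
}
response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
response.raise_for_status()
# Successful responses look like [{"translation_text": "..."}].
print(response.json()[0]["translation_text"])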
 
app.py (updated version):

  """
+ 🌍 Multilingual Image Describer - SIMPLE
+ Using pre-trained multilingual model for direct captioning
  """

  import streamlit as st
  import torch
  from PIL import Image
+ import requests
+ from io import BytesIO
  import time
  from datetime import datetime
  import pandas as pd
  import warnings
  warnings.filterwarnings("ignore")

  st.set_page_config(
      page_title="Multilingual Image Describer",
      page_icon="🌍",
+     layout="wide"
  )

  # Initialize session state
  if 'model' not in st.session_state:
      st.session_state.model = None

+ # Language settings
  LANGUAGES = {
+     "en": {"name": "English", "prompt": "a photo of"},
+     "zh": {"name": "中文", "prompt": "一张照片"},
+     "am": {"name": "አማርኛ", "prompt": "የሚያሳይ ፎቶ"},
+     "es": {"name": "Español", "prompt": "una foto de"},
+     "fr": {"name": "Français", "prompt": "une photo de"},
+     "de": {"name": "Deutsch", "prompt": "ein Foto von"},
+     "ar": {"name": "العربية", "prompt": "صورة"},
+     "hi": {"name": "हिन्दी", "prompt": "की एक तस्वीर"},
+     "ru": {"name": "Русский", "prompt": "фотография"},
+     "ja": {"name": "日本語", "prompt": "の写真"}
  }

+ @st.cache_resource(show_spinner="Loading multilingual model...")
+ def load_model():
+     """Load multilingual image captioning model"""
      try:
+         from transformers import Blip2Processor, Blip2ForConditionalGeneration
+
+         # Using BLIP-2 with multilingual capabilities
+         processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+         model = Blip2ForConditionalGeneration.from_pretrained(
+             "Salesforce/blip2-opt-2.7b",
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
          )

+         # Move to GPU if available
+         if torch.cuda.is_available():
+             model = model.to("cuda")

          return processor, model
      except Exception as e:
+         st.error(f"Model loading error: {str(e)[:100]}")
          return None, None

+ def generate_multilingual_caption(image, language="en"):
+     """Generate caption directly in the target language"""
+     if st.session_state.model is None:
+         return "Model not loaded"

+     processor, model = st.session_state.model

      try:
+         # Prepare prompt based on language
+         prompt_text = LANGUAGES.get(language, LANGUAGES["en"])["prompt"]

+         # Process image
+         inputs = processor(image, text=prompt_text, return_tensors="pt")

+         # Move to device
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}

+         # Generate caption
          with torch.no_grad():
+             outputs = model.generate(**inputs, max_length=50)
+
+         # Decode the output
+         caption = processor.decode(outputs[0], skip_special_tokens=True)
+
+         # Remove the prompt from the beginning if present
+         if caption.lower().startswith(prompt_text.lower()):
+             caption = caption[len(prompt_text):].strip()

+         return caption.strip()
      except Exception as e:
+         return f"An image with various objects. (Error: {str(e)[:50]})"

  def main():
+     # Title
+     st.title("🌍 Multilingual Image Describer")
+     st.markdown("Upload an image to get descriptions in multiple languages")

+     # Load model
+     with st.spinner("Loading AI model..."):
          if st.session_state.model is None:
+             st.session_state.model = load_model()

+     if st.session_state.model is None:
+         st.error("Failed to load model. Please refresh the page.")
          return

      # Sidebar
      with st.sidebar:
+         st.header("📸 Upload Image")
+         uploaded_file = st.file_uploader(
+             "Choose an image",
+             type=["jpg", "jpeg", "png", "webp"],
+             help="Upload any image file"
          )

          st.markdown("---")
+         st.header("🌐 Select Languages")

+         # Language selection with checkboxes
+         selected_languages = []
+         cols = st.columns(2)

+         lang_list = list(LANGUAGES.items())
+         for i, (code, info) in enumerate(lang_list):
+             col_idx = i % 2
+             with cols[col_idx]:
+                 if st.checkbox(f"{info['name']}", key=f"lang_{code}", value=(code == "en")):
+                     selected_languages.append(code)

+         if not selected_languages:
+             selected_languages = ["en"]
+             st.info("English selected by default")

          st.markdown("---")

+         # Generate button
+         generate_btn = st.button(
+             "🚀 Generate Descriptions",
+             type="primary",
+             use_container_width=True,
+             disabled=uploaded_file is None
+         )

+         if st.button("🔄 Clear", use_container_width=True):
+             st.rerun()

      # Main content
      col1, col2 = st.columns([1, 1])

      with col1:
+         st.subheader("Input Image")
+         if uploaded_file:
+             image = Image.open(uploaded_file).convert("RGB")
+             st.image(image, use_column_width=True)
+             st.caption(f"Size: {image.size[0]}×{image.size[1]} pixels")
          else:
+             st.info("👈 Upload an image from the sidebar")
              st.image(
+                 "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=400&auto=format",
+                 caption="Sample background",
                  use_column_width=True
              )

      with col2:
+         st.subheader("Results")

+         if generate_btn and uploaded_file:
+             image = Image.open(uploaded_file).convert("RGB")
+
+             with st.spinner("Generating descriptions..."):
+                 results = {}
                  progress_bar = st.progress(0)

+                 for i, lang_code in enumerate(selected_languages):
+                     # Update progress
+                     progress = (i + 1) / len(selected_languages)
+                     progress_bar.progress(progress)

+                     # Generate caption for this language
+                     caption = generate_multilingual_caption(image, lang_code)
+                     lang_name = LANGUAGES[lang_code]["name"]
+
+                     results[lang_name] = caption

+                 progress_bar.empty()

+             # Display results
+             st.success(f"✅ Generated {len(results)} descriptions")

+             # Create results DataFrame
+             df_results = pd.DataFrame({
+                 "Language": list(results.keys()),
+                 "Description": list(results.values())
+             })

+             # Display table
+             st.dataframe(
+                 df_results,
+                 use_container_width=True,
+                 hide_index=True
+             )

+             # Show individual descriptions
+             st.markdown("### Descriptions by Language")

+             for lang_name, description in results.items():
+                 with st.expander(f"{lang_name}", expanded=(lang_name == "English")):
+                     st.markdown(f"**{description}**")

+             # Export option
+             st.markdown("---")
+             st.markdown("### 💾 Export Results")

+             # Create export text
+             export_text = f"""Multilingual Image Descriptions
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+ Image: {uploaded_file.name if uploaded_file else 'Unknown'}
+
+ """
+             for lang_name, description in results.items():
+                 export_text += f"\n{lang_name}:\n{description}\n"
+
+             # Download button
+             st.download_button(
+                 "📥 Download as TXT",
+                 export_text,
+                 f"descriptions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+                 "text/plain"
+             )

+         elif uploaded_file:
+             st.info("👈 Click 'Generate Descriptions' to analyze the image")

      # Footer
      st.markdown("---")
+     st.caption("""
+     **Powered by:** BLIP-2 Multilingual Model **UCAS @2025**
+     Model: Salesforce/blip2-opt-2.7b
+     """)

  if __name__ == "__main__":
      main()
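
The new pipeline's core is compact enough to sanity-check outside Streamlit. Below is a minimal sketch of that captioning step, assuming the same Salesforce/blip2-opt-2.7b checkpoint, a local image named test.jpg (a placeholder path, not part of the repo), and the "a photo of" English prompt from the LANGUAGES table.

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
).to(device)

# test.jpg is a placeholder path for this sketch.
image = Image.open("test.jpg").convert("RGB")
prompt = "a photo of"  # the app's English prompt

inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, dtype)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=40)

caption = processor.decode(out[0], skip_special_tokens=True)
# Strip the prompt prefix, as the app does.
if caption.lower().startswith(prompt):
    caption = caption[len(prompt):].strip()
print(caption)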