"""
🌍 Advanced Multilingual Image Describer
Using latest Vision-Language Models (VLMs) with native multilingual support
"""
import streamlit as st
import torch
from PIL import Image
import time
from datetime import datetime
import json
import warnings
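# Silence third-party deprecation warnings so they don't clutter the app logs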
warnings.filterwarnings("ignore")
# Set page config
st.set_page_config(
page_title="Multilingual Image Describer",
page_icon="🌍",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS
st.markdown("""
<style>
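/* NOTE: st-emotion-cache-* class names are generated by Streamlit's theming
   engine and can change between Streamlit releases; re-check this selector
   after upgrading. */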
.st-emotion-cache-16txtl3 {
padding-top: 3rem;
}
.header-title {
text-align: center;
color: #2C3E50;
margin-bottom: 1rem;
}
.model-badge {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-size: 12px;
display: inline-block;
margin: 5px;
}
.language-tag {
background: #E3F2FD;
color: #1976D2;
padding: 3px 10px;
border-radius: 15px;
font-size: 12px;
margin: 2px;
display: inline-block;
}
</style>
""", unsafe_allow_html=True)
# Initialize session state
if 'model' not in st.session_state:
st.session_state.model = None
if 'model_name' not in st.session_state:
st.session_state.model_name = None
if 'results' not in st.session_state:
    st.session_state.results = None
if 'current_image' not in st.session_state:
    st.session_state.current_image = None
# Model options (latest vision-language models)
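# Each entry maps a Hugging Face model ID to display metadata, the languages
# offered in the sidebar selector, and per-language prompt templates (English
# is used as the fallback when a language has no template).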
MODEL_OPTIONS = {
"llava-hf/llava-1.5-7b-hf": {
"name": "LLaVA 1.5 (7B)",
"multilingual": True,
"languages": ["en", "zh", "es", "fr", "de", "it", "ru", "ja", "ko", "ar"],
"prompt_templates": {
"en": "Describe this image in detail:",
"zh": "详细描述这张图片:",
"es": "Describe esta imagen en detalle:",
"fr": "Décrivez cette image en détail :",
"de": "Beschreiben Sie dieses Bild im Detail:",
"am": "ይህንን ምስል በዝርዝር ይግለጹ፡"
}
},
"Qwen/Qwen-VL-Chat": {
"name": "Qwen-VL-Chat",
"multilingual": True,
"languages": ["en", "zh", "ja", "ko", "fr", "de", "es", "ru"],
"prompt_templates": {
"en": "Describe this image in English:",
"zh": "用中文描述这张图片:",
"am": "በአማርኛ ይህንን ምስል ይግለጹ፡"
}
},
"vikhyatk/moondream2": {
"name": "Moondream 2",
"multilingual": True,
"languages": ["en", "es", "fr", "de"],
"prompt_templates": {
"en": "Describe this image:",
"zh": "描述这张图片:",
"am": "ይህንን ምስል ይግለጹ፡"
}
}
}
# Language mapping
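# ISO 639-1 code → human-readable UI label (flag emoji are decorative only)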
LANGUAGE_NAMES = {
"en": "🇺🇸 English",
"zh": "🇨🇳 中文",
"am": "🇪🇹 አማርኛ",
"es": "🇪🇸 Español",
"fr": "🇫🇷 Français",
"de": "🇩🇪 Deutsch",
"ar": "🇸🇦 العربية",
"hi": "🇮🇳 हिन्दी",
"ru": "🇷🇺 Русский",
"ja": "🇯🇵 日本語",
"ko": "🇰🇷 한국어",
"it": "🇮🇹 Italiano",
"pt": "🇵🇹 Português",
"tr": "🇹🇷 Türkçe"
}
@st.cache_resource(show_spinner=True)
def load_model(model_id):
"""Load the selected vision-language model"""
try:
from transformers import AutoProcessor, AutoModelForVision2Seq
st.info(f"🚀 Loading {MODEL_OPTIONS[model_id]['name']}...")
        # Load processor and model; trust_remote_code is needed for models that
        # ship custom modeling code on the Hub (e.g. Qwen-VL-Chat, Moondream 2)
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForVision2Seq.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
return processor, model, model_id
except Exception as e:
st.error(f"❌ Failed to load model: {str(e)[:200]}")
return None, None, None
def generate_caption(image, model_tuple, language="en", max_new_tokens=200, temperature=0.7):
    """Generate a caption for the image in the requested language"""
    if model_tuple is None or None in model_tuple:
        return "Model not loaded"
processor, model, loaded_model_id = model_tuple
try:
# Get prompt template based on model and language
model_info = MODEL_OPTIONS.get(loaded_model_id, MODEL_OPTIONS["llava-hf/llava-1.5-7b-hf"])
prompt_template = model_info["prompt_templates"].get(
language,
model_info["prompt_templates"].get("en", "Describe this image:")
)
# Prepare inputs
if "llava" in loaded_model_id:
# LLaVA format
prompt = f"USER: <image>\n{prompt_template}\nASSISTANT:"
inputs = processor(text=prompt, images=image, return_tensors="pt")
elif "qwen" in loaded_model_id.lower():
# Qwen-VL format
prompt = f"<img>Describe this image in {LANGUAGE_NAMES.get(language, 'English')}:</img>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
else:
# Default format
inputs = processor(text=prompt_template, images=image, return_tensors="pt")
# Move to device
if torch.cuda.is_available():
inputs = {k: v.to("cuda") for k, v in inputs.items()}
        # Generate, honoring the caller's sampling settings
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True
            )
# Decode
generated_text = processor.batch_decode(
generated_ids,
skip_special_tokens=True
)[0].strip()
        # Clean up response: strip the echoed prompt where the model returns it
        if "llava" in loaded_model_id:
            if "ASSISTANT:" in generated_text:
                generated_text = generated_text.split("ASSISTANT:")[-1].strip()
        elif generated_text.startswith(prompt_template):
            generated_text = generated_text[len(prompt_template):].strip()
return generated_text
except Exception as e:
return f"Error generating description: {str(e)[:100]}"
def main():
# Title
st.markdown("<h1 class='header-title'>🌍 Advanced Multilingual Image Describer</h1>", unsafe_allow_html=True)
# Model info
st.markdown("""
<div style="text-align: center; margin-bottom: 2rem;">
<span class='model-badge'>Latest Vision-Language Models</span>
<span class='model-badge'>Native Multilingual Support</span>
<span class='model-badge'>No Translation APIs Needed</span>
</div>
""", unsafe_allow_html=True)
# Sidebar
with st.sidebar:
st.markdown("### ⚙️ Configuration")
# Model selection
st.markdown("#### 🤖 Select Model")
model_choice = st.selectbox(
"Choose a vision-language model:",
options=list(MODEL_OPTIONS.keys()),
format_func=lambda x: MODEL_OPTIONS[x]["name"],
help="LLaVA supports most languages. Qwen-VL is faster."
)
# Show model info
model_info = MODEL_OPTIONS[model_choice]
st.caption(f"✅ Languages: {len(model_info['languages'])}")
st.caption(f"📊 Parameters: 7B+")
# Language selection
st.markdown("#### 🌐 Select Language")
available_langs = model_info["languages"]
selected_lang = st.selectbox(
"Output language:",
options=available_langs,
format_func=lambda x: LANGUAGE_NAMES.get(x, x),
index=0
)
# Show language tags
st.markdown("**Supported languages:**")
lang_tags = " ".join([
f'<span class="language-tag">{LANGUAGE_NAMES.get(lang, lang)}</span>'
for lang in available_langs[:8]
])
st.markdown(f'<div>{lang_tags}</div>', unsafe_allow_html=True)
# Image upload
st.markdown("---")
st.markdown("### 📸 Upload Image")
uploaded_file = st.file_uploader(
"Choose an image file",
type=["jpg", "jpeg", "png", "webp", "bmp"],
label_visibility="collapsed"
)
# Advanced options
with st.expander("⚡ Advanced Settings"):
max_tokens = st.slider("Max tokens", 50, 500, 200, 50)
temperature = st.slider("Temperature", 0.1, 1.0, 0.7, 0.1)
st.markdown("---")
# Action buttons
col1, col2 = st.columns(2)
with col1:
load_btn = st.button("🔄 Load Model", use_container_width=True)
with col2:
if st.button("🗑️ Clear", use_container_width=True):
st.session_state.results = None
st.rerun()
# Load model if requested
if load_btn or (st.session_state.model is None and uploaded_file):
with st.spinner(f"Loading {model_info['name']}..."):
processor, model, model_id = load_model(model_choice)
if processor and model:
st.session_state.model = (processor, model, model_id)
st.session_state.model_name = model_info["name"]
st.success(f"✅ {model_info['name']} loaded!")
else:
st.error("❌ Failed to load model")
# Quick stats
if st.session_state.results:
st.markdown("---")
st.markdown("### 📊 Quick Stats")
col1, col2 = st.columns(2)
with col1:
st.metric("Model", st.session_state.model_name or "N/A")
with col2:
st.metric("Language", LANGUAGE_NAMES.get(selected_lang, selected_lang))
# Main content
col1, col2 = st.columns([1, 1])
with col1:
st.markdown("### 📤 Input Image")
if uploaded_file:
try:
image = Image.open(uploaded_file).convert("RGB")
                st.image(image, use_container_width=True)
st.caption(f"📏 Size: {image.size[0]}×{image.size[1]} pixels")
# Store for processing
st.session_state.current_image = image
except Exception as e:
st.error(f"Error loading image: {e}")
else:
st.info("👈 Upload an image to get started")
# Show placeholder
st.image(
"https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=600&auto=format",
caption="Upload your own image for analysis",
                use_container_width=True
)
with col2:
st.markdown("### 📋 Results")
        # Process the image once a model is loaded and an image has been stored
        if (uploaded_file and st.session_state.model and
                st.session_state.get("current_image") is not None):
# Generate button
if st.button("🚀 Generate Description", type="primary", use_container_width=True):
with st.spinner(f"Generating description in {LANGUAGE_NAMES.get(selected_lang, selected_lang)}..."):
start_time = time.time()
                    # Generate caption with the sidebar's advanced settings
                    caption = generate_caption(
                        st.session_state.current_image,
                        st.session_state.model,
                        selected_lang,
                        max_new_tokens=max_tokens,
                        temperature=temperature
                    )
processing_time = time.time() - start_time
# Store results
st.session_state.results = {
"caption": caption,
"language": selected_lang,
"language_name": LANGUAGE_NAMES.get(selected_lang, selected_lang),
"model": st.session_state.model_name,
"processing_time": f"{processing_time:.2f}s",
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}
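                    # Stored in session_state so the result survives the rerun
                    # Streamlit performs on the next widget interaction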
# Display results
if st.session_state.results:
results = st.session_state.results
st.success(f"✅ Generated in {results['processing_time']}")
# Display caption
st.markdown("#### Generated Description")
st.markdown(f"""
<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #667eea;">
<p style="font-size: 16px; line-height: 1.6;">{results['caption']}</p>
</div>
""", unsafe_allow_html=True)
# Metadata
st.markdown("#### 📊 Analysis Details")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Model", results['model'])
with col2:
st.metric("Language", results['language_name'])
with col3:
st.metric("Time", results['processing_time'])
# Export options
st.markdown("---")
st.markdown("#### 💾 Export Results")
col1, col2 = st.columns(2)
with col1:
                # JSON export (json is imported at the top of the file)
json_data = json.dumps(results, indent=2, ensure_ascii=False)
st.download_button(
"📥 Download JSON",
json_data,
f"image_description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
"application/json",
use_container_width=True
)
with col2:
# Text export
text_data = f"""Image Description
Generated: {results['timestamp']}
Model: {results['model']}
Language: {results['language_name']}
Processing Time: {results['processing_time']}
DESCRIPTION:
{results['caption']}
---
Generated by Multilingual Image Describer
Powered by {results['model']}
"""
st.download_button(
"📥 Download TXT",
text_data,
f"description_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
"text/plain",
use_container_width=True
)
# Try another language
st.markdown("---")
st.markdown("#### 🔄 Try Another Language")
if st.button("🔄 Generate in Different Language", use_container_width=True):
st.session_state.results = None
st.rerun()
elif uploaded_file and not st.session_state.model:
st.warning("⚠️ Please load the model first!")
st.info("Click '🔄 Load Model' in the sidebar")
elif not uploaded_file:
st.info("👈 Upload an image to begin")
# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: #666; font-size: 0.9em; padding: 20px;">
<p>
<strong>Powered by Latest Vision-Language Models</strong> •
<a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf" target="_blank" style="color: #667eea;">LLaVA</a> •
<a href="https://huggingface.co/Qwen/Qwen-VL-Chat" target="_blank" style="color: #667eea;">Qwen-VL</a>
</p>
<p style="font-size: 0.8em;">
Native multilingual support • No translation APIs • Direct caption generation
</p>
<p style="font-size: 0.7em; color: #999; margin-top: 15px;">
UCAS @2025 • Built with Streamlit & Transformers
</p>
</div>
""", unsafe_allow_html=True)
if __name__ == "__main__":
main()