"""
🌍 Multilingual Image Describer - SIMPLE
Uses a pre-trained multilingual model for direct captioning
"""
import streamlit as st
import torch
from PIL import Image
from datetime import datetime
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# Set page config
st.set_page_config(
    page_title="Multilingual Image Describer",
    page_icon="🌍",
    layout="wide"
)

# Initialize session state
if 'model' not in st.session_state:
    st.session_state.model = None

# Language settings
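# Each entry maps a language code to a display name and a short prompt string; the
# prompt is passed to the captioning model as a text prefix and stripped from the output.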
LANGUAGES = {
    "en": {"name": "English", "prompt": "a photo of"},
    "zh": {"name": "中文", "prompt": "一张照片"},
    "am": {"name": "አማርኛ", "prompt": "የሚያሳይ ፎቶ"},
    "es": {"name": "Español", "prompt": "una foto de"},
    "fr": {"name": "Français", "prompt": "une photo de"},
    "de": {"name": "Deutsch", "prompt": "ein Foto von"},
    "ar": {"name": "العربية", "prompt": "صورة"},
    "hi": {"name": "हिन्दी", "prompt": "की एक तस्वीर"},
    "ru": {"name": "Русский", "prompt": "фотография"},
    "ja": {"name": "日本語", "prompt": "の写真"}
}
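
# st.cache_resource keeps the returned processor/model pair cached across Streamlit
# reruns, so the model weights are downloaded and initialized only once per process.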
@st.cache_resource(show_spinner="Loading multilingual model...")
def load_model():
    """Load multilingual image captioning model"""
    try:
        from transformers import Blip2Processor, Blip2ForConditionalGeneration

        # Using BLIP-2 with multilingual capabilities
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        # Move to GPU if available
        if torch.cuda.is_available():
            model = model.to("cuda")
        return processor, model
    except Exception as e:
        st.error(f"Model loading error: {str(e)[:100]}")
        # Return None (not a tuple) so the `is None` checks downstream detect the failure
        return None


def generate_multilingual_caption(image, language="en"):
    """Generate caption directly in the target language"""
    if st.session_state.model is None:
        return "Model not loaded"
    processor, model = st.session_state.model
    try:
        # Prepare prompt based on language
        prompt_text = LANGUAGES.get(language, LANGUAGES["en"])["prompt"]
        # Process image
        inputs = processor(image, text=prompt_text, return_tensors="pt")
        # Move to device, casting floating-point tensors (pixel_values) to match the
        # fp16 model weights on GPU; integer tensors (input_ids) keep their dtype
        if torch.cuda.is_available():
            inputs = {
                k: v.to("cuda", torch.float16) if v.dtype == torch.float32 else v.to("cuda")
                for k, v in inputs.items()
            }
        # Generate caption (cap the number of newly generated tokens, not total length)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=50)
        # Decode the output
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        # Remove the prompt from the beginning if present
        if caption.lower().startswith(prompt_text.lower()):
            caption = caption[len(prompt_text):].strip()
        return caption.strip()
    except Exception as e:
        return f"An image with various objects. (Error: {str(e)[:50]})"


def main():
    # Title
    st.title("🌍 Multilingual Image Describer")
    st.markdown("Upload an image to get descriptions in multiple languages")

    # Load model
    with st.spinner("Loading AI model..."):
        if st.session_state.model is None:
            st.session_state.model = load_model()
    if st.session_state.model is None:
        st.error("Failed to load model. Please refresh the page.")
        return
    # Sidebar
    with st.sidebar:
        st.header("📸 Upload Image")
        uploaded_file = st.file_uploader(
            "Choose an image",
            type=["jpg", "jpeg", "png", "webp"],
            help="Upload any image file"
        )
        st.markdown("---")
        st.header("🌐 Select Languages")

        # Language selection with checkboxes
        selected_languages = []
        cols = st.columns(2)
        lang_list = list(LANGUAGES.items())
        for i, (code, info) in enumerate(lang_list):
            col_idx = i % 2
            with cols[col_idx]:
                if st.checkbox(f"{info['name']}", key=f"lang_{code}", value=(code == "en")):
                    selected_languages.append(code)
        if not selected_languages:
            selected_languages = ["en"]
            st.info("English selected by default")

        st.markdown("---")
        # Generate button
        generate_btn = st.button(
            "🚀 Generate Descriptions",
            type="primary",
            use_container_width=True,
            disabled=uploaded_file is None
        )
        if st.button("🔄 Clear", use_container_width=True):
            st.rerun()
    # Main content
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Input Image")
        if uploaded_file:
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, use_column_width=True)
            st.caption(f"Size: {image.size[0]}×{image.size[1]} pixels")
        else:
            st.info("👈 Upload an image from the sidebar")
            st.image(
                "https://images.unsplash.com/photo-1579546929662-711aa81148cf?w=400&auto=format",
                caption="Sample background",
                use_column_width=True
            )
    with col2:
        st.subheader("Results")
        if generate_btn and uploaded_file:
            image = Image.open(uploaded_file).convert("RGB")
            with st.spinner("Generating descriptions..."):
                results = {}
                progress_bar = st.progress(0)
                for i, lang_code in enumerate(selected_languages):
                    # Update progress
                    progress = (i + 1) / len(selected_languages)
                    progress_bar.progress(progress)
                    # Generate caption for this language
                    caption = generate_multilingual_caption(image, lang_code)
                    lang_name = LANGUAGES[lang_code]["name"]
                    results[lang_name] = caption
                progress_bar.empty()

            # Display results
            st.success(f"✅ Generated {len(results)} descriptions")

            # Create results DataFrame
            df_results = pd.DataFrame({
                "Language": list(results.keys()),
                "Description": list(results.values())
            })
            # Display table
            st.dataframe(
                df_results,
                use_container_width=True,
                hide_index=True
            )

            # Show individual descriptions
            st.markdown("### Descriptions by Language")
            for lang_name, description in results.items():
                with st.expander(f"{lang_name}", expanded=(lang_name == "English")):
                    st.markdown(f"**{description}**")

            # Export option
            st.markdown("---")
            st.markdown("### 💾 Export Results")
            # Create export text
            export_text = f"""Multilingual Image Descriptions
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Image: {uploaded_file.name if uploaded_file else 'Unknown'}
"""
            for lang_name, description in results.items():
                export_text += f"\n{lang_name}:\n{description}\n"
            # Download button
            st.download_button(
                "📥 Download as TXT",
                export_text,
                f"descriptions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                "text/plain"
            )
        elif uploaded_file:
            st.info("👈 Click 'Generate Descriptions' to analyze the image")
    # Footer
    st.markdown("---")
    st.caption(
        "**Powered by:** BLIP-2 Multilingual Model • **UCAS @2025** • "
        "Model: Salesforce/blip2-opt-2.7b"
    )


if __name__ == "__main__":
    main()