```python
# Gradio demo: zero-shot image classification with geolocal/StreetCLIP
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the model and processor
model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

def classify_image(image):
    # Example labels for zero-shot classification
    labels = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a tree"]
    # Preprocess the image and the candidate text labels together
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    # Run inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
    # logits_per_image holds the image-text similarity scores
    logits_per_image = outputs.logits_per_image
    # Softmax turns the similarity scores into probabilities over the labels
    probs = logits_per_image.softmax(dim=1)
    # Convert the probabilities to a plain Python list
    probs_list = probs.tolist()[0]
    # Map each label to its probability
    return {label: prob for label, prob in zip(labels, probs_list)}

# Define the Gradio interface
iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil"),
    outputs="label",
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify using Geolocal StreetCLIP",
)

# Launch the interface
iface.launch()
```
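Note that StreetCLIP is a CLIP model fine-tuned for street-level image geolocalization, so in its intended use the candidate labels are place names rather than generic objects like "cat" or "dog". The sketch below adapts the same pipeline to zero-shot country prediction; the `geolocate` function, the prompt template, and the country list are illustrative assumptions, not part of the original app.

```python
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

# Illustrative candidate countries (an assumption; substitute any label set)
countries = ["France", "Japan", "Brazil", "United States", "South Africa"]
prompts = [f"A street view photo in {c}." for c in countries]

def geolocate(image: Image.Image) -> dict:
    """Return a country -> probability mapping for a street-level photo."""
    inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits_per_image
    # Softmax over the candidate countries, exactly as in the demo above
    probs = logits.softmax(dim=1)[0].tolist()
    return dict(zip(countries, probs))

# Example usage (the file path is hypothetical):
# image = Image.open("street_scene.jpg")
# print(max(geolocate(image).items(), key=lambda kv: kv[1]))
```

The same `classify_image` structure carries over unchanged; only the label set and prompt wording differ, which is the point of zero-shot CLIP classification.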