```python
# Gradio demo: zero-shot image classification with geolocal/StreetCLIP
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the model and processor
model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

def classify_image(image):
    # Example labels for zero-shot classification
    labels = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a tree"]
    # Preprocess the image and the candidate text labels together
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    # Run inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
    # logits_per_image holds the image-text similarity scores
    logits_per_image = outputs.logits_per_image
    # Softmax turns the similarity scores into probabilities over the labels
    probs = logits_per_image.softmax(dim=1)
    # Convert the probabilities to a plain Python list
    probs_list = probs.tolist()[0]
    # Map each label to its probability
    return {label: prob for label, prob in zip(labels, probs_list)}

# Define the Gradio interface
iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil"),
    outputs="label",
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify using Geolocal StreetCLIP",
)

# Launch the interface
iface.launch()
```
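Note that StreetCLIP is a CLIP model fine-tuned for street-level image geolocalization, so in its intended use the candidate labels are place names rather than generic objects like "cat" or "dog". The sketch below adapts the same pipeline to zero-shot country prediction; the `geolocate` function, the prompt template, and the country list are illustrative assumptions, not part of the original app.

```python
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

# Illustrative candidate countries (an assumption; substitute any label set)
countries = ["France", "Japan", "Brazil", "United States", "South Africa"]
prompts = [f"A street view photo in {c}." for c in countries]

def geolocate(image: Image.Image) -> dict:
    """Return a country -> probability mapping for a street-level photo."""
    inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits_per_image
    # Softmax over the candidate countries, exactly as in the demo above
    probs = logits.softmax(dim=1)[0].tolist()
    return dict(zip(countries, probs))

# Example usage (the file path is hypothetical):
# image = Image.open("street_scene.jpg")
# print(max(geolocate(image).items(), key=lambda kv: kv[1]))
```

The same `classify_image` structure carries over unchanged; only the label set and prompt wording differ, which is the point of zero-shot CLIP classification.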