Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| # modules/retriever.py | |
| from typing import List, Dict | |
| from openai import OpenAI | |
| from modules.youtube_metadata.embeddings import get_embedding | |
def _first_query_row(results, key: str) -> List:
    """Return the row for the single submitted query under *key*, or []."""
    # The field may be missing, None, or an empty outer list; treat all of
    # these as "no rows" rather than raising on the [0] lookup.
    rows = results.get(key) or [[]]
    return rows[0] or []


def retrieve_videos(
    query: str, collection, top_k: int = 3, channel_id: str = None
) -> List[Dict]:
    """Return the videos in *collection* closest to *query*.

    The query text is embedded with ``get_embedding`` and passed to
    ``collection.query``. When *channel_id* is truthy, the search is
    restricted with a ``where={"channel_id": channel_id}`` filter.

    Args:
        query: Free-text search string to embed.
        collection: Object exposing a Chroma-style ``query(...)`` method
            that returns a mapping with "metadatas", "documents" and
            "distances" entries.
        top_k: Maximum number of results to request.
        channel_id: Optional channel to filter on; falsy means no filter.

    Returns:
        One dict per returned metadata entry with the keys ``video_id``,
        ``video_title``, ``channel``, ``description`` and ``score``.
        ``score`` is the raw distance reported by the collection, or
        ``None`` when no distance is available for that position.
    """
    embedding = get_embedding(query)

    # Build the arguments once; the optional filter is the only difference
    # between the filtered and unfiltered searches.
    query_kwargs = {
        "query_embeddings": [embedding],
        "n_results": top_k,
        "include": ["metadatas", "documents", "distances"],
    }
    if channel_id:
        query_kwargs["where"] = {"channel_id": channel_id}
    results = collection.query(**query_kwargs)

    # Only one query embedding is submitted, so only the first row matters.
    metadatas = _first_query_row(results, "metadatas")
    documents = _first_query_row(results, "documents")
    distances = _first_query_row(results, "distances")

    videos = []
    for idx, meta in enumerate(metadatas):
        # NOTE(review): a record stored without metadata may come back as
        # None here; treat it as an empty mapping instead of crashing.
        meta = meta or {}
        # Resolve the document once, bounds-checked, so the title fallback
        # and the description cannot disagree or raise IndexError.
        document = documents[idx] if idx < len(documents) else ""
        videos.append(
            {
                "video_id": meta.get("video_id", ""),
                "video_title": meta.get(
                    "video_title", meta.get("title", document)
                ),
                "channel": meta.get("channel", meta.get("channel_title", "")),
                "description": document,
                "score": distances[idx] if idx < len(distances) else None,
            }
        )
    return videos