Spaces:

Mustafa-albakkar
/

Mmo

Sleeping

App Files Files Community

Mustafa-albakkar commited on Oct 29

Commit

63f1c30

verified ·

1 Parent(s): 6cfad42

Upload main.py

Browse files

Files changed (1) hide show

main.py +555 -0

main.py ADDED Viewed

	@@ -0,0 +1,555 @@

+import os
+import pandas as pd
+from fastapi import FastAPI, HTTPException, Body
+from fastapi.responses import FileResponse
+from pydantic import BaseModel, Field
+from typing import List, Dict, Any, Optional, Union
+from datasets import load_dataset, Dataset, DatasetDict
+from huggingface_hub import HfApi, hf_hub_download
+from datetime import datetime, timezone
+import logging
+import uvicorn
+import random
+import mimetypes
+# --- Constants and Config ---
+HF_DATASET_ID = "agents-course/unit4-students-scores"
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+task_file_paths: Dict[str, str] = {}
+tool_threshold = 3
+step_threshold = 6
+questions_for_api: List[Dict[str, Any]] = []
+ground_truth_answers: Dict[str, str] = {}
+filtered_dataset = None
+ALLOWED_CACHE_BASE = os.path.abspath("/app/.cache")
+class ErrorResponse(BaseModel):
+    detail: str
+def load_questions():
+    """
+    Loads the GAIA dataset, filters questions based on tool/step counts,
+    populates 'questions_for_api' with data for the API (excluding sensitive/internal fields),
+    stores ground truth answers, and maps task IDs to their local file paths on the server.
+    """
+    global filtered_dataset
+    global questions_for_api
+    global ground_truth_answers
+    global task_file_paths
+    tempo_filtered = []
+    questions_for_api.clear()
+    ground_truth_answers.clear()
+    task_file_paths.clear()
+    logger.info("Starting to load and filter GAIA dataset (validation split)...")
+    try:
+        dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation", trust_remote_code=True)
+        logger.info(f"GAIA dataset validation split loaded. Features: {dataset.features}")
+    except Exception as e:
+        logger.error(f"Failed to load GAIA dataset: {e}", exc_info=True)
+        raise RuntimeError("Could not load the primary GAIA dataset.") from e
+    # --- Filtering Logic based on Annotator Metadata ---
+    for item in dataset:
+        metadata = item.get('Annotator Metadata')
+        if metadata:
+            num_tools_str = metadata.get('Number of tools')
+            num_steps_str = metadata.get('Number of steps')
+            if num_tools_str is not None and num_steps_str is not None:
+                try:
+                    num_tools = int(num_tools_str)
+                    num_steps = int(num_steps_str)
+                    if num_tools < tool_threshold and num_steps < step_threshold:
+                        tempo_filtered.append(item)
+                except ValueError:
+                    logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Could not convert tool/step count in metadata: tools='{num_tools_str}', steps='{num_steps_str}'.")
+            else:
+                 logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - 'Number of tools' or 'Number of steps' missing in Metadata.")
+        else:
+             logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Missing 'Annotator Metadata'.")
+    filtered_dataset = tempo_filtered
+    logger.info(f"Found {len(filtered_dataset)} questions matching the criteria (tools < {tool_threshold}, steps < {step_threshold}).")
+    processed_count = 0
+    for item in filtered_dataset:
+        # Extract data from the dataset item
+        task_id = item.get('task_id')
+        original_question_text = item.get('Question')
+        final_answer = item.get('Final answer')
+        local_file_path = item.get('file_path')
+        file_name = item.get('file_name')
+        if task_id and original_question_text and final_answer is not None:
+            # 1. Create the dictionary to be exposed via the API
+            #    (Includes 'file_name' for info, but excludes 'file_path')
+            processed_item = {
+                "task_id": str(task_id),
+                "question": str(original_question_text),
+                "Level": item.get("Level"),
+                "file_name": file_name,
+            }
+            processed_item = {k: v for k, v in processed_item.items() if v is not None}
+            questions_for_api.append(processed_item)
+            # 2. Store the ground truth answer separately
+            ground_truth_answers[str(task_id)] = str(final_answer)
+            # 3. Store the file path mapping if file details exist and are valid
+            if local_file_path and file_name:
+                 # Log if the path from the dataset isn't absolute (might indicate issues)
+                 if not os.path.isabs(local_file_path):
+                     logger.warning(f"Task {task_id}: Path '{local_file_path}' from dataset is not absolute. This might cause issues finding the file on the server.")
+                 if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
+                     task_file_paths[str(task_id)] = local_file_path
+                     logger.debug(f"Stored file path mapping for task_id {task_id}: {local_file_path}")
+                 else:
+                     logger.warning(f"File path '{local_file_path}' for task_id {task_id} does NOT exist or is not a file on server. Mapping skipped.")
+            elif task_id:
+                 if not local_file_path and not file_name:
+                     logger.debug(f"Task {task_id}: No 'file_path' or 'file_name' found in dataset item. No file mapping stored.")
+                 elif not local_file_path:
+                     logger.debug(f"Task {task_id}: 'file_path' is missing in dataset item (file_name: '{file_name}'). No file mapping stored.")
+                 else: # Not file_name
+                      logger.debug(f"Task {task_id}: 'file_name' is missing in dataset item (file_path: '{local_file_path}'). No file mapping stored.")
+            processed_count += 1
+        else:
+            logger.warning(f"Skipping item processing due to missing essential fields: task_id={task_id}, has_question={original_question_text is not None}, has_answer={final_answer is not None}")
+    logger.info(f"Successfully processed {processed_count} questions for the API.")
+    logger.info(f"Stored file path mappings for {len(task_file_paths)} tasks.")
+    if not questions_for_api:
+         logger.error("CRITICAL: No valid questions were loaded after filtering and processing. API endpoints like /questions will fail.")
+class Question(BaseModel):
+    task_id: str
+    question: str
+    Level: Optional[str] = None
+    file_name: Optional[str] = None
+# --- The rest of your Pydantic models remain the same ---
+class AnswerItem(BaseModel):
+    task_id: str
+    submitted_answer: str = Field(..., description="The agent's answer for the task_id")
+class Submission(BaseModel):
+    username: str = Field(..., description="Hugging Face username", min_length=1)
+    agent_code: str = Field(..., description="The Python class code for the agent")
+    answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")
+class ScoreResponse(BaseModel):
+    username: str
+    score: float
+    correct_count: int
+    total_attempted: int
+    message: str
+    timestamp: str
+class ErrorResponse(BaseModel):
+    detail: str
+class AnswerItem(BaseModel):
+    task_id: str
+    submitted_answer: Union[str, int, float]  = Field(..., description="The agent's answer for the task_id. Accepts str, int and float")
+class Submission(BaseModel):
+    username: str = Field(..., description="Hugging Face username", min_length=1)
+    agent_code: str = Field(..., description="The Python class code for the agent", min_length=10)
+    answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")
+class ScoreResponse(BaseModel):
+    username: str
+    score: float
+    correct_count: int
+    total_attempted: int
+    message: str
+    timestamp: str
+class ErrorResponse(BaseModel):
+    detail: str
+# --- FastAPI Application ---
+app = FastAPI(
+    title="Agent Evaluation API",
+    description="API to fetch questions and submit agent answers for scoring.",
+)
+# --- Startup Event ---
+@app.on_event("startup")
+async def startup_event():
+    logger.info("Application startup: Loading questions...")
+    try:
+        load_questions()
+        if not questions_for_api:
+            logger.error("CRITICAL: No questions were loaded during startup.")
+        else:
+            logger.info(f"Successfully loaded {len(questions_for_api)} questions.")
+    except Exception as e:
+        logger.error(f"CRITICAL ERROR DURING STARTUP while loading questions: {e}", exc_info=True)
+@app.get("/files/{task_id}",
+         summary="Get Associated File by Task ID",
+         description="Downloads the file associated with the given task_id, if one exists and is mapped.",
+         responses={
+             200: {
+                 "description": "File content.",
+                 "content": {"*/*": {}}
+             },
+             403: {"model": ErrorResponse, "description": "Access denied (e.g., path traversal attempt)."},
+             404: {"model": ErrorResponse, "description": "Task ID not found, no file associated, or file missing on server."},
+             500: {"model": ErrorResponse, "description": "Server error reading file."}
+         })
+async def get_task_file(task_id: str):
+    """
+    Serves the file associated with a specific task ID.
+    Includes security checks to prevent accessing arbitrary files.
+    """
+    logger.info(f"Request received for file associated with task_id: {task_id}")
+    if task_id not in task_file_paths:
+        logger.warning(f"File request failed: task_id '{task_id}' not found in file path mapping.")
+        raise HTTPException(status_code=404, detail=f"No file path associated with task_id {task_id}.")
+    # --- ASSIGNMENT HAPPENS HERE ---
+    local_file_path = task_file_paths[task_id]
+    logger.debug(f"Mapped task_id '{task_id}' to local path: {local_file_path}")
+    # --- CRUCIAL SECURITY CHECK ---
+    try:
+        abs_file_path = os.path.abspath(local_file_path)
+        abs_base_path = ALLOWED_CACHE_BASE # Already absolute
+        if not abs_file_path.startswith(abs_base_path):
+            logger.error(f"SECURITY ALERT: Path traversal attempt denied for task_id '{task_id}'. Path '{local_file_path}' resolves outside base '{abs_base_path}'.")
+            raise HTTPException(status_code=403, detail="File access denied.")
+        if not os.path.exists(abs_file_path) or not os.path.isfile(abs_file_path):
+             logger.error(f"File not found on server for task_id '{task_id}' at expected path: {abs_file_path}")
+             raise HTTPException(status_code=404, detail=f"File associated with task_id {task_id} not found on server disk.")
+    except HTTPException as http_exc:
+         raise http_exc
+    except Exception as path_err:
+         logger.error(f"Error resolving or checking path '{local_file_path}' for task_id '{task_id}': {path_err}", exc_info=True)
+         raise HTTPException(status_code=500, detail="Server error validating file path.")
+    mime_type, _ = mimetypes.guess_type(abs_file_path)
+    media_type = mime_type if mime_type else "application/octet-stream"
+    file_name_for_download = os.path.basename(abs_file_path)
+    logger.info(f"Serving file '{file_name_for_download}' (type: {media_type}) for task_id '{task_id}' from path: {abs_file_path}")
+    return FileResponse(path=abs_file_path, media_type=media_type, filename=file_name_for_download)
+def update_huggingface_dataset(username: str, score: float, code_link: str):
+    """
+    Loads the dataset, updates the score and code link if the score is higher,
+    and pushes back to the Hugging Face Hub.
+    Args:
+        username: The username of the participant.
+        score: The new score achieved by the participant.
+        code_link: The link to the code submission associated with this score.
+    Returns:
+        True if the dataset was updated and pushed, False otherwise.
+    Raises:
+        HTTPException: If there's an error interacting with the dataset.
+    """
+    try:
+        # Define the expected schema including the 'code' column
+        expected_columns = {
+            'username': 'str',
+            'score': 'float',
+            'timestamp': 'str',
+            'code': 'str' # Added the code column
+        }
+        # 1. Attempt to load the dataset
+        logger.info(f"Attempting to load dataset '{HF_DATASET_ID}'...")
+        ds_dict = None
+        df = None
+        try:
+            ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
+            logger.info("Dataset loaded successfully.")
+            if "train" in ds_dict:
+                df = ds_dict['train'].to_pandas()
+            else:
+                 logger.warning(f"Dataset '{HF_DATASET_ID}' loaded but no 'train' split found. Creating structure.")
+                 #df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
+        except Exception as load_error:
+            logger.error(f"CRITICAL: Could not load dataset '{HF_DATASET_ID}'. Error: {load_error}", exc_info=True)
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to load required dataset '{HF_DATASET_ID}': {load_error}"
+            )
+        for col, dtype in expected_columns.items():
+            if col not in df.columns:
+                logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
+                df[col] = pd.Series(dtype=dtype)
+        df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
+        df['username'] = df['username'].astype(str).fillna('')
+        df['timestamp'] = df['timestamp'].astype(str).fillna('')
+        df['code'] = df['code'].astype(str).fillna('')
+        # 2. Find existing score for the user
+        existing_entries = df[df['username'] == username]
+        current_timestamp = datetime.now(timezone.utc).isoformat()
+        needs_update = False
+        if not existing_entries.empty:
+            # User exists, find their highest score
+            max_existing_score = existing_entries['score'].max() # Already numeric
+            if score > max_existing_score:
+                logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
+                df = df[df['username'] != username].copy()
+                new_entry = pd.DataFrame([{
+                    'username': username,
+                    'score': score,
+                    'timestamp': current_timestamp,
+                    'code': code_link # Add the code link here
+                }])
+                df = pd.concat([df, new_entry], ignore_index=True)
+                needs_update = True
+            else:
+                logger.info(f"New score {score} is not higher than existing max {max_existing_score} for {username}. No update needed.")
+        else:
+            # User does not exist, add them
+            logger.info(f"User {username} not found. Adding new entry with score {score}.")
+            new_entry = pd.DataFrame([{
+                'username': username,
+                'score': score,
+                'timestamp': current_timestamp,
+                'code': code_link # Add the code link here
+            }])
+            df = pd.concat([df, new_entry], ignore_index=True)
+            needs_update = True
+        # 3. Push updated data back to Hugging Face Hub if changes were made
+        if needs_update:
+            logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
+            df = df[list(expected_columns.keys())]
+            for col, dtype in expected_columns.items():
+                 if dtype == 'str':
+                     df[col] = df[col].astype(str).fillna('')
+                 elif dtype == 'float':
+                     df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
+            logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
+            logger.info(f"Sample data before push:\n{df.head().to_string()}")
+            updated_ds = Dataset.from_pandas(df)
+            final_ds_dict = DatasetDict({'train': updated_ds})
+            logger.info(f"Dataset structure to push: {final_ds_dict}")
+            final_ds_dict.push_to_hub(HF_DATASET_ID)
+            logger.warning("Dataset push to hub is currently commented out in the code.")
+            return True
+        else:
+            logger.info("No changes needed, dataset not pushed.")
+            return False # No update was pushed
+    except Exception as e:
+        logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
+@app.get("/questions",
+         # Return a list of dictionaries with arbitrary keys/values
+         response_model=List[Dict[str, Any]],
+         summary="Get All Filtered Questions (Full Data)",
+         description="Returns the complete list of questions with all associated data (excluding answer/annotation) filtered based on criteria.")
+async def get_questions():
+    """
+    Provides the list of questions (with extended data) that agents should answer.
+    """
+    if not questions_for_api:
+         logger.error("GET /questions requested but no questions are loaded.")
+         raise HTTPException(status_code=404, detail="No questions available.")
+    # questions_for_api now contains the richer dictionaries
+    return questions_for_api
+@app.get("/random-question",
+         # Return a single dictionary with arbitrary keys/values
+         response_model=Dict[str, Any],
+         summary="Get One Random Question (Full Data)",
+         description="Returns a single random question with all associated data (excluding answer/annotation) from the available filtered set.",
+         responses={
+             200: {"description": "A random question with its full data."},
+             404: {"model": ErrorResponse, "description": "No questions available to choose from."}
+         })
+async def get_random_question():
+    """
+    Provides a single, randomly selected question with its extended data.
+    """
+    if not questions_for_api:
+        logger.warning("GET /random-question requested but no questions are loaded.")
+        raise HTTPException(status_code=404, detail="No questions available to choose from.")
+    # Select and return a random question dictionary
+    random_question = random.choice(questions_for_api)
+    logger.info(f"Returning random question with task_id: {random_question.get('task_id', 'N/A')}")
+    # random_question is already the richer dictionary
+    return random_question
+# --- Submit Endpoint (remains the same, uses ground_truth_answers) ---
+@app.post("/submit",
+          response_model=ScoreResponse,
+          summary="Submit Agent Answers",
+          description="Submit answers from an agent, calculate score, and update leaderboard on Hugging Face.",
+          responses={
+              200: {"description": "Submission successful, score calculated."},
+              400: {"model": ErrorResponse, "description": "Invalid input data."},
+              404: {"model": ErrorResponse, "description": "Task ID not found in submission or ground truth."},
+              500: {"model": ErrorResponse, "description": "Server error (e.g., failed to update dataset)."}
+          })
+async def submit_answers(submission: Submission = Body(...)):
+    """
+    Receives agent submissions:
+    - Validates input.
+    - Checks presence of agent code (basic anti-cheat).
+    - Calculates score based on submitted answers vs ground truth.
+    - Updates the score on the Hugging Face dataset if it's a new high score for the user.
+    """
+    logger.info(f"Received submission from username: {submission.username}")
+    # Basic check for agent code presence
+    if not submission.agent_code or len(submission.agent_code.strip()) < 10:
+        logger.warning(f"Submission rejected for {submission.username}: Agent code missing or too short.")
+        raise HTTPException(status_code=400, detail="Agent code is required and must be sufficiently long.")
+    if not submission.answers:
+         logger.warning(f"Submission rejected for {submission.username}: No answers provided.")
+         raise HTTPException(status_code=400, detail="No answers provided in the submission.")
+    correct_count = 0
+    total_attempted_in_payload = len(submission.answers)
+    valid_attempted_count = 0
+    processed_ids = set()
+    for answer_item in submission.answers:
+        task_id = str(answer_item.task_id)
+        submitted = str(answer_item.submitted_answer)
+        if task_id in processed_ids:
+             logger.warning(f"Duplicate task_id '{task_id}' in submission from {submission.username}. Skipping.")
+             continue
+        processed_ids.add(task_id)
+        if task_id not in ground_truth_answers:
+            logger.warning(f"Task ID '{task_id}' submitted by {submission.username} not found in ground truth list. Skipping this answer.")
+            continue
+        valid_attempted_count += 1
+        ground_truth = ground_truth_answers[task_id]
+        # Compare answers (case-insensitive, strip whitespace)
+        if submitted.strip().lower() == ground_truth.strip().lower():
+            correct_count += 1
+            logger.debug(f"Correct answer for {task_id} from {submission.username}")
+        else:
+             logger.debug(f"Incorrect answer for {task_id} from {submission.username}. Submitted: '{submitted}', Expected: '{ground_truth}'")
+    if valid_attempted_count == 0:
+        score = 0.0
+        message = f"Submission received, but no valid/matching task IDs were found in the {total_attempted_in_payload} answers provided."
+        logger.warning(f"No valid answers processed for {submission.username} out of {total_attempted_in_payload} submitted.")
+    elif not ground_truth_answers: # Prevent division by zero if no questions loaded
+         score = 0.0
+         message = "Score cannot be calculated because no ground truth answers are loaded."
+         logger.error(f"Cannot calculate score for {submission.username}: ground_truth_answers is empty.")
+    else:
+        # Score is based on correct answers divided by the TOTAL number of questions in the filtered set
+        score = round((correct_count / len(ground_truth_answers)) * 100, 2)
+        message = f"Score calculated successfully: {correct_count}/{len(ground_truth_answers)} total questions answered correctly ({valid_attempted_count} valid tasks attempted)."
+        if valid_attempted_count < total_attempted_in_payload:
+             message += f" ({total_attempted_in_payload - valid_attempted_count} submitted answers had invalid or duplicate task IDs)."
+        logger.info(f"Score for {submission.username}: {score}% ({correct_count}/{len(ground_truth_answers)} correct, based on {valid_attempted_count} valid attempts)")
+    # Update Hugging Face dataset
+    try:
+        updated = update_huggingface_dataset(submission.username, score, submission.agent_code)
+        if updated:
+             message += " High score updated on leaderboard."
+             logger.info(f"Leaderboard updated for {submission.username}.")
+        else:
+             message += " Score did not improve previous record, leaderboard not updated."
+             logger.info(f"Leaderboard not updated for {submission.username} as score was not higher.")
+    except HTTPException as http_exc:
+         # Propagate HTTPException from the helper function (e.g., 500 error)
+         raise http_exc
+    except Exception as e:
+         # Catch any other unexpected errors during HF update
+         logger.error(f"Unexpected error during dataset update for {submission.username}: {e}", exc_info=True)
+         raise HTTPException(status_code=500, detail="An unexpected error occurred while updating the leaderboard.")
+    return ScoreResponse(
+        username=submission.username,
+        score=score,
+        correct_count=correct_count,
+        # Return the count of *valid* attempts for clarity
+        total_attempted=valid_attempted_count,
+        message=message,
+        timestamp=datetime.now(timezone.utc).isoformat()
+    )
+# --- Run the application ---
+if __name__ == "__main__":
+    logger.info("Starting FastAPI server for local development...")
+    try:
+        load_questions() # Load questions before starting server
+        if not questions_for_api:
+             logger.error("EXITING: Cannot start server without loaded questions.")
+             # Optional: exit if questions are essential
+             # import sys
+             # sys.exit(1)
+        else:
+            local_port = int(os.getenv("PORT", "8000"))
+            logger.info(f"Running Uvicorn locally on http://127.0.0.1:{local_port}")
+            uvicorn.run(app, host="127.0.0.1", port=local_port, log_level="info")
+    except Exception as e:
+        logger.error(f"Failed to start server: {e}", exc_info=True)