""" Modal Job Submission Module Handles submission of SMOLTRACE evaluation jobs to Modal's serverless compute platform. """ import os import sys import uuid from typing import Dict, Optional, List def submit_modal_job( model: str, provider: str, agent_type: str, hardware: str, dataset_name: str, split: str = "train", difficulty: str = "all", parallel_workers: int = 1, hf_token: Optional[str] = None, hf_inference_provider: Optional[str] = None, search_provider: str = "duckduckgo", enable_tools: Optional[List[str]] = None, output_format: str = "hub", output_dir: Optional[str] = None, enable_otel: bool = True, enable_gpu_metrics: bool = True, private: bool = False, debug: bool = False, quiet: bool = False, run_id: Optional[str] = None ) -> Dict: """ Submit an evaluation job to Modal Args: model: Model identifier (e.g., "openai/gpt-4") provider: Provider type ("litellm", "inference", "transformers") agent_type: Agent type ("tool", "code", "both") hardware: Hardware type (e.g., "auto", "gpu_a10", "gpu_h200") dataset_name: HuggingFace dataset for evaluation split: Dataset split to use difficulty: Difficulty filter parallel_workers: Number of parallel workers hf_token: HuggingFace token hf_inference_provider: HF Inference provider search_provider: Search provider for agents enable_tools: List of tools to enable output_format: Output format ("hub" or "json") output_dir: Output directory for JSON format enable_otel: Enable OpenTelemetry tracing enable_gpu_metrics: Enable GPU metrics collection private: Make datasets private debug: Enable debug mode quiet: Enable quiet mode run_id: Optional run ID (auto-generated if not provided) Returns: dict: Job submission result with job_id, status, and details """ try: import modal except ImportError: return { "success": False, "error": "Modal package not installed. Install with: pip install modal", "job_id": None } # Validate Modal credentials modal_token_id = os.environ.get("MODAL_TOKEN_ID") modal_token_secret = os.environ.get("MODAL_TOKEN_SECRET") if not modal_token_id or not modal_token_secret: return { "success": False, "error": "Modal credentials not configured. 
    # Generate job ID
    job_id = run_id if run_id else f"job_{uuid.uuid4().hex[:8]}"

    # Map hardware to Modal GPU types
    hardware_map = {
        "auto": _auto_select_modal_hardware(provider, model),
        "cpu": None,  # CPU only
        "gpu_t4": "T4",
        "gpu_l4": "L4",
        "gpu_a10": "A10G",
        "gpu_l40s": "L40S",
        "gpu_a100": "A100",
        "gpu_a100_80gb": "A100-80GB",
        "gpu_h100": "H100",
        "gpu_h200": "H200",
        "gpu_b200": "B200"
    }
    modal_gpu = hardware_map.get(hardware, "A10G")

    # Build environment variables
    env_vars = {
        "HF_TOKEN": hf_token or os.environ.get("HF_TOKEN", ""),
    }

    # Add LLM provider API keys from environment
    llm_key_names = [
        "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY",
        "COHERE_API_KEY", "MISTRAL_API_KEY", "TOGETHER_API_KEY", "GROQ_API_KEY",
        "REPLICATE_API_TOKEN", "ANYSCALE_API_KEY", "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY", "AWS_REGION", "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT", "LITELLM_API_KEY"
    ]
    for key_name in llm_key_names:
        value = os.environ.get(key_name)
        if value:
            env_vars[key_name] = value

    # Build SMOLTRACE command
    cmd_parts = ["smoltrace-eval"]
    cmd_parts.append(f"--model {model}")
    cmd_parts.append(f"--provider {provider}")
    if hf_inference_provider:
        cmd_parts.append(f"--hf-inference-provider {hf_inference_provider}")
    cmd_parts.append(f"--search-provider {search_provider}")
    if enable_tools:
        cmd_parts.append(f"--enable-tools {','.join(enable_tools)}")
    cmd_parts.append(f"--agent-type {agent_type}")
    cmd_parts.append(f"--dataset-name {dataset_name}")
    cmd_parts.append(f"--split {split}")
    if difficulty != "all":
        cmd_parts.append(f"--difficulty {difficulty}")
    if parallel_workers > 1:
        cmd_parts.append(f"--parallel-workers {parallel_workers}")
    cmd_parts.append(f"--output-format {output_format}")
    if output_dir and output_format == "json":
        cmd_parts.append(f"--output-dir {output_dir}")
    if enable_otel:
        cmd_parts.append("--enable-otel")
    if not enable_gpu_metrics:
        cmd_parts.append("--disable-gpu-metrics")
    if private:
        cmd_parts.append("--private")
    if debug:
        cmd_parts.append("--debug")
    if quiet:
        cmd_parts.append("--quiet")
    cmd_parts.append(f"--run-id {job_id}")

    command = " ".join(cmd_parts)

    # Create Modal app dynamically
    try:
        app = modal.App(f"smoltrace-eval-{job_id}")

        # Detect current Python version dynamically (must match for serialized=True)
        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

        # Define Modal function with appropriate base image
        # Note: Must match local Python version when using serialized=True
        if modal_gpu:
            # Use GPU-optimized image with CUDA for GPU jobs (latest stable CUDA)
            image = modal.Image.from_registry(
                "nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04",
                add_python=python_version  # Dynamically match current environment
            ).pip_install([
                "smoltrace",
                "ddgs",            # DuckDuckGo search
                "litellm",
                "transformers",
                "torch",
                "accelerate",      # Required for GPU device_map
                "bitsandbytes",    # For quantization support
                "sentencepiece",   # For some tokenizers
                "protobuf",        # For some models
                "hf_transfer",     # Fast HuggingFace downloads
                "nvidia-ml-py"     # GPU metrics collection
            ]).env({
                # Enable fast downloads and verbose logging
                "HF_HUB_ENABLE_HF_TRANSFER": "1",
                "TRANSFORMERS_VERBOSITY": "info",
                "HF_HUB_VERBOSITY": "info"
            })
        else:
            # Use lightweight image for CPU jobs
            image = modal.Image.debian_slim(python_version=python_version).pip_install([
                "smoltrace",
                "ddgs",     # DuckDuckGo search
                "litellm"
            ])

        @app.function(
            image=image,
            gpu=modal_gpu if modal_gpu else None,
            secrets=[
                modal.Secret.from_dict(env_vars)
            ],
            timeout=3600,      # 1 hour timeout
            serialized=True    # Required for functions defined in local scope
        )
        def run_evaluation(command_to_run: str):
            """Run SMOLTRACE evaluation on Modal."""
            import subprocess
            import sys

            print("=" * 80)
            print("Starting SMOLTRACE evaluation on Modal")
            print(f"Command: {command_to_run}")
            print(f"Python version: {sys.version}")

            # Show GPU info if available
            try:
                import torch
                if torch.cuda.is_available():
                    print(f"GPU: {torch.cuda.get_device_name(0)}")
                    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
            except Exception:
                pass

            print("=" * 80)
            print("\nNote: Model download may take several minutes for large models (14B = ~28GB)")
            print("Downloading and initializing model...\n")

            try:
                # Stream output in real time instead of capturing it, so progress is visible
                result = subprocess.run(
                    command_to_run,
                    shell=True,
                    capture_output=False,
                    text=True
                )

                # Since output is not captured, report only the return code
                print("\n" + "=" * 80)
                print("EVALUATION COMPLETED")
                print(f"Return code: {result.returncode}")
                print("=" * 80)

                return {
                    "returncode": result.returncode,
                    "stdout": "Check Modal logs for full output (streaming mode)",
                    "stderr": ""
                }
            except Exception as e:
                error_msg = f"Error running evaluation: {str(e)}"
                print("\n" + "=" * 80)
                print("EVALUATION FAILED")
                print(error_msg)
                print("=" * 80)
                import traceback
                traceback.print_exc()
                return {
                    "returncode": -1,
                    "stdout": "",
                    "stderr": error_msg
                }

        # Submit the job using Modal's remote() in a background thread.
        # Note: spawn() doesn't work well with dynamically created apps;
        # remote() ensures the job actually executes, threading keeps the UI responsive.
        import threading

        # Store results in a shared dict since the job runs in a thread
        result_container = {"modal_call_id": None, "started": False}

        def run_job_on_modal():
            """Run the Modal job in a background thread."""
            try:
                with app.run():
                    # Use remote() instead of spawn() for dynamic apps;
                    # this ensures the function actually executes
                    function_call = run_evaluation.remote(command)
                    result_container["started"] = True
                    print(f"Modal job completed with return code: {function_call.get('returncode', 'unknown')}")
            except Exception as e:
                print(f"Error running Modal job: {e}")
                result_container["error"] = str(e)

        # Start the job in a background thread so we don't block the UI
        job_thread = threading.Thread(target=run_job_on_modal, daemon=True)
        job_thread.start()

        # Give Modal a moment to start the job and capture any immediate errors
        import time
        time.sleep(2)

        # Use job_id as the tracking ID since remote() doesn't expose a call_id
        modal_call_id = f"modal-{job_id}"

        return {
            "success": True,
            "job_id": job_id,
            "modal_call_id": modal_call_id,  # Synthetic tracking ID derived from job_id
            "platform": "Modal",
            "hardware": modal_gpu or "CPU",
            "command": command,
            "status": "submitted",
            "message": f"Job successfully submitted to Modal (hardware: {modal_gpu or 'CPU'})",
            "instructions": f"""
✅ Job submitted successfully!

**Job Details:**
- Run ID: {job_id}
- Modal Call ID: {modal_call_id}
- Hardware: {modal_gpu or 'CPU'}
- Platform: Modal (serverless compute)

**What happens next:**
1. Job starts running on Modal infrastructure
2. For GPU jobs: Model downloads first (14B models = ~28GB, can take 10-15 min)
3. SMOLTRACE evaluates your model
4. Results are automatically pushed to HuggingFace datasets
5. They will appear in the TraceMind leaderboard when complete
**Monitoring**: Check the Modal dashboard for real-time logs and progress: https://modal.com/apps

**Expected Duration**:
- CPU jobs (API models): 2-5 minutes
- GPU jobs (local models): 15-30 minutes (includes model download)

**Cost**: Modal charges per-second usage. Estimated cost: $0.01-1.00 depending on model size and hardware.
""".strip()
        }

    except Exception as e:
        error_msg = str(e)

        # Check for common Modal errors
        if "MODAL_TOKEN_ID" in error_msg or "authentication" in error_msg.lower():
            return {
                "success": False,
                "error": "Modal authentication failed. Please verify your MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in Settings.",
                "job_id": job_id,
                "troubleshooting": """
**Steps to fix:**
1. Go to https://modal.com/settings/tokens
2. Create a new token
3. Copy Token ID (starts with 'ak-') and Token Secret (starts with 'as-')
4. Add them to Settings in TraceMind
5. Try again
""".strip()
            }
        else:
            return {
                "success": False,
                "error": f"Failed to submit Modal job: {error_msg}",
                "job_id": job_id,
                "command": command
            }


def _auto_select_modal_hardware(provider: str, model: str) -> Optional[str]:
    """
    Automatically select Modal hardware based on model and provider.

    Memory estimation for agentic workloads:
    - Model weights (FP16): ~2GB per 1B params
    - KV cache for long contexts: ~1.5-2x model size for agentic tasks
    - Inference overhead: ~20-30% additional
    - Total: ~4-5GB per 1B params for safe agentic execution

    Args:
        provider: Provider type
        model: Model identifier

    Returns:
        str: Modal GPU type, or None for CPU
    """
    # API models don't need a GPU
    if provider in ["litellm", "inference"]:
        return None

    # Local models need a GPU - select based on model size.
    # Conservative allocation for agentic tasks (model weights + KV cache + inference overhead):
    # roughly 4-5GB per 1B params for safe agentic execution.
    model_lower = model.lower()

    # Extract model size using a regex that captures the number before 'b'
    import re
    size_match = re.search(r'(\d+\.?\d*)b', model_lower)

    if size_match:
        model_size = float(size_match.group(1))

        # Complete coverage from 0.5B to 100B+ with no gaps
        if model_size >= 49:
            # 49B-100B+: H200 (140GB VRAM)
            return "H200"
        elif model_size >= 25:
            # 25B-48B: A100-80GB (e.g., Gemma-27B, Kimi-48B, 30B, 34B)
            return "A100-80GB"
        elif model_size >= 13:
            # 13B-24B: A100-80GB (e.g., 13B, 14B, 15B, 20B, 22B)
            return "A100-80GB"
        elif model_size >= 6:
            # 6B-12B: L40S 48GB (e.g., 6B, 7B, 8B, 9B, 10B, 11B, 12B)
            return "L40S"
        elif model_size >= 1:
            # 1B-5B: T4 16GB (e.g., 1B, 2B, 3B, 4B, 5B)
            return "T4"
        else:
            # < 1B: T4 16GB
            return "T4"
    else:
        # No size detected in model name - default to L40S (safe middle ground)
        return "L40S"
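

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of how this module might be driven from a script, not part
# of the submission API. The model id, dataset name, and provider below are
# placeholders chosen for illustration; real callers should supply their own
# values and have MODAL_TOKEN_ID / MODAL_TOKEN_SECRET (and HF_TOKEN) exported.
if __name__ == "__main__":
    # The "auto" hardware path resolves via _auto_select_modal_hardware; two
    # illustrative probes of that heuristic:
    print(_auto_select_modal_hardware("litellm", "openai/gpt-4"))                   # None -> CPU (API model)
    print(_auto_select_modal_hardware("transformers", "meta-llama/Llama-3.1-8B"))   # "L40S" (8B local model)

    result = submit_modal_job(
        model="Qwen/Qwen2.5-7B-Instruct",                  # placeholder model id
        provider="transformers",                            # local model -> GPU auto-selection
        agent_type="tool",
        hardware="auto",
        dataset_name="example-org/example-eval-dataset",    # placeholder dataset
    )
    if result["success"]:
        print(f"Submitted job {result['job_id']} on {result['hardware']}")
        print(result["instructions"])
    else:
        print(f"Submission failed: {result['error']}")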