#!/usr/bin/env python3
"""
Training script with GPU monitoring and full logging.
"""
import os
import sys
import time
import json
import logging
import threading
from pathlib import Path
from datetime import datetime

import psutil
import GPUtil

from finetune_lora import main as finetune_main

def setup_logging():
    """Set up logging to both a timestamped file and stdout."""
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"training_{timestamp}.log"

    # Log to the file and to the console with the same format
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler(sys.stdout)
        ]
    )
    return logging.getLogger(__name__)

def get_system_info():
    """Collect basic CPU, memory, and disk usage information."""
    info = {
        "timestamp": datetime.now().isoformat(),
        "cpu_count": psutil.cpu_count(),
        "memory_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "memory_available_gb": round(psutil.virtual_memory().available / (1024**3), 2),
        "disk_usage": {}
    }

    # Disk usage per mounted partition
    for partition in psutil.disk_partitions():
        try:
            usage = psutil.disk_usage(partition.mountpoint)
            info["disk_usage"][partition.mountpoint] = {
                "total_gb": round(usage.total / (1024**3), 2),
                "used_gb": round(usage.used / (1024**3), 2),
                "free_gb": round(usage.free / (1024**3), 2),
                "percent": usage.percent
            }
        except PermissionError:
            # Some mount points (e.g. system volumes) are not readable
            continue

    return info

def get_gpu_info():
    """Collect per-GPU utilization, memory, and temperature via GPUtil."""
    try:
        gpus = GPUtil.getGPUs()
        gpu_info = []
        for gpu in gpus:
            gpu_info.append({
                "id": gpu.id,
                "name": gpu.name,
                "memory_total_mb": gpu.memoryTotal,
                "memory_used_mb": gpu.memoryUsed,
                "memory_free_mb": gpu.memoryFree,
                "memory_utilization_percent": gpu.memoryUtil * 100,
                "gpu_utilization_percent": gpu.load * 100,
                "temperature_celsius": gpu.temperature
            })
        return gpu_info
    except Exception as e:
        logging.warning(f"Could not get GPU info: {e}")
        return []
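
# Optional fallback, not used above: GPUtil depends on nvidia-smi being available on
# PATH. Since this project fine-tunes Llama with LoRA, PyTorch is presumably installed,
# so a torch-only fallback could look like the sketch below. It reports memory from
# PyTorch's allocator rather than driver-level usage, and the function name is
# illustrative, not part of the original script.
def get_gpu_info_torch():
    """Best-effort GPU info using torch.cuda when GPUtil/nvidia-smi is unavailable."""
    try:
        import torch
        if not torch.cuda.is_available():
            return []
        gpu_info = []
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            gpu_info.append({
                "id": i,
                "name": props.name,
                "memory_total_mb": round(props.total_memory / (1024**2), 2),
                # Allocator statistics, not the same as nvidia-smi's "used" memory
                "memory_allocated_mb": round(torch.cuda.memory_allocated(i) / (1024**2), 2),
                "memory_reserved_mb": round(torch.cuda.memory_reserved(i) / (1024**2), 2),
            })
        return gpu_info
    except Exception as e:
        logging.warning(f"Could not get GPU info via torch: {e}")
        return []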

def monitor_resources(logger, interval=30):
    """Monitor system resources during training, logging a summary every `interval` seconds."""
    logger.info("Starting resource monitoring...")
    start_time = time.time()
    monitoring_data = []

    try:
        while True:
            # Elapsed time since monitoring started
            current_time = time.time()
            elapsed_time = current_time - start_time

            # System info
            system_info = get_system_info()
            system_info["elapsed_time_seconds"] = elapsed_time

            # GPU info
            gpu_info = get_gpu_info()

            # Memory usage
            memory = psutil.virtual_memory()
            system_info["memory_used_gb"] = round(memory.used / (1024**3), 2)
            system_info["memory_percent"] = memory.percent

            # CPU usage (averaged over a 1-second sample)
            system_info["cpu_percent"] = psutil.cpu_percent(interval=1)

            # Combine all info into one snapshot
            monitoring_entry = {
                "timestamp": datetime.now().isoformat(),
                "elapsed_time_seconds": elapsed_time,
                "system": system_info,
                "gpu": gpu_info
            }
            monitoring_data.append(monitoring_entry)

            # Log summary
            logger.info(f"Elapsed: {elapsed_time/60:.1f}min | "
                        f"CPU: {system_info['cpu_percent']:.1f}% | "
                        f"RAM: {system_info['memory_percent']:.1f}%")

            if gpu_info:
                for gpu in gpu_info:
                    logger.info(f"GPU {gpu['id']}: "
                                f"Util: {gpu['gpu_utilization_percent']:.1f}% | "
                                f"Memory: {gpu['memory_utilization_percent']:.1f}% | "
                                f"Temp: {gpu['temperature_celsius']:.1f}°C")

            # Save monitoring data periodically (every 10 entries).
            # Note: each periodic save writes a new timestamped file.
            if len(monitoring_data) % 10 == 0:
                monitoring_file = Path("logs") / f"monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                with open(monitoring_file, 'w') as f:
                    json.dump(monitoring_data, f, indent=2)
                logger.info(f"Monitoring data saved: {monitoring_file}")

            time.sleep(interval)
    except KeyboardInterrupt:
        logger.info("Resource monitoring stopped by user")

    return monitoring_data
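
# Optional sketch, not wired into main() below: a stoppable variant of the monitoring
# loop. Using a threading.Event instead of a fire-and-forget daemon thread would let
# main() stop the monitor cleanly and collect its snapshots, which the final-save note
# in main() currently leaves open. The names here (run_monitoring, stop_event, results)
# are illustrative, not part of the original script.
def run_monitoring(logger, stop_event, results, interval=30):
    """Append resource snapshots to `results` until `stop_event` is set."""
    start_time = time.time()
    while not stop_event.is_set():
        results.append({
            "timestamp": datetime.now().isoformat(),
            "elapsed_time_seconds": time.time() - start_time,
            "system": get_system_info(),
            "gpu": get_gpu_info(),
        })
        logger.info(f"Monitoring snapshot #{len(results)} collected")
        # wait() returns early as soon as the event is set
        stop_event.wait(interval)

# Example wiring inside main(), replacing the daemon thread used there:
#   stop_event = threading.Event()
#   snapshots = []
#   t = threading.Thread(target=run_monitoring, args=(logger, stop_event, snapshots))
#   t.start()
#   finetune_main()
#   stop_event.set()
#   t.join()
#   with open(monitoring_file, "w") as f:
#       json.dump(snapshots, f, indent=2)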

def main():
    """Run LoRA fine-tuning with resource monitoring."""
    print("Training with Monitoring - Llama 3.1 8B LoRA")
    print("=" * 60)

    # Setup logging
    logger = setup_logging()

    # Log system information
    logger.info("System Information:")
    system_info = get_system_info()
    for key, value in system_info.items():
        if key != "disk_usage":
            logger.info(f"  {key}: {value}")

    # Log GPU information
    gpu_info = get_gpu_info()
    if gpu_info:
        logger.info("GPU Information:")
        for gpu in gpu_info:
            logger.info(f"  GPU {gpu['id']}: {gpu['name']}")
            logger.info(f"    Memory: {gpu['memory_total_mb']}MB total")
            logger.info(f"    Temperature: {gpu['temperature_celsius']}°C")
    else:
        logger.warning("No GPU detected. Training will be very slow on CPU!")

    # Check prerequisites
    logger.info("Checking prerequisites...")

    # Check that the base model exists
    model_path = Path("models/llama-3.1-8b-instruct")
    if not model_path.exists():
        logger.error("Base model not found. Please run download_model.py first!")
        return

    # Check that the training dataset exists
    data_path = Path("data/training_data.jsonl")
    if not data_path.exists():
        logger.error("Training dataset not found. Please run create_sample_dataset.py first!")
        return

    # Check that the model configuration exists
    config_path = Path("configs/llama_config.yaml")
    if not config_path.exists():
        logger.error("Model configuration not found. Please run download_model.py first!")
        return

    logger.info("All prerequisites met!")

    # Start resource monitoring in a background daemon thread
    monitoring_thread = threading.Thread(
        target=monitor_resources,
        args=(logger, 30),  # monitor every 30 seconds
        daemon=True
    )
    monitoring_thread.start()

    # Start training
    logger.info("Starting LoRA fine-tuning...")
    try:
        finetune_main()
        logger.info("Training completed successfully!")
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise
    finally:
        logger.info("Training session ended")
        # Save final monitoring data
        monitoring_file = Path("logs") / f"final_monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        # Note: in a real implementation you would capture the monitoring data from the
        # daemon thread (the return value of monitor_resources) before writing this file;
        # nothing is actually written here.
        logger.info(f"Final monitoring data saved: {monitoring_file}")


if __name__ == "__main__":
    main()