Textilindo-AI / scripts / train_with_monitoring.py
#!/usr/bin/env python3
"""
Training script with full GPU monitoring and logging.
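Run it from the repository root, e.g.: python scripts/train_with_monitoring.py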
"""
import os
import sys
import time
import json
import psutil
import GPUtil
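# psutil and GPUtil are third-party dependencies (pip install psutil gputil)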
from pathlib import Path
from datetime import datetime
import logging
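# finetune_lora.py is expected to sit alongside this script and expose main()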
from finetune_lora import main as finetune_main
def setup_logging():
"""Setup logging dengan format yang lengkap"""
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = log_dir / f"training_{timestamp}.log"
# Setup logging format
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(log_file, encoding='utf-8'),
logging.StreamHandler(sys.stdout)
]
)
return logging.getLogger(__name__)
def get_system_info():
"""Get system information"""
info = {
"timestamp": datetime.now().isoformat(),
"cpu_count": psutil.cpu_count(),
"memory_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
"memory_available_gb": round(psutil.virtual_memory().available / (1024**3), 2),
"disk_usage": {}
}
# Disk usage
for partition in psutil.disk_partitions():
try:
usage = psutil.disk_usage(partition.mountpoint)
info["disk_usage"][partition.mountpoint] = {
"total_gb": round(usage.total / (1024**3), 2),
"used_gb": round(usage.used / (1024**3), 2),
"free_gb": round(usage.free / (1024**3), 2),
"percent": usage.percent
}
except PermissionError:
continue
return info
def get_gpu_info():
"""Get GPU information"""
try:
gpus = GPUtil.getGPUs()
gpu_info = []
for gpu in gpus:
gpu_info.append({
"id": gpu.id,
"name": gpu.name,
"memory_total_mb": gpu.memoryTotal,
"memory_used_mb": gpu.memoryUsed,
"memory_free_mb": gpu.memoryFree,
"memory_utilization_percent": gpu.memoryUtil * 100,
"gpu_utilization_percent": gpu.load * 100,
"temperature_celsius": gpu.temperature
})
return gpu_info
except Exception as e:
logging.warning(f"Could not get GPU info: {e}")
return []
def monitor_resources(logger, interval=30):
"""Monitor system resources during training"""
    logger.info("🔍 Starting resource monitoring...")
    start_time = time.time()
    monitoring_data = []
    # One snapshot file per monitoring session; each periodic save overwrites it
    monitoring_file = Path("logs") / f"monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
try:
while True:
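            # Sample resources every `interval` seconds; as a daemon thread this
            # loop simply dies with the main process when training finishes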
# Get current resource usage
current_time = time.time()
elapsed_time = current_time - start_time
# System info
system_info = get_system_info()
system_info["elapsed_time_seconds"] = elapsed_time
# GPU info
gpu_info = get_gpu_info()
# Memory usage
memory = psutil.virtual_memory()
system_info["memory_used_gb"] = round(memory.used / (1024**3), 2)
system_info["memory_percent"] = memory.percent
# CPU usage
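            # cpu_percent(interval=1) blocks for about one second while sampling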
system_info["cpu_percent"] = psutil.cpu_percent(interval=1)
# Combine all info
monitoring_entry = {
"timestamp": datetime.now().isoformat(),
"elapsed_time_seconds": elapsed_time,
"system": system_info,
"gpu": gpu_info
}
monitoring_data.append(monitoring_entry)
# Log summary
logger.info(f"โฑ๏ธ Elapsed: {elapsed_time/60:.1f}min | "
f"CPU: {system_info['cpu_percent']:.1f}% | "
f"RAM: {system_info['memory_percent']:.1f}%")
if gpu_info:
for gpu in gpu_info:
logger.info(f"๐ŸŽฎ GPU {gpu['id']}: "
f"Util: {gpu['gpu_utilization_percent']:.1f}% | "
f"Memory: {gpu['memory_utilization_percent']:.1f}% | "
f"Temp: {gpu['temperature_celsius']:.1f}ยฐC")
# Save monitoring data periodically
            if len(monitoring_data) % 10 == 0:  # Every 10 entries
                with open(monitoring_file, 'w') as f:
                    json.dump(monitoring_data, f, indent=2)
                logger.info(f"💾 Monitoring data saved: {monitoring_file}")
time.sleep(interval)
except KeyboardInterrupt:
        logger.info("⏹️ Resource monitoring stopped by user")
return monitoring_data
def main():
"""Main function untuk training dengan monitoring"""
print("๐Ÿš€ Training dengan Monitoring - Llama 3.1 8B LoRA")
print("=" * 60)
# Setup logging
logger = setup_logging()
# Log system information
    logger.info("🖥️ System Information:")
system_info = get_system_info()
for key, value in system_info.items():
if key != "disk_usage":
logger.info(f" {key}: {value}")
# Log GPU information
gpu_info = get_gpu_info()
if gpu_info:
        logger.info("🎮 GPU Information:")
for gpu in gpu_info:
logger.info(f" GPU {gpu['id']}: {gpu['name']}")
logger.info(f" Memory: {gpu['memory_total_mb']}MB total")
logger.info(f" Temperature: {gpu['temperature_celsius']}ยฐC")
else:
        logger.warning("⚠️ No GPU detected. Training will be very slow on CPU!")
# Check prerequisites
    logger.info("🔍 Checking prerequisites...")
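    # Paths below are relative to the current working directory and assume the
    # layout produced by download_model.py and create_sample_dataset.py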
# Check if model exists
model_path = Path("models/llama-3.1-8b-instruct")
if not model_path.exists():
        logger.error("❌ Base model not found. Please run download_model.py first!")
return
# Check if dataset exists
data_path = Path("data/training_data.jsonl")
if not data_path.exists():
        logger.error("❌ Training dataset not found. Please run create_sample_dataset.py first!")
return
# Check if config exists
config_path = Path("configs/llama_config.yaml")
if not config_path.exists():
        logger.error("❌ Model configuration not found. Please run download_model.py first!")
return
    logger.info("✅ All prerequisites met!")
# Start resource monitoring in background
import threading
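    # daemon=True lets the process exit even if the monitor loop is still running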
monitoring_thread = threading.Thread(
target=monitor_resources,
args=(logger, 30), # Monitor every 30 seconds
daemon=True
)
monitoring_thread.start()
# Start training
    logger.info("🚀 Starting LoRA fine-tuning...")
try:
finetune_main()
        logger.info("✅ Training completed successfully!")
except Exception as e:
        logger.error(f"❌ Training failed: {e}")
raise
finally:
        logger.info("📊 Training session ended")
        # The monitoring thread already writes periodic snapshots to
        # logs/monitoring_*.json; capture its return value if a single
        # consolidated dump is needed
        logger.info("💾 Monitoring snapshots are available under logs/")
if __name__ == "__main__":
main()