Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Script untuk membuat sample dataset JSONL untuk training | |
| """ | |
| import json | |
| import os | |
| from pathlib import Path | |
def create_sample_dataset(output_dir: "str | os.PathLike" = "data") -> None:
    """Write the built-in sample records to ``training_data.jsonl``.

    Ten fixed records (Indonesian question/answer text about machine
    learning, each with ``text``, ``category`` and ``language`` keys) are
    serialized as one JSON object per line.  An existing file with the same
    name is overwritten.  A summary and a preview of the first three records
    are printed to stdout.

    Args:
        output_dir: Directory that receives ``training_data.jsonl``.  It is
            created, including any missing parents, when it does not exist.
            Defaults to ``"data"`` relative to the current working directory.
    """
    sample_data = [
        {
            "text": "Apa itu machine learning? Machine learning adalah cabang dari artificial intelligence yang memungkinkan komputer belajar dari data tanpa diprogram secara eksplisit.",
            "category": "education",
            "language": "id",
        },
        {
            "text": "Jelaskan tentang deep learning. Deep learning adalah subset dari machine learning yang menggunakan neural network dengan banyak layer untuk memproses data kompleks.",
            "category": "education",
            "language": "id",
        },
        {
            "text": "Bagaimana cara kerja neural network? Neural network bekerja dengan menerima input, memproses melalui hidden layers, dan menghasilkan output berdasarkan weights yang telah dilatih.",
            "category": "education",
            "language": "id",
        },
        {
            "text": "Apa keuntungan menggunakan Python untuk AI? Python memiliki library yang lengkap seperti TensorFlow, PyTorch, dan scikit-learn yang memudahkan development AI.",
            "category": "programming",
            "language": "id",
        },
        {
            "text": "Jelaskan tentang transfer learning. Transfer learning adalah teknik menggunakan model yang sudah dilatih pada dataset besar dan mengadaptasinya untuk task yang lebih spesifik.",
            "category": "education",
            "language": "id",
        },
        {
            "text": "Bagaimana cara optimize model machine learning? Optimasi dapat dilakukan dengan hyperparameter tuning, feature engineering, dan menggunakan teknik seperti cross-validation.",
            "category": "optimization",
            "language": "id",
        },
        {
            "text": "Apa itu overfitting? Overfitting terjadi ketika model belajar terlalu detail dari training data sehingga performa pada data baru menurun.",
            "category": "education",
            "language": "id",
        },
        {
            "text": "Jelaskan tentang regularization. Regularization adalah teknik untuk mencegah overfitting dengan menambahkan penalty pada model complexity.",
            "category": "education",
            "language": "id",
        },
        {
            "text": "Bagaimana cara handle imbalanced dataset? Dataset tidak seimbang dapat diatasi dengan teknik sampling, class weights, atau menggunakan metrics yang tepat seperti F1-score.",
            "category": "data_handling",
            "language": "id",
        },
        {
            "text": "Apa itu ensemble learning? Ensemble learning menggabungkan multiple model untuk meningkatkan performa prediksi dan mengurangi variance.",
            "category": "education",
            "language": "id",
        },
    ]

    # parents=True so a nested output_dir works, not just a single level.
    data_dir = Path(output_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text
    # human-readable in the UTF-8 encoded file.
    output_file = data_dir / "training_data.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in sample_data
        )

    # NOTE(review): the leading 'β' / 'π' characters in the messages below
    # look like mis-decoded emoji; kept byte-for-byte — confirm the intended
    # glyphs against the upstream file.
    print(f"β Sample dataset created: {output_file}")
    print(f"π Total samples: {len(sample_data)}")
    print(f"π File size: {output_file.stat().st_size / 1024:.2f} KB")

    # Preview the first three records, truncating text to 100 characters.
    print("\nπ Sample content:")
    print("-" * 50)
    for i, item in enumerate(sample_data[:3], 1):
        print(f"Sample {i}:")
        print(f" Text: {item['text'][:100]}...")
        print(f" Category: {item['category']}")
        print(f" Language: {item['language']}")
        print()
def _parse_sample_count(raw: str, default: int = 10) -> int:
    """Return *raw* as a positive int, or *default* if it is blank, non-numeric or < 1."""
    try:
        count = int(raw) if raw else default
    except ValueError:
        return default
    return count if count >= 1 else default


def _safe_dataset_stem(raw: str, default: str = "custom_dataset") -> str:
    """Reduce a user-typed dataset name to a bare file stem inside the data dir."""
    # Keep only the final path component so "../x" or "a/b" cannot point
    # outside (or into a missing subdirectory of) the output directory.
    # Backslashes are normalised first so Windows-style input is covered too.
    stem = Path(raw.replace("\\", "/")).name
    # The prompt asks for a name without extension; tolerate one anyway
    # instead of producing "name.jsonl.jsonl".
    if stem.lower().endswith(".jsonl"):
        stem = stem[: -len(".jsonl")]
    stem = stem.strip()
    if stem in ("", ".", ".."):
        return default
    return stem


def create_custom_dataset(output_dir: "str | os.PathLike" = "data") -> None:
    """Interactively collect records from stdin and save them as JSONL.

    Prompts for a dataset name and a target number of samples, then for each
    sample asks for text, category (default ``"general"``) and language
    (default ``"id"``).  Entry stops when the target count is reached, when an
    empty text is entered, or when the user answers ``n``/``no`` to the
    "Continue?" prompt.  If at least one record was entered, the records are
    written one JSON object per line to ``<output_dir>/<name>.jsonl``,
    overwriting any existing file; otherwise nothing is written.

    The dataset name is reduced to a plain file stem (directory components
    and a trailing ``.jsonl`` are dropped); an empty result falls back to
    ``custom_dataset``.  A blank, non-numeric or non-positive sample count
    falls back to 10.

    Args:
        output_dir: Directory that receives the dataset file.  It is created,
            including any missing parents, when it does not exist.  Defaults
            to ``"data"`` relative to the current working directory.
    """
    # NOTE(review): the leading 'β' / 'π' characters in the messages below
    # look like mis-decoded emoji; kept byte-for-byte — confirm the intended
    # glyphs against the upstream file.
    print("π§ Create Custom Dataset")
    print("=" * 40)

    dataset_name = _safe_dataset_stem(
        input("Dataset name (without extension): ").strip()
    )
    num_samples = _parse_sample_count(
        input("Number of samples (default 10): ").strip()
    )

    print(f"\nπ Creating {num_samples} samples...")
    print("Format: Enter text for each sample (empty line to finish early)")

    custom_data = []
    for i in range(num_samples):
        print(f"\nSample {i+1}/{num_samples}:")
        text = input("Text: ").strip()
        if not text:
            print("Empty text, finishing...")
            break

        category = input("Category (optional): ").strip() or "general"
        language = input("Language (optional, default 'id'): ").strip() or "id"
        custom_data.append(
            {"text": text, "category": category, "language": language}
        )

        # No point asking after the last requested sample.
        if i < num_samples - 1:
            answer = input("Continue? (y/n, default y): ").strip().lower()
            if answer in ("n", "no"):
                break

    if not custom_data:
        print("β No data entered, dataset not created")
        return

    data_dir = Path(output_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text
    # human-readable in the UTF-8 encoded file.
    output_file = data_dir / f"{dataset_name}.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in custom_data
        )

    print(f"\nβ Custom dataset created: {output_file}")
    print(f"π Total samples: {len(custom_data)}")
def _print_existing_datasets(data_dir: Path) -> None:
    """Print name and size (KB) of every ``*.jsonl`` file directly inside *data_dir*."""
    # is_dir() rather than exists(): a regular file that happens to be named
    # like the data directory is not a dataset directory.
    if not data_dir.is_dir():
        print("\nπ Data directory does not exist")
        return

    # glob() yields entries in arbitrary order; sort for a stable listing and
    # skip anything that merely matches the pattern but is not a file.
    jsonl_files = sorted(p for p in data_dir.glob("*.jsonl") if p.is_file())
    if not jsonl_files:
        print("\nπ No JSONL files found in data/ directory")
        return

    print(f"\nπ Found {len(jsonl_files)} JSONL files:")
    for path in jsonl_files:
        size = path.stat().st_size / 1024
        print(f" - {path.name} ({size:.2f} KB)")


def main() -> None:
    """Show the interactive menu and run the chosen action.

    Reads one choice from stdin: ``1`` creates the built-in sample dataset,
    ``2`` starts interactive custom-dataset entry, ``3`` lists the ``*.jsonl``
    files in the ``data`` directory (relative to the current working
    directory).  Any other input prints an "invalid choice" message.
    """
    # NOTE(review): the leading 'β' / 'π' characters in the messages look
    # like mis-decoded emoji; kept byte-for-byte — confirm the intended
    # glyphs against the upstream file.
    print("π Dataset Creator for LLM Training")
    print("=" * 50)
    print("Pilih opsi:")
    print("1. Create sample dataset (10 samples)")
    print("2. Create custom dataset")
    print("3. View existing datasets")

    choice = input("\nPilihan (1-3): ").strip()
    if choice == "1":
        create_sample_dataset()
    elif choice == "2":
        create_custom_dataset()
    elif choice == "3":
        _print_existing_datasets(Path("data"))
    else:
        print("β Pilihan tidak valid")
# Run the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()