#!/usr/bin/env python3
"""
Script untuk membuat sample dataset JSONL untuk training
"""
import json
import os
from pathlib import Path
def create_sample_dataset():
    """Create sample JSONL dataset"""
    # Sample training data (Indonesian-language records, hence "language": "id")
    sample_data = [
        {
            "text": "Apa itu machine learning? Machine learning adalah cabang dari artificial intelligence yang memungkinkan komputer belajar dari data tanpa diprogram secara eksplisit.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Jelaskan tentang deep learning. Deep learning adalah subset dari machine learning yang menggunakan neural network dengan banyak layer untuk memproses data kompleks.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Bagaimana cara kerja neural network? Neural network bekerja dengan menerima input, memproses melalui hidden layers, dan menghasilkan output berdasarkan weights yang telah dilatih.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Apa keuntungan menggunakan Python untuk AI? Python memiliki library yang lengkap seperti TensorFlow, PyTorch, dan scikit-learn yang memudahkan development AI.",
            "category": "programming",
            "language": "id"
        },
        {
            "text": "Jelaskan tentang transfer learning. Transfer learning adalah teknik menggunakan model yang sudah dilatih pada dataset besar dan mengadaptasinya untuk task yang lebih spesifik.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Bagaimana cara optimize model machine learning? Optimasi dapat dilakukan dengan hyperparameter tuning, feature engineering, dan menggunakan teknik seperti cross-validation.",
            "category": "optimization",
            "language": "id"
        },
        {
            "text": "Apa itu overfitting? Overfitting terjadi ketika model belajar terlalu detail dari training data sehingga performa pada data baru menurun.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Jelaskan tentang regularization. Regularization adalah teknik untuk mencegah overfitting dengan menambahkan penalty pada model complexity.",
            "category": "education",
            "language": "id"
        },
        {
            "text": "Bagaimana cara handle imbalanced dataset? Dataset tidak seimbang dapat diatasi dengan teknik sampling, class weights, atau menggunakan metrics yang tepat seperti F1-score.",
            "category": "data_handling",
            "language": "id"
        },
        {
            "text": "Apa itu ensemble learning? Ensemble learning menggabungkan multiple model untuk meningkatkan performa prediksi dan mengurangi variance.",
            "category": "education",
            "language": "id"
        }
    ]

    # Create data directory
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    # Write to JSONL file: one JSON object per line
    output_file = data_dir / "training_data.jsonl"
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in sample_data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    print(f"βœ… Sample dataset created: {output_file}")
    print(f"πŸ“Š Total samples: {len(sample_data)}")
    print(f"πŸ“ File size: {output_file.stat().st_size / 1024:.2f} KB")

    # Show sample content
    print("\nπŸ“ Sample content:")
    print("-" * 50)
    for i, item in enumerate(sample_data[:3], 1):
        print(f"Sample {i}:")
        print(f"  Text: {item['text'][:100]}...")
        print(f"  Category: {item['category']}")
        print(f"  Language: {item['language']}")
        print()
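
# A minimal sketch (a hypothetical helper, not wired into the menu below):
# shows how the JSONL files written above can be read back, assuming one
# JSON object per line as produced by create_sample_dataset().
def load_jsonl(path):
    """Read a JSONL file back into a list of dicts (illustrative helper)."""
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                records.append(json.loads(line))
    return records
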
def create_custom_dataset():
    """Create custom dataset from user input"""
    print("πŸ”§ Create Custom Dataset")
    print("=" * 40)

    # Get dataset info
    dataset_name = input("Dataset name (without extension): ").strip()
    if not dataset_name:
        dataset_name = "custom_dataset"

    num_samples = input("Number of samples (default 10): ").strip()
    try:
        num_samples = int(num_samples) if num_samples else 10
    except ValueError:
        num_samples = 10

    print(f"\nπŸ“ Creating {num_samples} samples...")
    print("Format: Enter text for each sample (empty line to finish early)")

    custom_data = []
    for i in range(num_samples):
        print(f"\nSample {i+1}/{num_samples}:")
        text = input("Text: ").strip()
        if not text:
            print("Empty text, finishing...")
            break
        category = input("Category (optional): ").strip() or "general"
        language = input("Language (optional, default 'id'): ").strip() or "id"

        sample = {
            "text": text,
            "category": category,
            "language": language
        }
        custom_data.append(sample)

        # Ask if user wants to continue
        if i < num_samples - 1:
            continue_input = input("Continue? (y/n, default y): ").strip().lower()
            if continue_input in ['n', 'no']:
                break

    if not custom_data:
        print("❌ No data entered, dataset not created")
        return

    # Create data directory
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    # Write to JSONL file
    output_file = data_dir / f"{dataset_name}.jsonl"
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in custom_data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    print(f"\nβœ… Custom dataset created: {output_file}")
    print(f"πŸ“Š Total samples: {len(custom_data)}")
def main():
    print("πŸ“Š Dataset Creator for LLM Training")
    print("=" * 50)
    print("Select an option:")
    print("1. Create sample dataset (10 samples)")
    print("2. Create custom dataset")
    print("3. View existing datasets")

    choice = input("\nChoice (1-3): ").strip()

    if choice == "1":
        create_sample_dataset()
    elif choice == "2":
        create_custom_dataset()
    elif choice == "3":
        data_dir = Path("data")
        if data_dir.exists():
            jsonl_files = list(data_dir.glob("*.jsonl"))
            if jsonl_files:
                print(f"\nπŸ“ Found {len(jsonl_files)} JSONL files:")
                for file in jsonl_files:
                    size = file.stat().st_size / 1024
                    print(f"  - {file.name} ({size:.2f} KB)")
            else:
                print("\nπŸ“ No JSONL files found in data/ directory")
        else:
            print("\nπŸ“ Data directory does not exist")
    else:
        print("❌ Invalid choice")


if __name__ == "__main__":
    main()