# GPT-2 Gradio assistant that also snapshots a FineWeb sample to data.csv.
# Requires: datasets, transformers, torch, pandas, gradio.
import gradio as gr
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Step 1: Load dataset and save to CSV (auto-training data).
# The full HuggingFaceFW/fineweb corpus is multiple terabytes, so instead of
# load_dataset(..., split="train") (which would try to download everything),
# stream the "sample-10BT" config and persist only a small slice.
NUM_ROWS = 1000  # number of examples to keep locally


def load_and_save_dataset():
    print("Loading dataset from Hugging Face...")
    dataset = load_dataset(
        "HuggingFaceFW/fineweb",
        name="sample-10BT",
        split="train",
        streaming=True,
    )
    print(f"Saving the first {NUM_ROWS} rows to data.csv...")
    rows = list(dataset.take(NUM_ROWS))
    pd.DataFrame(rows).to_csv("data.csv", index=False)
    print("Done! Data saved to data.csv.")
    return f"Dataset loaded; first {NUM_ROWS} rows saved to data.csv."


# Run once on startup so data.csv exists before the UI comes up.
load_and_save_dataset()

# Step 2: Load GPT-2 model for inference.
model_name = "gpt2"  # or "distilgpt2" for faster inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# GPT-2 has no pad token; reuse EOS so generation does not warn about padding.
tokenizer.pad_token = tokenizer.eos_token

# Create the text-generation pipeline (device=-1 pins it to CPU).
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)


def generate_response(prompt):
    """Return a single sampled continuation of the prompt."""
    responses = generator(
        prompt,
        max_new_tokens=100,  # counts only generated tokens, unlike max_length
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    return responses[0]["generated_text"].strip()


# Step 3: Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## GPT-2 Based AI Assistant with Dataset Loaded from Hugging Face")

    gr.Textbox(
        value="Dataset is loaded on startup; use the button below to reload it.",
        interactive=False,
        lines=2,
    )
    fetch_button = gr.Button("Load Dataset and Save CSV")
    output_message = gr.Textbox(label="Status")
    fetch_button.click(load_and_save_dataset, outputs=output_message)

    gr.Markdown("### Ask the AI Assistant")
    prompt_input = gr.Textbox(label="Enter your prompt", placeholder="Say something...")
    response_output = gr.Textbox(label="Response", lines=10)
    gr.Button("Ask").click(generate_response, inputs=prompt_input, outputs=response_output)

demo.launch()
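

# --- Optional: fine-tuning sketch (not executed by the app) ---
# Step 1 calls data.csv "auto-training data", but the script above never
# actually trains on it. The function below is a minimal, hedged sketch of
# how GPT-2 could be fine-tuned on that CSV with the Hugging Face Trainer.
# Assumptions not taken from the original script: data.csv has a "text"
# column (true for FineWeb exports), and "./gpt2-fineweb" is a hypothetical
# output directory. Run this from a separate script rather than here, since
# demo.launch() blocks and training inside the UI process would freeze it.
def finetune_gpt2_on_csv(csv_path="data.csv", output_dir="./gpt2-fineweb"):
    from datasets import load_dataset
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        DataCollatorForLanguageModeling,
        Trainer,
        TrainingArguments,
    )

    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token
    lm = AutoModelForCausalLM.from_pretrained("gpt2")

    # Load the CSV produced by load_and_save_dataset() and tokenize its
    # "text" column; drop the raw columns so only model inputs remain.
    ds = load_dataset("csv", data_files=csv_path, split="train")
    ds = ds.map(
        lambda batch: tok(batch["text"], truncation=True, max_length=512),
        batched=True,
        remove_columns=ds.column_names,
    )

    trainer = Trainer(
        model=lm,
        args=TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=2,
            num_train_epochs=1,
        ),
        train_dataset=ds,
        # Causal-LM objective: labels are the input ids themselves (mlm=False).
        data_collator=DataCollatorForLanguageModeling(tok, mlm=False),
    )
    trainer.train()
    trainer.save_model(output_dir)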