import subprocess
import sys

# Install llama-cpp-python at runtime so it builds against the current OS (glibc)
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "llama-cpp-python==0.3.16"],
    check=True,
)

# ---------------- CONFIG ----------------
BASE_REPO_ID = "unsloth/Llama-3.2-3B-Instruct-GGUF"
BASE_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

FT_REPO_ID = "JoarP/Llama-3.2-3B-FineTome5K-gguf"
FT_FILENAME = "v1"

N_CTX = 2048
N_THREADS = None

# Imported only after the pip install above, so llama_cpp is guaranteed to exist
import gradio as gr
from agent import respond, build_prompt
from llama_cpp import Llama

# ------------- LOAD MODELS ON CPU --------------
print("Loading fine-tuned model...")
llm_ft = Llama.from_pretrained(
    repo_id=FT_REPO_ID,
    filename=FT_FILENAME,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
)

AVAILABLE_MODELS = {
    "Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)": llm_ft,
}

# ------------- Using one model for faster deployment during development -------------
# print("Loading base model...")
# llm_base = Llama.from_pretrained(
#     repo_id=BASE_REPO_ID,
#     filename=BASE_FILENAME,
#     n_ctx=N_CTX,
#     n_threads=N_THREADS,
# )
# AVAILABLE_MODELS = {
#     "Base: Llama 3.2 3B Instruct (q4_k_m)": llm_base,
# }

# try:
#     print("Attempting to load fine-tuned model...")
#     llm_ft = Llama.from_pretrained(
#         repo_id=FT_REPO_ID,
#         filename=FT_FILENAME,
#         n_ctx=N_CTX,
#         n_threads=N_THREADS,
#     )
#     AVAILABLE_MODELS["Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)"] = llm_ft
#     FT_LOAD_ERROR = None
# except Exception as e:
#     llm_ft = None
#     FT_LOAD_ERROR = str(e)
#     print(f"Could not load fine-tuned model yet: {e}")

# System messages:
SYSTEM_MESSAGE_WEATHER = """
You are a helpful assistant that answers user questions using any external
information provided in the system message.

The system message may include a section like:

  "You have executed the following tools (name, args, result):"

followed by one or more lines of the form:

  - tool_name(args_dict) -> result_value

Instructions:
- Treat these tool results as ground truth for the current reply.
- Use them to give a clear, concise, and friendly answer to the user's latest question.
- Do not repeat the raw tool logs verbatim unless it is natural to do so.
- You may summarize or rephrase the results in natural language.
- If multiple results are present, combine them into a single coherent answer.
- If no tool results are present, answer the question based on your own knowledge and the conversation history.
- Do not mention that you are using "tools" or "tool calls"; just speak as a normal assistant.

=== EXAMPLE ===
System (excerpt):
You have executed the following tools (name, args, result):
- get_temperature({'location': 'Berlin'}) -> 20
- get_weather({'location': 'Berlin'}) -> sunny

User: What is it like in Berlin right now?

Assistant: It's sunny in Berlin right now, with a temperature of about 20 degrees.
"""

SYSTEM_MESSAGE_GENERAL = """
You are a friendly, helpful, and knowledgeable AI assistant.

Your goals:
- Give clear, accurate, and concise answers.
- Be honest when you don't know something.
- Use the conversation history to stay consistent.
- Ask clarifying questions when the user's request is ambiguous.
- Avoid unnecessary repetition or overly long explanations.
- Be polite, neutral, and informative.

You can answer questions on any topic, including:
- general knowledge
- mathematics and reasoning
- writing and summarization
- programming and debugging
- everyday advice and explanations

Do not claim access to external tools, APIs, the internet, or real-time data.
All your responses must be based only on your internal knowledge and the
conversation context.

Your tone: helpful, calm, and professional.
"""
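
# For reference: a minimal sketch of the tool-log block that SYSTEM_MESSAGE_WEATHER
# expects the agent to append to the system message. This is an assumption about
# how agent.py formats its results (the real formatting lives there); it is shown
# only to make the prompt contract above concrete:
#
#   tool_results = [
#       ("get_temperature", {"location": "Berlin"}, 20),
#       ("get_weather", {"location": "Berlin"}, "sunny"),
#   ]
#   tool_log = "You have executed the following tools (name, args, result):\n" + "\n".join(
#       f"- {name}({args}) -> {result}" for name, args, result in tool_results
#   )
#   system_with_tools = SYSTEM_MESSAGE_WEATHER + "\n" + tool_log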

# ------------- WRAPPER FUNCTION ----------------
# Needed to be able to pass the llm to respond() inside agent.py
def app_respond(message, history, system_message, model_choice="Base: Llama 3.2 3B Instruct (q4_k_m)"):
    """
    Wrapper used by Gradio.
    - model_choice: string from the dropdown (key in AVAILABLE_MODELS)
    """
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        # Fallback: first model in dict
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Delegate to the core agent logic (which expects an llm object)
    for chunk in respond(message, history, system_message, llm):
        yield chunk


# ------------- No agent, just a single LLM call ----------------
def respond_fast(message, history, system_message, model_choice):
    """
    Fast path: no tools, no agent.
    Just a single LLM call with the given system message and chat history.
    """
    # Pick model from dropdown
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Build a simple chat-style prompt
    prompt = build_prompt(system_message, history, message)

    # Single streaming generation
    stream = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "System:"],
        stream=True,
    )

    # Yield the accumulated text so Gradio renders a growing response
    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial
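
# agent.py owns the actual prompt layout. The sketch below is an assumption,
# inferred only from the stop tokens ["User:", "System:"] used in respond_fast
# above and from the call site build_prompt(system_message, history, message).
# It is illustrative, not the real implementation:
#
#   def build_prompt(system_message, history, message):
#       lines = [f"System: {system_message}"]
#       for user_msg, bot_msg in history:
#           lines.append(f"User: {user_msg}")
#           lines.append(f"Assistant: {bot_msg}")
#       lines.append(f"User: {message}")
#       lines.append("Assistant:")
#       return "\n".join(lines)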

# ------------- GRADIO UI ----------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Finetuned Llama 3.2 3B (CPU, GGUF) in an Agentic Framework\n"
        "Switch between a general assistant and a live weather assistant."
    )

    with gr.Tabs():
        # -------- TAB 1: GENERAL LLM ASSISTANT --------
        with gr.Tab("💬 General Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### General Assistant\n"
                        "Chat with the base or fine-tuned model. Use this mode for any kind of question."
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )

                with gr.Column(scale=3, elem_id="general-chat"):
                    general_chatbot = gr.ChatInterface(
                        fn=respond_fast,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_GENERAL),
                            model_dropdown,
                        ],
                    )

        # -------- TAB 2: LIVE WEATHER ASSISTANT --------
        with gr.Tab("☀️ LIVE Weather Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### Live Weather Assistant\n"
                        "Fetches up-to-date weather data."
                    )
                    weather_model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )

                with gr.Column(scale=3, elem_id="weather-chat"):
                    # 🌤️ ASSISTANT HEADER (name + avatar + tagline)
                    gr.HTML(
                        """
                        <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 8px;">
                            <div style="width: 48px; height: 48px; border-radius: 50%;
                                        background: #4a90d9; color: white;
                                        display: flex; align-items: center; justify-content: center;
                                        font-size: 24px; font-weight: bold;">M</div>
                            <div>
                                <div style="font-weight: bold; font-size: 18px;">Meteo-Mila</div>
                                <div style="font-size: 14px;">
                                    I know everything about the current weather and temperature.<br>
                                    I can also provide forecasts into the future! 🌦️
                                </div>
                            </div>
                        </div>
                        """
                    )

                    weather_chatbot = gr.ChatInterface(
                        fn=app_respond,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_WEATHER),
                            weather_model_dropdown,
                        ],
                    )


if __name__ == "__main__":
    demo.launch()