import subprocess
import sys
# Install llama-cpp-python at runtime so it builds against the current OS (glibc)
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "llama-cpp-python==0.3.16"],
    check=True,
)
# ---------------- CONFIG ----------------
BASE_REPO_ID = "unsloth/Llama-3.2-3B-Instruct-GGUF"
BASE_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
FT_REPO_ID = "JoarP/Llama-3.2-3B-FineTome5K-gguf"
FT_FILENAME = "v1"
N_CTX = 2048
N_THREADS = None
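# N_THREADS = None is assumed to let llama-cpp-python fall back to its default
# thread count for the host CPU.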
import gradio as gr
from agent import respond, build_prompt
from llama_cpp import Llama
# ------------- LOAD MODELS ON CPU --------------
print("Loading finetuned model")
llm_ft = Llama.from_pretrained(
    repo_id=FT_REPO_ID,
    filename=FT_FILENAME,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
)
AVAILABLE_MODELS = {
    "Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)": llm_ft,
}
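# AVAILABLE_MODELS maps each dropdown label to a loaded Llama instance;
# adding an entry here exposes another model in the UI.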
# ------------- Using one model for faster deployment during development -------------------
# print("Loading base model...")
# llm_base = Llama.from_pretrained(
# repo_id=BASE_REPO_ID,
# filename=BASE_FILENAME,
# n_ctx=N_CTX,
# n_threads=N_THREADS,
# )
# AVAILABLE_MODELS = {
# "Base: Llama 3.2 3B Instruct (q4_k_m)": llm_base,
# }
# try:
# print("Attempting to load fine-tuned model...")
# llm_ft = Llama.from_pretrained(
# repo_id=FT_REPO_ID,
# filename=FT_FILENAME,
# n_ctx=N_CTX,
# n_threads=N_THREADS,
# )
# AVAILABLE_MODELS["Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)"] = llm_ft
# FT_LOAD_ERROR = None
# except Exception as e:
# llm_ft = None
# FT_LOAD_ERROR = str(e)
# print(f"Could not load fine-tuned model yet: {e}")
# ------------- SYSTEM MESSAGES ----------------
SYSTEM_MESSAGE_WEATHER = """
You are a helpful assistant that answers user questions using any external information provided in the system message.
The system message may include a section like:
"You have executed the following tools (name, args, result):"
followed by one or more lines of the form:
- tool_name(args_dict) -> result_value
Instructions:
- Treat these tool results as ground truth for the current reply.
- Use them to give a clear, concise, and friendly answer to the user's latest question.
- Do not repeat the raw tool logs verbatim unless it is natural to do so.
- You may summarize or rephrase the results in natural language.
- If multiple results are present, combine them into a single coherent answer.
- If no tool results are present, answer the question based on your own knowledge and the conversation history.
- Do not mention that you are using “tools” or “tool calls”; just speak as a normal assistant.
=== EXAMPLE ===
System (excerpt):
You have executed the following tools (name, args, result):
- get_temperature({'location': 'Berlin'}) -> 20
- get_weather({'location': 'Berlin'}) -> sunny
User:
What is it like in Berlin right now?
Assistant:
It's sunny in Berlin right now, with a temperature of about 20 degrees.
"""
SYSTEM_MESSAGE_GENERAL = """
You are a friendly, helpful, and knowledgeable AI assistant.
Your goals:
- Give clear, accurate, and concise answers.
- Be honest when you don't know something.
- Use the conversation history to stay consistent.
- Ask clarifying questions when the user’s request is ambiguous.
- Avoid unnecessary repetition or overly long explanations.
- Be polite, neutral, and informative.
You can answer questions on any topic, including:
- general knowledge
- mathematics and reasoning
- writing and summarization
- programming and debugging
- everyday advice and explanations
Do not claim access to external tools, APIs, the internet, or real-time data.
All your responses must be based only on your internal knowledge and the conversation context.
Your tone: helpful, calm, and professional.
"""
# ------------- WRAPPER FUNCTION ----------------
# Needed so the model selected in the UI can be passed to respond() in agent.py
def app_respond(message, history, system_message, model_choice="Base: Llama 3.2 3B Instruct (q4_k_m)"):
    """
    Wrapper used by Gradio.
    - model_choice: string from the dropdown (key in AVAILABLE_MODELS)
    """
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        # Fallback: first model in dict
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Delegate to the core agent logic (which expects an llm object)
    for chunk in respond(message, history, system_message, llm):
        yield chunk
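
# Note: respond() (defined in agent.py) is assumed to be a generator that yields the
# accumulated reply text so far; app_respond just forwards each chunk to Gradio.
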
# ------------- Fast path: no agent, just a single LLM call ----------------
def respond_fast(message, history, system_message, model_choice):
    """
    Fast path: no tools, no agent. Just a single LLM call with the
    given system message and chat history.
    """
    # Pick model from dropdown
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Build a simple chat-style prompt
    prompt = build_prompt(system_message, history, message)

    # Single streaming generation
    stream = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "System:"],
        stream=True,
    )

    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial
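
# Example usage outside Gradio (illustrative only): respond_fast is a generator that
# yields the accumulated text after each token, which is the streaming contract
# gr.ChatInterface expects.
#
# for partial_text in respond_fast("Hi!", [], SYSTEM_MESSAGE_GENERAL, next(iter(AVAILABLE_MODELS))):
#     print(partial_text)
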
# ------------- GRADIO UI ----------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Finetuned Llama 3.2 3B (CPU, GGUF) in an Agentic Framework\n"
        "Switch between a general assistant and a live weather assistant."
    )

    with gr.Tabs():
        # -------- TAB 1: GENERAL LLM ASSISTANT --------
        with gr.Tab("💬 General Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### General Assistant\n"
                        "Chat with the base or fine-tuned model. Use this mode for any kind of question."
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )
                with gr.Column(scale=3, elem_id="general-chat"):
                    general_chatbot = gr.ChatInterface(
                        fn=respond_fast,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_GENERAL),
                            model_dropdown,
                        ],
                    )
        # -------- TAB 2: LIVE WEATHER ASSISTANT --------
        with gr.Tab("☀️ LIVE Weather Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### Live Weather Assistant\n"
                        "Fetches up-to-date weather data."
                    )
                    weather_model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )
                with gr.Column(scale=3, elem_id="weather-chat"):
                    # 🌤️ ASSISTANT HEADER (name + avatar + tagline)
                    gr.HTML(
                        """
                        <div style="
                            display: flex;
                            align-items: center;
                            gap: 15px;
                            padding: 12px 16px;
                            border-radius: 12px;
                            margin-bottom: 10px;
                        ">
                            <!-- Avatar circle -->
                            <div style="
                                width: 64px;
                                height: 64px;
                                border-radius: 50%;
                                background: radial-gradient(circle at 30% 30%, #facc15, #eab308, #ca8a04);
                                display: flex;
                                align-items: center;
                                justify-content: center;
                                font-weight: 700;
                                font-size: 26px;
                                color: #1f2937;
                                box-shadow: 0 4px 10px rgba(0,0,0,0.15);
                            ">
                                M
                            </div>
                            <!-- Name + description -->
                            <div>
                                <div style="font-size: 20px; font-weight: 700; color: #333;">
                                    Meteo-Mila
                                </div>
                                <div style="font-size: 14px; color: #555; margin-top: 2px;">
                                    I know everything about the current weather and temperature.<br>
                                    I can also provide forecasts into the future! 🌦️
                                </div>
                            </div>
                        </div>
                        """
                    )
                    weather_chatbot = gr.ChatInterface(
                        fn=app_respond,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_WEATHER),
                            weather_model_dropdown,
                        ],
                    )
if __name__ == "__main__":
    demo.launch()