# Install a pinned llama-cpp-python at startup (a common pattern on
# Hugging Face Spaces, where the dependency is installed from the app itself).
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "llama-cpp-python==0.3.16"],
    check=True,
)

# Base (instruction-tuned, not fine-tuned) GGUF checkpoint.
BASE_REPO_ID = "unsloth/Llama-3.2-3B-Instruct-GGUF"
BASE_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# Fine-tuned GGUF checkpoint.
FT_REPO_ID = "JoarP/Llama-3.2-3B-FineTome5K-gguf"
FT_FILENAME = "v1"  # filename (or glob pattern) matched inside the repo

N_CTX = 2048      # context window, in tokens
N_THREADS = None  # None lets llama.cpp pick a thread count automatically

import gradio as gr
from agent import respond, build_prompt
from llama_cpp import Llama

print("Loading finetuned model") |
|
|
llm_ft = Llama.from_pretrained( |
|
|
repo_id=FT_REPO_ID, |
|
|
filename=FT_FILENAME, |
|
|
n_ctx=N_CTX, |
|
|
n_threads=N_THREADS, |
|
|
) |
|
|
AVAILABLE_MODELS = { |
|
|
"Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)": llm_ft, |
|
|
} |
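# Optional smoke test (a minimal sketch, not part of the app flow): a single
# non-streaming completion to confirm that a model loads and generates.
# Uncomment to try it locally; the prompt is illustrative only.
# _out = llm_ft("Q: What is the capital of France? A:", max_tokens=16)
# print(_out["choices"][0]["text"])
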
SYSTEM_MESSAGE_WEATHER = """
You are a helpful assistant that answers user questions using any external information provided in the system message.

The system message may include a section like:
"You have executed the following tools (name, args, result):"
followed by one or more lines of the form:
- tool_name(args_dict) -> result_value

Instructions:
- Treat these tool results as ground truth for the current reply.
- Use them to give a clear, concise, and friendly answer to the user's latest question.
- Do not repeat the raw tool logs verbatim unless it is natural to do so.
- You may summarize or rephrase the results in natural language.
- If multiple results are present, combine them into a single coherent answer.
- If no tool results are present, answer the question based on your own knowledge and the conversation history.
- Do not mention that you are using "tools" or "tool calls"; just speak as a normal assistant.

=== EXAMPLE ===

System (excerpt):
You have executed the following tools (name, args, result):
- get_temperature({'location': 'Berlin'}) -> 20
- get_weather({'location': 'Berlin'}) -> sunny

User:
What is it like in Berlin right now?

Assistant:
It's sunny in Berlin right now, with a temperature of about 20 degrees.
"""
SYSTEM_MESSAGE_GENERAL = """
You are a friendly, helpful, and knowledgeable AI assistant.

Your goals:
- Give clear, accurate, and concise answers.
- Be honest when you don't know something.
- Use the conversation history to stay consistent.
- Ask clarifying questions when the user's request is ambiguous.
- Avoid unnecessary repetition or overly long explanations.
- Be polite, neutral, and informative.

You can answer questions on any topic, including:
- general knowledge
- mathematics and reasoning
- writing and summarization
- programming and debugging
- everyday advice and explanations

Do not claim access to external tools, APIs, the internet, or real-time data.
All your responses must be based only on your internal knowledge and the conversation context.

Your tone: helpful, calm, and professional.
"""

def app_respond(message, history, system_message, model_choice="Base: Llama 3.2 3B Instruct (q4_k_m)"):
    """
    Wrapper used by Gradio for the agentic (tool-calling) weather tab.

    - model_choice: string from the dropdown (a key in AVAILABLE_MODELS)
    """
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        # Unknown dropdown value: fall back to the first available model.
        llm = next(iter(AVAILABLE_MODELS.values()))

    # `respond` (from agent.py) may run tools, then streams partial replies.
    for chunk in respond(message, history, system_message, llm):
        yield chunk

def respond_fast(message, history, system_message, model_choice):
    """
    Fast path: no tools, no agent. Just a single LLM call with the
    given system message and chat history.
    """
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    prompt = build_prompt(system_message, history, message)

    stream = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "System:"],
        stream=True,
    )

    # Accumulate tokens and yield the growing reply so Gradio renders a stream.
    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial
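
# Driving respond_fast outside the UI (a sketch; the model key and message are
# illustrative): iterate the generator and keep the final yielded string.
# reply = ""
# for reply in respond_fast("Hello!", [], SYSTEM_MESSAGE_GENERAL,
#                           "Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)"):
#     pass
# print(reply)
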
with gr.Blocks() as demo:
    gr.Markdown(
        "# Finetuned Llama 3.2 3B (CPU, GGUF) in an Agentic Framework\n"
        "Switch between a general assistant and a live weather assistant."
    )

    with gr.Tabs():
        with gr.Tab("💬 General Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### General Assistant\n"
                        "Chat with the base or fine-tuned model. Use this mode for any kind of question."
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )

                with gr.Column(scale=3, elem_id="general-chat"):
                    general_chatbot = gr.ChatInterface(
                        fn=respond_fast,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_GENERAL),
                            model_dropdown,
                        ],
                    )

with gr.Tab("☀️ LIVE Weather Assistant"): |
|
|
with gr.Row(): |
|
|
with gr.Column(scale=1): |
|
|
gr.Markdown("### Live Weather Assistant\n" |
|
|
"Fetches up to date weather data" |
|
|
) |
|
|
model_dropdown = gr.Dropdown( |
|
|
label="Model", |
|
|
choices=list(AVAILABLE_MODELS.keys()), |
|
|
value=list(AVAILABLE_MODELS.keys())[0], |
|
|
interactive=True, |
|
|
) |
|
|
|
|
|
                with gr.Column(scale=3, elem_id="weather-chat"):
                    gr.HTML(
                        """
                        <div style="
                            display: flex;
                            align-items: center;
                            gap: 15px;
                            padding: 12px 16px;
                            border-radius: 12px;
                            margin-bottom: 10px;
                        ">
                            <!-- Avatar circle -->
                            <div style="
                                width: 64px;
                                height: 64px;
                                border-radius: 50%;
                                background: radial-gradient(circle at 30% 30%, #facc15, #eab308, #ca8a04);
                                display: flex;
                                align-items: center;
                                justify-content: center;
                                font-weight: 700;
                                font-size: 26px;
                                color: #1f2937;
                                box-shadow: 0 4px 10px rgba(0,0,0,0.15);
                            ">
                                M
                            </div>

                            <!-- Name + description -->
                            <div>
                                <div style="font-size: 20px; font-weight: 700; color: #333;">
                                    Meteo-Mila
                                </div>
                                <div style="font-size: 14px; color: #555; margin-top: 2px;">
                                    I know everything about the current weather and temperature.<br>
                                    I can also provide forecasts into the future! 🌦️
                                </div>
                            </div>
                        </div>
                        """
                    )

                    weather_chatbot = gr.ChatInterface(
                        fn=app_respond,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_WEATHER),
                            weather_model_dropdown,
                        ],
                    )

if __name__ == "__main__":
    demo.launch()
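    # When testing outside Spaces, a temporary public URL can be requested
    # with demo.launch(share=True) instead (a standard Gradio option).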