import subprocess
import sys
# Install llama-cpp-python at runtime so it builds against the current OS (glibc)
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "llama-cpp-python==0.3.16"],
    check=True,
)
# ---------------- CONFIG ----------------
BASE_REPO_ID = "unsloth/Llama-3.2-3B-Instruct-GGUF"
BASE_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
FT_REPO_ID = "JoarP/Llama-3.2-3B-FineTome5K-gguf"
FT_FILENAME = "v1"
N_CTX = 2048
N_THREADS = None
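# N_THREADS = None is assumed to let llama-cpp-python fall back to its default
# thread count for the host CPU.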
import gradio as gr
from agent import respond, build_prompt
from llama_cpp import Llama
# ------------- LOAD MODELS ON CPU --------------
print("Loading finetuned model")
llm_ft = Llama.from_pretrained(
    repo_id=FT_REPO_ID,
    filename=FT_FILENAME,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
)
AVAILABLE_MODELS = {
    "Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)": llm_ft,
}
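# AVAILABLE_MODELS maps each dropdown label to a loaded Llama instance;
# adding an entry here exposes another model in the UI.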
# ------------- Using one model for faster deployment during development -------------------
# print("Loading base model...")
# llm_base = Llama.from_pretrained(
# repo_id=BASE_REPO_ID,
# filename=BASE_FILENAME,
# n_ctx=N_CTX,
# n_threads=N_THREADS,
# )
# AVAILABLE_MODELS = {
# "Base: Llama 3.2 3B Instruct (q4_k_m)": llm_base,
# }
# try:
# print("Attempting to load fine-tuned model...")
# llm_ft = Llama.from_pretrained(
# repo_id=FT_REPO_ID,
# filename=FT_FILENAME,
# n_ctx=N_CTX,
# n_threads=N_THREADS,
# )
# AVAILABLE_MODELS["Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)"] = llm_ft
# FT_LOAD_ERROR = None
# except Exception as e:
# llm_ft = None
# FT_LOAD_ERROR = str(e)
# print(f"Could not load fine-tuned model yet: {e}")
# ------------- SYSTEM MESSAGES ----------------
SYSTEM_MESSAGE_WEATHER = """
You are a helpful assistant that answers user questions using any external information provided in the system message.
The system message may include a section like:
"You have executed the following tools (name, args, result):"
followed by one or more lines of the form:
- tool_name(args_dict) -> result_value
Instructions:
- Treat these tool results as ground truth for the current reply.
- Use them to give a clear, concise, and friendly answer to the user's latest question.
- Do not repeat the raw tool logs verbatim unless it is natural to do so.
- You may summarize or rephrase the results in natural language.
- If multiple results are present, combine them into a single coherent answer.
- If no tool results are present, answer the question based on your own knowledge and the conversation history.
- Do not mention that you are using “tools” or “tool calls”; just speak as a normal assistant.
=== EXAMPLE ===
System (excerpt):
You have executed the following tools (name, args, result):
- get_temperature({'location': 'Berlin'}) -> 20
- get_weather({'location': 'Berlin'}) -> sunny
User:
What is it like in Berlin right now?
Assistant:
It's sunny in Berlin right now, with a temperature of about 20 degrees.
"""
SYSTEM_MESSAGE_GENERAL = """
You are a friendly, helpful, and knowledgeable AI assistant.
Your goals:
- Give clear, accurate, and concise answers.
- Be honest when you don't know something.
- Use the conversation history to stay consistent.
- Ask clarifying questions when the user’s request is ambiguous.
- Avoid unnecessary repetition or overly long explanations.
- Be polite, neutral, and informative.
You can answer questions on any topic, including:
- general knowledge
- mathematics and reasoning
- writing and summarization
- programming and debugging
- everyday advice and explanations
Do not claim access to external tools, APIs, the internet, or real-time data.
All your responses must be based only on your internal knowledge and the conversation context.
Your tone: helpful, calm, and professional.
"""
# ------------- WRAPPER FUNCTION ----------------
# Needed so the model selected in the UI can be passed to respond() in agent.py
def app_respond(message, history, system_message, model_choice="Base: Llama 3.2 3B Instruct (q4_k_m)"):
    """
    Wrapper used by Gradio.
    - model_choice: string from the dropdown (key in AVAILABLE_MODELS)
    """
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        # Fallback: first model in dict
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Delegate to the core agent logic (which expects an llm object)
    for chunk in respond(message, history, system_message, llm):
        yield chunk
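
# Note: respond() (defined in agent.py) is assumed to be a generator that yields the
# accumulated reply text so far; app_respond just forwards each chunk to Gradio.
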
# ------------- Fast path: no agent, just a single LLM call ----------------
def respond_fast(message, history, system_message, model_choice):
    """
    Fast path: no tools, no agent. Just a single LLM call with the
    given system message and chat history.
    """
    # Pick model from dropdown
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Build a simple chat-style prompt
    prompt = build_prompt(system_message, history, message)

    # Single streaming generation
    stream = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "System:"],
        stream=True,
    )

    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial
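
# Example usage outside Gradio (illustrative only): respond_fast is a generator that
# yields the accumulated text after each token, which is the streaming contract
# gr.ChatInterface expects.
#
# for partial_text in respond_fast("Hi!", [], SYSTEM_MESSAGE_GENERAL, next(iter(AVAILABLE_MODELS))):
#     print(partial_text)
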
# ------------- GRADIO UI ----------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Finetuned Llama 3.2 3B (CPU, GGUF) in an Agentic Framework\n"
        "Switch between a general assistant and a live weather assistant."
    )

    with gr.Tabs():
        # -------- TAB 1: GENERAL LLM ASSISTANT --------
        with gr.Tab("💬 General Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### General Assistant\n"
                        "Chat with the base or fine-tuned model. Use this mode for any kind of question."
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )
                with gr.Column(scale=3, elem_id="general-chat"):
                    general_chatbot = gr.ChatInterface(
                        fn=respond_fast,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_GENERAL),
                            model_dropdown,
                        ],
                    )
        # -------- TAB 2: LIVE WEATHER ASSISTANT --------
        with gr.Tab("☀️ LIVE Weather Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### Live Weather Assistant\n"
                        "Fetches up-to-date weather data."
                    )
                    weather_model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )
                with gr.Column(scale=3, elem_id="weather-chat"):
                    # 🌤️ ASSISTANT HEADER (name + avatar + tagline)
                    gr.HTML(
                        """
                        <div style="
                            display: flex;
                            align-items: center;
                            gap: 15px;
                            padding: 12px 16px;
                            border-radius: 12px;
                            margin-bottom: 10px;
                        ">
                            <!-- Avatar circle -->
                            <div style="
                                width: 64px;
                                height: 64px;
                                border-radius: 50%;
                                background: radial-gradient(circle at 30% 30%, #facc15, #eab308, #ca8a04);
                                display: flex;
                                align-items: center;
                                justify-content: center;
                                font-weight: 700;
                                font-size: 26px;
                                color: #1f2937;
                                box-shadow: 0 4px 10px rgba(0,0,0,0.15);
                            ">
                                M
                            </div>
                            <!-- Name + description -->
                            <div>
                                <div style="font-size: 20px; font-weight: 700; color: #333;">
                                    Meteo-Mila
                                </div>
                                <div style="font-size: 14px; color: #555; margin-top: 2px;">
                                    I know everything about the current weather and temperature.<br>
                                    I can also provide forecasts into the future! 🌦️
                                </div>
                            </div>
                        </div>
                        """
                    )
                    weather_chatbot = gr.ChatInterface(
                        fn=app_respond,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_WEATHER),
                            weather_model_dropdown,
                        ],
                    )
if __name__ == "__main__":
    demo.launch()