import subprocess
import sys

# Install llama-cpp-python at runtime so it builds against the current OS (glibc)
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "llama-cpp-python==0.3.16"],
    check=True,
)

# ---------------- CONFIG ----------------
BASE_REPO_ID = "unsloth/Llama-3.2-3B-Instruct-GGUF"
BASE_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

FT_REPO_ID = "JoarP/Llama-3.2-3B-FineTome5K-gguf"
FT_FILENAME = "v1"

N_CTX = 2048
N_THREADS = None

# Imported only after the pip install above, so llama_cpp is guaranteed to exist
import gradio as gr
from agent import respond, build_prompt
from llama_cpp import Llama

# ------------- LOAD MODELS ON CPU --------------
print("Loading fine-tuned model...")
llm_ft = Llama.from_pretrained(
    repo_id=FT_REPO_ID,
    filename=FT_FILENAME,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
)

AVAILABLE_MODELS = {
    "Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)": llm_ft,
}

# ------------- Using one model for faster deployment during development -------------
# print("Loading base model...")
# llm_base = Llama.from_pretrained(
#     repo_id=BASE_REPO_ID,
#     filename=BASE_FILENAME,
#     n_ctx=N_CTX,
#     n_threads=N_THREADS,
# )
# AVAILABLE_MODELS = {
#     "Base: Llama 3.2 3B Instruct (q4_k_m)": llm_base,
# }

# try:
#     print("Attempting to load fine-tuned model...")
#     llm_ft = Llama.from_pretrained(
#         repo_id=FT_REPO_ID,
#         filename=FT_FILENAME,
#         n_ctx=N_CTX,
#         n_threads=N_THREADS,
#     )
#     AVAILABLE_MODELS["Fine-tuned: Llama 3.2 3B FineTome (q4_k_m)"] = llm_ft
#     FT_LOAD_ERROR = None
# except Exception as e:
#     llm_ft = None
#     FT_LOAD_ERROR = str(e)
#     print(f"Could not load fine-tuned model yet: {e}")

# System messages:
SYSTEM_MESSAGE_WEATHER = """
You are a helpful assistant that answers user questions using any external
information provided in the system message.

The system message may include a section like:

  "You have executed the following tools (name, args, result):"

followed by one or more lines of the form:

  - tool_name(args_dict) -> result_value

Instructions:
- Treat these tool results as ground truth for the current reply.
- Use them to give a clear, concise, and friendly answer to the user's latest question.
- Do not repeat the raw tool logs verbatim unless it is natural to do so.
- You may summarize or rephrase the results in natural language.
- If multiple results are present, combine them into a single coherent answer.
- If no tool results are present, answer the question based on your own knowledge and the conversation history.
- Do not mention that you are using "tools" or "tool calls"; just speak as a normal assistant.

=== EXAMPLE ===
System (excerpt):
You have executed the following tools (name, args, result):
- get_temperature({'location': 'Berlin'}) -> 20
- get_weather({'location': 'Berlin'}) -> sunny

User: What is it like in Berlin right now?

Assistant: It's sunny in Berlin right now, with a temperature of about 20 degrees.
"""

SYSTEM_MESSAGE_GENERAL = """
You are a friendly, helpful, and knowledgeable AI assistant.

Your goals:
- Give clear, accurate, and concise answers.
- Be honest when you don't know something.
- Use the conversation history to stay consistent.
- Ask clarifying questions when the user's request is ambiguous.
- Avoid unnecessary repetition or overly long explanations.
- Be polite, neutral, and informative.

You can answer questions on any topic, including:
- general knowledge
- mathematics and reasoning
- writing and summarization
- programming and debugging
- everyday advice and explanations

Do not claim access to external tools, APIs, the internet, or real-time data.
All your responses must be based only on your internal knowledge and the
conversation context.

Your tone: helpful, calm, and professional.
"""
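
# For reference: a minimal sketch of the tool-log block that SYSTEM_MESSAGE_WEATHER
# expects the agent to append to the system message. This is an assumption about
# how agent.py formats its results (the real formatting lives there); it is shown
# only to make the prompt contract above concrete:
#
#   tool_results = [
#       ("get_temperature", {"location": "Berlin"}, 20),
#       ("get_weather", {"location": "Berlin"}, "sunny"),
#   ]
#   tool_log = "You have executed the following tools (name, args, result):\n" + "\n".join(
#       f"- {name}({args}) -> {result}" for name, args, result in tool_results
#   )
#   system_with_tools = SYSTEM_MESSAGE_WEATHER + "\n" + tool_log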

# ------------- WRAPPER FUNCTION ----------------
# Needed to be able to pass the llm to respond() inside agent.py
def app_respond(message, history, system_message, model_choice="Base: Llama 3.2 3B Instruct (q4_k_m)"):
    """
    Wrapper used by Gradio.
    - model_choice: string from the dropdown (key in AVAILABLE_MODELS)
    """
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        # Fallback: first model in dict
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Delegate to the core agent logic (which expects an llm object)
    for chunk in respond(message, history, system_message, llm):
        yield chunk


# ------------- No agent, just a single LLM call ----------------
def respond_fast(message, history, system_message, model_choice):
    """
    Fast path: no tools, no agent.
    Just a single LLM call with the given system message and chat history.
    """
    # Pick model from dropdown
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    # Build a simple chat-style prompt
    prompt = build_prompt(system_message, history, message)

    # Single streaming generation
    stream = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["User:", "System:"],
        stream=True,
    )

    # Yield the accumulated text so Gradio renders a growing response
    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial
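
# agent.py owns the actual prompt layout. The sketch below is an assumption,
# inferred only from the stop tokens ["User:", "System:"] used in respond_fast
# above and from the call site build_prompt(system_message, history, message).
# It is illustrative, not the real implementation:
#
#   def build_prompt(system_message, history, message):
#       lines = [f"System: {system_message}"]
#       for user_msg, bot_msg in history:
#           lines.append(f"User: {user_msg}")
#           lines.append(f"Assistant: {bot_msg}")
#       lines.append(f"User: {message}")
#       lines.append("Assistant:")
#       return "\n".join(lines)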

# ------------- GRADIO UI ----------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Finetuned Llama 3.2 3B (CPU, GGUF) in an Agentic Framework\n"
        "Switch between a general assistant and a live weather assistant."
    )

    with gr.Tabs():
        # -------- TAB 1: GENERAL LLM ASSISTANT --------
        with gr.Tab("💬 General Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### General Assistant\n"
                        "Chat with the base or fine-tuned model. Use this mode for any kind of question."
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )

                with gr.Column(scale=3, elem_id="general-chat"):
                    general_chatbot = gr.ChatInterface(
                        fn=respond_fast,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_GENERAL),
                            model_dropdown,
                        ],
                    )

        # -------- TAB 2: LIVE WEATHER ASSISTANT --------
        with gr.Tab("☀️ LIVE Weather Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### Live Weather Assistant\n"
                        "Fetches up-to-date weather data."
                    )
                    weather_model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )

                with gr.Column(scale=3, elem_id="weather-chat"):
                    # 🌤️ ASSISTANT HEADER (name + avatar + tagline)
                    gr.HTML(
                        """
                        <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 8px;">
                            <div style="width: 48px; height: 48px; border-radius: 50%;
                                        background: #4a90d9; color: white;
                                        display: flex; align-items: center; justify-content: center;
                                        font-size: 24px; font-weight: bold;">M</div>
                            <div>
                                <div style="font-weight: bold; font-size: 18px;">Meteo-Mila</div>
                                <div style="font-size: 14px;">
                                    I know everything about the current weather and temperature.<br>
                                    I can also provide forecasts into the future! 🌦️
                                </div>
                            </div>
                        </div>
                        """
                    )

                    weather_chatbot = gr.ChatInterface(
                        fn=app_respond,
                        additional_inputs=[
                            gr.State(SYSTEM_MESSAGE_WEATHER),
                            weather_model_dropdown,
                        ],
                    )


if __name__ == "__main__":
    demo.launch()