DipakKuma committed
Commit 853c66a · verified · 1 Parent(s): 2914262

Update app.py

Files changed (1)
  1. app.py +73 -37
app.py CHANGED
@@ -1,42 +1,78 @@
- # app.py (Gradio Space)
- from llama_cpp import Llama
- from huggingface_hub import snapshot_download
  import os
  import gradio as gr

- # ===== SETTINGS =====
- REPO_ID = "lmstudio-community/gemma-3-4b-it-GGUF" # or your HF repo id
- FILENAME = "gemma-3-4b-it-Q8_0.gguf" # the name must be exact
- CACHE_DIR = "/tmp/model_cache" # ephemeral dir in the Space
-
- os.environ.setdefault("HF_HOME", "/tmp/hf_home")
- os.environ.setdefault("LLAMA_CACHE", CACHE_DIR) # llama.cpp caching
-
- # OPTIONAL: download model into cache (snapshot_download uses HF_TOKEN automatically via secrets)
- print("Downloading model (if needed)...")
- repo_path = snapshot_download(repo_id=REPO_ID, cache_dir=CACHE_DIR)
- model_file_path = os.path.join(repo_path, FILENAME)
- print("Model file path:", model_file_path)
-
- # Load llama.cpp model (from file in repo_path)
- llm = Llama.from_pretrained(repo_id=REPO_ID, filename=FILENAME) # llama-cpp-python helper
-
- def chat(prompt, max_tokens=256, temp=0.2):
-     resp = llm.create_chat_completion(
-         messages=[{"role":"system","content":"You are a helpful assistant."},
-                   {"role":"user","content": prompt}],
-         max_tokens=max_tokens,
-         temperature=temp,
-         stop=["<eos>"],
-     )
-     return resp["choices"][0]["message"]["content"]
-
- with gr.Blocks() as demo:
-     gr.Markdown("## Gemma-3-4b-it (GGUF) demo")
-     txt = gr.Textbox(lines=4, placeholder="તમારો પ્રોમ્પ્ટ લખો...")
-     out = gr.Textbox(lines=12)
-     btn = gr.Button("Run")
-     btn.click(fn=lambda p: chat(p), inputs=[txt], outputs=[out])

  if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)
 
+ # app.py
  import os
+ import time
+ from huggingface_hub import snapshot_download
+ from llama_cpp import Llama
  import gradio as gr

+ # -------------------------
+ # CONFIG - edit if needed
+ # -------------------------
+ # repo id on HF that contains your .gguf file (public repo recommended for Spaces)
+ # Examples you might try: "your-username/gemma-3-4b-it-q4_0-gguf" or "google/gemma-3-4b-it-qat-q4_0-gguf"
+ HF_MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
+ GGUF_FILENAME = os.environ.get("GGUF_FILENAME", None)  # optional if multiple files
+
+ # where to cache the model inside the Space runtime
+ MODEL_CACHE_DIR = "/tmp/model_cache"
+
+ os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
+
+ def download_gguf(repo_id, local_dir, allow_patterns=None):
+     "Download GGUF from HF hub (uses snapshot_download). Returns local path to .gguf"
+     print(f"Downloading model from {repo_id} to {local_dir} ... (may take a while)")
+     # allow_patterns can be like ["*.gguf"]
+     path = snapshot_download(repo_id, cache_dir=local_dir, allow_patterns=allow_patterns)
+     # find gguf file
+     for root, _, files in os.walk(path):
+         for f in files:
+             if f.endswith(".gguf"):
+                 local_path = os.path.join(root, f)
+                 print("Found gguf:", local_path)
+                 return local_path
+     raise FileNotFoundError("No .gguf file found in repository. Please set MODEL_REPO to the HF repo containing the .gguf.")
+
+ # Download model on first run (cached across runs in the Space underlying VM while active)
+ MODEL_PATH = None
+ try:
+     MODEL_PATH = download_gguf(HF_MODEL_REPO, MODEL_CACHE_DIR, allow_patterns=["*.gguf"])
+ except Exception as e:
+     print("Model download failed:", e)
+
+ llm = None
+ if MODEL_PATH:
+     try:
+         print("Loading model with llama-cpp-python:", MODEL_PATH)
+         # tweak n_ctx / n_threads if you hit memory/CPU limits
+         llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
+         print("Model loaded.")
+     except Exception as e:
+         print("Failed to load model in llama-cpp-python:", e)
+         llm = None
+
+ def generate(prompt, max_tokens=256, temp=0.8):
+     if llm is None:
+         return "Model not loaded. Check logs or try smaller model / upgrade Space hardware."
+     try:
+         # llama-cpp-python create_completion usage
+         out = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temp)
+         return out["choices"][0]["text"]
+     except Exception as e:
+         return f"Error during inference: {e}"
+
+ # -------------------
+ # Gradio UI (Desi-ish)
+ # -------------------
+ with gr.Blocks(title="Gemma3 (GGUF) - Desi Chat") as demo:
+     gr.Markdown("## 💎 Gemma3 (GGUF) — Hugging Face Space (Free tier friendly)")
+     with gr.Row():
+         inp = gr.Textbox(lines=4, label="તમારો પ્રશ્ન (Gujarati/English)", placeholder="હેલો, મોજી કઈ રીતે છે?")
+         with gr.Column(scale=1):
+             max_t = gr.Slider(label="Max tokens", minimum=16, maximum=1024, value=256, step=16)
+             temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, value=0.8, step=0.1)
+             btn = gr.Button("જવાબ આપો")
+     out = gr.Textbox(label="જવાબ", lines=8)
+     btn.click(fn=generate, inputs=[inp, max_t, temp], outputs=out)

  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
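Besides the standard library, the updated app.py only imports gradio, huggingface_hub and llama-cpp-python. The sketch below is not part of the commit: it is a minimal local smoke test of the same download-then-load path, assuming those packages are installed, the default MODEL_REPO from app.py is reachable (Gemma repos may require accepting the model license or an HF token), and the machine has enough RAM for the Q4_0 weights.

# local_smoke_test.py (hypothetical helper, not in this commit)
import os

from huggingface_hub import snapshot_download
from llama_cpp import Llama

CACHE_DIR = "/tmp/model_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Pull only the .gguf weights, mirroring download_gguf() in app.py.
snapshot_path = snapshot_download(
    "google/gemma-3-4b-it-qat-q4_0-gguf",  # same default as MODEL_REPO in app.py
    cache_dir=CACHE_DIR,
    allow_patterns=["*.gguf"],
)
# First .gguf found in the snapshot; raises StopIteration if none is present.
gguf_path = next(
    os.path.join(root, f)
    for root, _, files in os.walk(snapshot_path)
    for f in files
    if f.endswith(".gguf")
)
print("Using gguf:", gguf_path)

# Same loader settings the Space uses; raise n_threads if more CPU cores are available.
llm = Llama(model_path=gguf_path, n_ctx=2048, n_threads=2)
out = llm.create_completion(prompt="Say hello in Gujarati.", max_tokens=32, temperature=0.8)
print(out["choices"][0]["text"])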