DipakKuma committed (verified)
Commit bf4458b · 1 Parent(s): 91cbc1a

Update app.py

Files changed (1)
  1. app.py +54 -56
app.py CHANGED
@@ -1,17 +1,7 @@
 # app.py
 """
 Gemma3 (GGUF) - Gradio Space app (fallback-ready)
-
-Behavior:
-- If llama-cpp-python is available and a local .gguf model_path is provided, it will use local inference.
-- Otherwise, it will fallback to Hugging Face Inference API (requires HUGGINGFACE_HUB_TOKEN for private models).
-- Designed to run on Hugging Face Spaces (CPU) as a frontend-only if llama-cpp-python cannot be built.
-
-Environment variables (optional):
-- MODEL_REPO: HF repo id that contains the .gguf or hosted model (e.g. "your-user/gemma-3-4b-gguf")
-- GGUF_PATH: local path to a .gguf file inside the Space (if you uploaded it)
-- HUGGINGFACE_HUB_TOKEN: needed for private HF model access via InferenceClient
-- HF_INFERENCE_MODEL: model id used by Inference API (if different from MODEL_REPO)
+Updated: fix for Hugging Face InferenceClient.text_generation() signature
 """
 
 import os
@@ -24,12 +14,11 @@ import gradio as gr
 # Try to import llama-cpp-python (native) — may fail in Spaces build
 # -------------------------------------------------------------------------
 LLAMA_AVAILABLE = False
-llama = None
+llm = None
 try:
     from llama_cpp import Llama
     LLAMA_AVAILABLE = True
 except Exception as e:
-    # Import failed (likely build/compile issue). We'll fallback.
     print("llama-cpp-python not available:", e)
     LLAMA_AVAILABLE = False
 
@@ -40,7 +29,7 @@ HF_AVAILABLE = False
 hf_client = None
 try:
     from huggingface_hub import InferenceClient
-    # If user provided token as secret/environment, InferenceClient will pick it up automatically
+    # InferenceClient will pick HUGGINGFACE_HUB_TOKEN from env if set
     hf_client = InferenceClient()
     HF_AVAILABLE = True
 except Exception as e:
@@ -52,32 +41,24 @@ except Exception as e:
 # -------------------------------------------------------------------------
 MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
 GGUF_PATH = os.environ.get("GGUF_PATH", None) # if the gguf is uploaded to the Space
-HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", MODEL_REPO)
-# Tune these defaults if needed
+HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "") # optional override for HF inference model id
 DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
 DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
 
 # -------------------------------------------------------------------------
 # If llama-cpp available and a GGUF path is provided (or MODEL_REPO downloaded), load model
 # -------------------------------------------------------------------------
-llm = None
 if LLAMA_AVAILABLE:
     try:
-        # Prefer an explicitly uploaded GGUF in the repo (GGUF_PATH), else try MODEL_REPO download path (if available)
-        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf") # common upload path in Spaces
-        # If GGUF_PATH not set and model repo id provided, snapshot_download could be used,
-        # but many Spaces avoid heavy downloads at runtime; keep simple for now.
+        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
         if GGUF_PATH and os.path.exists(GGUF_PATH):
             model_path_to_try = GGUF_PATH
         elif os.path.exists(model_path_to_try):
-            # ok
             pass
         else:
-            # No local gguf found; do not attempt to load a non-existent file
             raise FileNotFoundError(f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}). Set GGUF_PATH or upload the .gguf file into the Space.")
 
         print("Loading local model via llama-cpp-python from:", model_path_to_try)
-        # tune n_ctx and n_threads to Space limits (reduce if OOM)
         llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
         print("Loaded local model successfully.")
     except Exception as e:
@@ -90,49 +71,76 @@ if LLAMA_AVAILABLE:
 # Helper functions for inference
 # -------------------------------------------------------------------------
 def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
-    """Generate text using llama-cpp-python Llama instance (local GGUF)."""
     if not llm:
         return "Local model not loaded."
     try:
         resp = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
-        # llama-cpp-python returns a dict with choices list
         return resp["choices"][0]["text"]
     except Exception as e:
         print("Error in local_generate:", e)
         return f"Local generation error: {e}"
 
 def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
-    """Generate text using Hugging Face Inference API (InferenceClient)."""
+    """
+    Corrected HF usage:
+    - Pass prompt as positional first arg to text_generation()
+    - Use max_new_tokens (not max_tokens)
+    - Optionally pass model=HF_INFERENCE_MODEL if set
+    """
     if not HF_AVAILABLE or hf_client is None:
         return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."
+
     try:
-        # InferenceClient.text_generation returns a list/dict depending on package versions.
-        # We request the model given by HF_INFERENCE_MODEL
-        # Use parameters mapping common with HF: max_new_tokens, temperature
-        raw = hf_client.text_generation(
-            model=HF_INFERENCE_MODEL,
-            inputs=prompt,
-            max_new_tokens=max_tokens,
-            temperature=temperature
-        )
-        # raw often is a list of dicts: [{"generated_text": "..."}] or {"generated_text": "..."}
+        kwargs = {
+            "max_new_tokens": int(max_tokens),
+            "temperature": float(temperature),
+            # you can also set stream=True or details=True if desired
+        }
+        # include model override only if provided (avoid passing empty string)
+        if HF_INFERENCE_MODEL:
+            kwargs["model"] = HF_INFERENCE_MODEL
+
+        # NOTE: text_generation expects the prompt as first positional arg.
+        raw = hf_client.text_generation(prompt, **kwargs)
+
+        # raw may be:
+        # - a simple string with generated text,
+        # - a TextGenerationOutput object (dataclass-like) or dict,
+        # - a list containing dict(s) depending on version/backends
+        # Normalize to a string response:
+        # case: simple str
+        if isinstance(raw, str):
+            return raw
+
+        # case: list (e.g., [{"generated_text": "..."}])
         if isinstance(raw, list) and len(raw) > 0:
-            # prefer "generated_text" key
             first = raw[0]
             if isinstance(first, dict):
-                return first.get("generated_text") or first.get("generated_text", str(first))
+                # prefer keys commonly returned
+                return first.get("generated_text") or first.get("text") or str(first)
             return str(first)
-        elif isinstance(raw, dict):
-            return raw.get("generated_text") or str(raw)
-        else:
-            return str(raw)
+
+        # case: object with attribute generated_text or dict-like
+        if hasattr(raw, "generated_text"):
+            return getattr(raw, "generated_text")
+        if isinstance(raw, dict):
+            # try common keys
+            return raw.get("generated_text") or raw.get("text") or str(raw)
+
+        # fallback to string conversion
+        return str(raw)
+
+    except TypeError as te:
+        # common mistake: wrong kw names (we tried to guard this), print helpful msg
+        print("TypeError from hf_client.text_generation:", te)
+        print(traceback.format_exc())
+        return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
     except Exception as e:
         print("HF generation error:", e)
         print(traceback.format_exc())
        return f"Hugging Face generation error: {e}"
 
 def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
-    """Unified generate entry-point used by the UI."""
     prompt = (prompt or "").strip()
     if not prompt:
         return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
@@ -142,25 +150,19 @@ def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: flo
     elif HF_AVAILABLE and hf_client:
         return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
     else:
-        # Neither local nor HF available
         return (
             "No model runtime is available.\n\n"
            "Options:\n"
            "1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
-            " OR ensure a local gguf file exists at the default upload path.\n"
            "2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
-            "3) Remove llama-cpp-python from requirements if its build is failing and rely solely on HF Inference.\n\n"
-            "Check Space logs for more details."
        )
 
 # -------------------------------------------------------------------------
-# Gradio UI - Desi-friendly simple layout (works with Gradio 5.x)
+# Gradio UI
 # -------------------------------------------------------------------------
 title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
 description_text = """
 **Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
-- If you want purely local inference in the Space, upload the `.gguf` file and set `GGUF_PATH` to that path.
-- If using HF Inference, set `HUGGINGFACE_HUB_TOKEN` (secret) and `HF_INFERENCE_MODEL` as needed.
 """
 
 with gr.Blocks(title=title_text) as demo:
@@ -178,18 +180,14 @@ with gr.Blocks(title=title_text) as demo:
     status_md = gr.Markdown(
         f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
         f"- MODEL_REPO: `{MODEL_REPO}`\n"
-        f"- HF model (inference): `{HF_INFERENCE_MODEL}`\n"
+        f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
     )
     tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM. Upload a smaller Q4 quantized GGUF for Spaces.")
 
     output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
-
-    # Hook up
     submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])
 
-# If run as main (local dev)
 if __name__ == "__main__":
-    # Useful debug info:
     print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
     print("HF_AVAILABLE:", HF_AVAILABLE)
     print("MODEL_REPO:", MODEL_REPO)