DipakKuma committed (verified)
Commit bf4458b · 1 Parent(s): 91cbc1a

Update app.py

Files changed (1)
  1. app.py +54 -56
app.py CHANGED
@@ -1,17 +1,7 @@
 # app.py
 """
 Gemma3 (GGUF) - Gradio Space app (fallback-ready)
-
-Behavior:
-- If llama-cpp-python is available and a local .gguf model_path is provided, it will use local inference.
-- Otherwise, it will fallback to Hugging Face Inference API (requires HUGGINGFACE_HUB_TOKEN for private models).
-- Designed to run on Hugging Face Spaces (CPU) as a frontend-only if llama-cpp-python cannot be built.
-
-Environment variables (optional):
-- MODEL_REPO: HF repo id that contains the .gguf or hosted model (e.g. "your-user/gemma-3-4b-gguf")
-- GGUF_PATH: local path to a .gguf file inside the Space (if you uploaded it)
-- HUGGINGFACE_HUB_TOKEN: needed for private HF model access via InferenceClient
-- HF_INFERENCE_MODEL: model id used by Inference API (if different from MODEL_REPO)
+Updated: fix for Hugging Face InferenceClient.text_generation() signature
 """
 
 import os
@@ -24,12 +14,11 @@ import gradio as gr
 # Try to import llama-cpp-python (native) — may fail in Spaces build
 # -------------------------------------------------------------------------
 LLAMA_AVAILABLE = False
-llama = None
+llm = None
 try:
     from llama_cpp import Llama
     LLAMA_AVAILABLE = True
 except Exception as e:
-    # Import failed (likely build/compile issue). We'll fallback.
     print("llama-cpp-python not available:", e)
     LLAMA_AVAILABLE = False
 
@@ -40,7 +29,7 @@ HF_AVAILABLE = False
 hf_client = None
 try:
     from huggingface_hub import InferenceClient
-    # If user provided token as secret/environment, InferenceClient will pick it up automatically
+    # InferenceClient will pick HUGGINGFACE_HUB_TOKEN from env if set
     hf_client = InferenceClient()
     HF_AVAILABLE = True
 except Exception as e:
@@ -52,32 +41,24 @@ except Exception as e:
 # -------------------------------------------------------------------------
 MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
 GGUF_PATH = os.environ.get("GGUF_PATH", None) # if the gguf is uploaded to the Space
-HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", MODEL_REPO)
-# Tune these defaults if needed
+HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "") # optional override for HF inference model id
 DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
 DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
 
 # -------------------------------------------------------------------------
 # If llama-cpp available and a GGUF path is provided (or MODEL_REPO downloaded), load model
 # -------------------------------------------------------------------------
-llm = None
 if LLAMA_AVAILABLE:
     try:
-        # Prefer an explicitly uploaded GGUF in the repo (GGUF_PATH), else try MODEL_REPO download path (if available)
-        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf") # common upload path in Spaces
-        # If GGUF_PATH not set and model repo id provided, snapshot_download could be used,
-        # but many Spaces avoid heavy downloads at runtime; keep simple for now.
+        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
         if GGUF_PATH and os.path.exists(GGUF_PATH):
             model_path_to_try = GGUF_PATH
         elif os.path.exists(model_path_to_try):
-            # ok
             pass
         else:
-            # No local gguf found; do not attempt to load a non-existent file
             raise FileNotFoundError(f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}). Set GGUF_PATH or upload the .gguf file into the Space.")
 
         print("Loading local model via llama-cpp-python from:", model_path_to_try)
-        # tune n_ctx and n_threads to Space limits (reduce if OOM)
         llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
         print("Loaded local model successfully.")
     except Exception as e:
@@ -90,49 +71,76 @@ if LLAMA_AVAILABLE:
 # Helper functions for inference
 # -------------------------------------------------------------------------
 def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
-    """Generate text using llama-cpp-python Llama instance (local GGUF)."""
     if not llm:
         return "Local model not loaded."
     try:
         resp = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
-        # llama-cpp-python returns a dict with choices list
         return resp["choices"][0]["text"]
     except Exception as e:
         print("Error in local_generate:", e)
         return f"Local generation error: {e}"
 
 def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
-    """Generate text using Hugging Face Inference API (InferenceClient)."""
+    """
+    Corrected HF usage:
+    - Pass prompt as positional first arg to text_generation()
+    - Use max_new_tokens (not max_tokens)
+    - Optionally pass model=HF_INFERENCE_MODEL if set
+    """
     if not HF_AVAILABLE or hf_client is None:
         return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."
+
     try:
-        # InferenceClient.text_generation returns a list/dict depending on package versions.
-        # We request the model given by HF_INFERENCE_MODEL
-        # Use parameters mapping common with HF: max_new_tokens, temperature
-        raw = hf_client.text_generation(
-            model=HF_INFERENCE_MODEL,
-            inputs=prompt,
-            max_new_tokens=max_tokens,
-            temperature=temperature
-        )
-        # raw often is a list of dicts: [{"generated_text": "..."}] or {"generated_text": "..."}
+        kwargs = {
+            "max_new_tokens": int(max_tokens),
+            "temperature": float(temperature),
+            # you can also set stream=True or details=True if desired
+        }
+        # include model override only if provided (avoid passing empty string)
+        if HF_INFERENCE_MODEL:
+            kwargs["model"] = HF_INFERENCE_MODEL
+
+        # NOTE: text_generation expects the prompt as first positional arg.
+        raw = hf_client.text_generation(prompt, **kwargs)
+
+        # raw may be:
+        # - a simple string with generated text,
+        # - a TextGenerationOutput object (dataclass-like) or dict,
+        # - a list containing dict(s) depending on version/backends
+        # Normalize to a string response:
+        # case: simple str
+        if isinstance(raw, str):
+            return raw
+
+        # case: list (e.g., [{"generated_text": "..."}])
         if isinstance(raw, list) and len(raw) > 0:
-            # prefer "generated_text" key
             first = raw[0]
             if isinstance(first, dict):
-                return first.get("generated_text") or first.get("generated_text", str(first))
+                # prefer keys commonly returned
+                return first.get("generated_text") or first.get("text") or str(first)
             return str(first)
-        elif isinstance(raw, dict):
-            return raw.get("generated_text") or str(raw)
-        else:
-            return str(raw)
+
+        # case: object with attribute generated_text or dict-like
+        if hasattr(raw, "generated_text"):
+            return getattr(raw, "generated_text")
+        if isinstance(raw, dict):
+            # try common keys
+            return raw.get("generated_text") or raw.get("text") or str(raw)
+
+        # fallback to string conversion
+        return str(raw)
+
+    except TypeError as te:
+        # common mistake: wrong kw names (we tried to guard this), print helpful msg
+        print("TypeError from hf_client.text_generation:", te)
+        print(traceback.format_exc())
+        return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
     except Exception as e:
         print("HF generation error:", e)
         print(traceback.format_exc())
        return f"Hugging Face generation error: {e}"
 
 def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
-    """Unified generate entry-point used by the UI."""
     prompt = (prompt or "").strip()
     if not prompt:
         return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
@@ -142,25 +150,19 @@ def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: flo
     elif HF_AVAILABLE and hf_client:
         return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
     else:
-        # Neither local nor HF available
         return (
             "No model runtime is available.\n\n"
            "Options:\n"
            "1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
-            " OR ensure a local gguf file exists at the default upload path.\n"
            "2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
-            "3) Remove llama-cpp-python from requirements if its build is failing and rely solely on HF Inference.\n\n"
-            "Check Space logs for more details."
        )
 
 # -------------------------------------------------------------------------
-# Gradio UI - Desi-friendly simple layout (works with Gradio 5.x)
+# Gradio UI
 # -------------------------------------------------------------------------
 title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
 description_text = """
 **Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
-- If you want purely local inference in the Space, upload the `.gguf` file and set `GGUF_PATH` to that path.
-- If using HF Inference, set `HUGGINGFACE_HUB_TOKEN` (secret) and `HF_INFERENCE_MODEL` as needed.
 """
 
 with gr.Blocks(title=title_text) as demo:
@@ -178,18 +180,14 @@ with gr.Blocks(title=title_text) as demo:
     status_md = gr.Markdown(
         f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
         f"- MODEL_REPO: `{MODEL_REPO}`\n"
-        f"- HF model (inference): `{HF_INFERENCE_MODEL}`\n"
+        f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
     )
     tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM. Upload a smaller Q4 quantized GGUF for Spaces.")
 
     output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
-
-    # Hook up
     submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])
 
-# If run as main (local dev)
 if __name__ == "__main__":
-    # Useful debug info:
     print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
     print("HF_AVAILABLE:", HF_AVAILABLE)
     print("MODEL_REPO:", MODEL_REPO)