DipakKuma committed
Commit 853c66a · verified · 1 Parent(s): 2914262

Update app.py

Files changed (1)
  1. app.py +73 -37
app.py CHANGED
@@ -1,42 +1,78 @@
- # app.py (Gradio Space)
- from llama_cpp import Llama
- from huggingface_hub import snapshot_download
  import os
  import gradio as gr

- # ===== SETTINGS =====
- REPO_ID = "lmstudio-community/gemma-3-4b-it-GGUF" # or your HF repo id
- FILENAME = "gemma-3-4b-it-Q8_0.gguf" # the name must be exact
- CACHE_DIR = "/tmp/model_cache" # ephemeral dir in the Space
-
- os.environ.setdefault("HF_HOME", "/tmp/hf_home")
- os.environ.setdefault("LLAMA_CACHE", CACHE_DIR) # llama.cpp caching
-
- # OPTIONAL: download model into cache (snapshot_download uses HF_TOKEN automatically via secrets)
- print("Downloading model (if needed)...")
- repo_path = snapshot_download(repo_id=REPO_ID, cache_dir=CACHE_DIR)
- model_file_path = os.path.join(repo_path, FILENAME)
- print("Model file path:", model_file_path)
-
- # Load llama.cpp model (from file in repo_path)
- llm = Llama.from_pretrained(repo_id=REPO_ID, filename=FILENAME) # llama-cpp-python helper
-
- def chat(prompt, max_tokens=256, temp=0.2):
-     resp = llm.create_chat_completion(
-         messages=[{"role":"system","content":"You are a helpful assistant."},
-                   {"role":"user","content": prompt}],
-         max_tokens=max_tokens,
-         temperature=temp,
-         stop=["<eos>"],
-     )
-     return resp["choices"][0]["message"]["content"]
-
- with gr.Blocks() as demo:
-     gr.Markdown("## Gemma-3-4b-it (GGUF) demo")
-     txt = gr.Textbox(lines=4, placeholder="તમારો પ્રોમ્પ્ટ લખો...")
-     out = gr.Textbox(lines=12)
-     btn = gr.Button("Run")
-     btn.click(fn=lambda p: chat(p), inputs=[txt], outputs=[out])

  if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)
 
+ # app.py
  import os
+ import time
+ from huggingface_hub import snapshot_download
+ from llama_cpp import Llama
  import gradio as gr

+ # -------------------------
+ # CONFIG - edit if needed
+ # -------------------------
+ # repo id on HF that contains your .gguf file (public repo recommended for Spaces)
+ # Examples you might try: "your-username/gemma-3-4b-it-q4_0-gguf" or "google/gemma-3-4b-it-qat-q4_0-gguf"
+ HF_MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
+ GGUF_FILENAME = os.environ.get("GGUF_FILENAME", None)  # optional if multiple files
+
+ # where to cache the model inside the Space runtime
+ MODEL_CACHE_DIR = "/tmp/model_cache"
+
+ os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
+
+ def download_gguf(repo_id, local_dir, allow_patterns=None):
+     "Download GGUF from HF hub (uses snapshot_download). Returns local path to .gguf"
+     print(f"Downloading model from {repo_id} to {local_dir} ... (may take a while)")
+     # allow_patterns can be like ["*.gguf"]
+     path = snapshot_download(repo_id, cache_dir=local_dir, allow_patterns=allow_patterns)
+     # find gguf file
+     for root, _, files in os.walk(path):
+         for f in files:
+             if f.endswith(".gguf"):
+                 local_path = os.path.join(root, f)
+                 print("Found gguf:", local_path)
+                 return local_path
+     raise FileNotFoundError("No .gguf file found in repository. Please set MODEL_REPO to the HF repo containing the .gguf.")
+
+ # Download model on first run (cached across runs in the Space underlying VM while active)
+ MODEL_PATH = None
+ try:
+     MODEL_PATH = download_gguf(HF_MODEL_REPO, MODEL_CACHE_DIR, allow_patterns=["*.gguf"])
+ except Exception as e:
+     print("Model download failed:", e)
+
+ llm = None
+ if MODEL_PATH:
+     try:
+         print("Loading model with llama-cpp-python:", MODEL_PATH)
+         # tweak n_ctx / n_threads if you hit memory/CPU limits
+         llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
+         print("Model loaded.")
+     except Exception as e:
+         print("Failed to load model in llama-cpp-python:", e)
+         llm = None
+
+ def generate(prompt, max_tokens=256, temp=0.8):
+     if llm is None:
+         return "Model not loaded. Check logs or try smaller model / upgrade Space hardware."
+     try:
+         # llama-cpp-python create_completion usage
+         out = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temp)
+         return out["choices"][0]["text"]
+     except Exception as e:
+         return f"Error during inference: {e}"
+
+ # -------------------
+ # Gradio UI (Desi-ish)
+ # -------------------
+ with gr.Blocks(title="Gemma3 (GGUF) - Desi Chat") as demo:
+     gr.Markdown("## 💎 Gemma3 (GGUF) — Hugging Face Space (Free tier friendly)")
+     with gr.Row():
+         inp = gr.Textbox(lines=4, label="તમારો પ્રશ્ન (Gujarati/English)", placeholder="હેલો, મોજી કઈ રીતે છે?")
+         with gr.Column(scale=1):
+             max_t = gr.Slider(label="Max tokens", minimum=16, maximum=1024, value=256, step=16)
+             temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, value=0.8, step=0.1)
+             btn = gr.Button("જવાબ આપો")
+     out = gr.Textbox(label="જવાબ", lines=8)
+     btn.click(fn=generate, inputs=[inp, max_t, temp], outputs=out)

  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
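Besides the standard library, the updated app.py only imports gradio, huggingface_hub and llama-cpp-python. The sketch below is not part of the commit: it is a minimal local smoke test of the same download-then-load path, assuming those packages are installed, the default MODEL_REPO from app.py is reachable (Gemma repos may require accepting the model license or an HF token), and the machine has enough RAM for the Q4_0 weights.

# local_smoke_test.py (hypothetical helper, not in this commit)
import os

from huggingface_hub import snapshot_download
from llama_cpp import Llama

CACHE_DIR = "/tmp/model_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Pull only the .gguf weights, mirroring download_gguf() in app.py.
snapshot_path = snapshot_download(
    "google/gemma-3-4b-it-qat-q4_0-gguf",  # same default as MODEL_REPO in app.py
    cache_dir=CACHE_DIR,
    allow_patterns=["*.gguf"],
)
# First .gguf found in the snapshot; raises StopIteration if none is present.
gguf_path = next(
    os.path.join(root, f)
    for root, _, files in os.walk(snapshot_path)
    for f in files
    if f.endswith(".gguf")
)
print("Using gguf:", gguf_path)

# Same loader settings the Space uses; raise n_threads if more CPU cores are available.
llm = Llama(model_path=gguf_path, n_ctx=2048, n_threads=2)
out = llm.create_completion(prompt="Say hello in Gujarati.", max_tokens=32, temperature=0.8)
print(out["choices"][0]["text"])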