EYEDOL committed
Commit d6fe098 · verified · 1 Parent(s): 00b9e10

Update app.py

Files changed (1)
  app.py  +86 -23
app.py CHANGED
@@ -2,21 +2,18 @@
 """
 Refactored Salama Assistant: text-only chatbot (STT and TTS removed)
 Drop this file into your Hugging Face Space (replace existing app.py) or run locally.
-Requirements:
-- transformers
-- peft
-- gradio
-- huggingface_hub
-- torch
-
-Notes:
-- Set HF_TOKEN in env for private models or use Spaces secret.
-- This keeps the LLM + PEFT adapter loading and streaming text responses into the Gradio chat UI.
+
+This version:
+- Never passes device_map=None (avoids TypeError in accelerate)
+- Detects bitsandbytes availability and only requests 4-bit loading when safe
+- Keeps streaming responses into Gradio chat UI
 """

 import os
 import threading
 import gradio as gr
+import importlib
+import importlib.util
 import torch
 from huggingface_hub import login
 from transformers import (
@@ -42,6 +39,26 @@ else:
     print("Warning: HF_TOKEN not found in env. Private repos may fail to load.")


+def is_package_installed(name: str) -> bool:
+    """Return True if installed (distribution metadata present)."""
+    try:
+        # prefer importlib.metadata.distribution if available
+        import importlib.metadata as md
+
+        try:
+            md.distribution(name)
+            return True
+        except Exception:
+            return False
+    except Exception:
+        # fallback: try import
+        try:
+            importlib.import_module(name)
+            return True
+        except Exception:
+            return False
+
+
 class WeeboAssistant:
     def __init__(self):
         self.SYSTEM_PROMPT = (
@@ -53,49 +70,93 @@ class WeeboAssistant:
     def _init_models(self):
         print("Initializing models...")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # choose dtype: bfloat16 usually for newer GPUs; keep float32 on CPU
         self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
-        print(f"Using device: {self.device}")
+        print(f"Using device: {self.device}, torch_dtype: {self.torch_dtype}")
+
+        # check bitsandbytes presence (used for 4-bit quant)
+        BNB_AVAILABLE = is_package_installed("bitsandbytes")
+        print("bitsandbytes available:", BNB_AVAILABLE)

         # 1) Tokenizer (prefer base tokenizer)
         try:
             self.llm_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
+            print("Loaded tokenizer from BASE_MODEL_ID")
         except Exception as e:
             print("Warning: could not load base tokenizer, falling back to adapter tokenizer. Error:", e)
             self.llm_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
+            print("Loaded tokenizer from ADAPTER_REPO_ID")
+
+        # 2) prepare device_map (never None)
+        if torch.cuda.is_available():
+            device_map = "auto"
+        else:
+            # Force the entire model onto CPU (prevents accelerate from iterating a None)
+            device_map = {"": "cpu"}
+        print("device_map being used for model load:", device_map)
+
+        # 3) Load base model with conditional kwargs to avoid probing bitsandbytes when missing
+        base_model_kwargs = dict(
+            torch_dtype=self.torch_dtype,
+            low_cpu_mem_usage=True,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
+
+        # If bitsandbytes is available and we're on CUDA, we can attempt 4-bit loading.
+        # Otherwise do not request load_in_4bit to avoid import checks inside transformers.
+        if BNB_AVAILABLE and torch.cuda.is_available():
+            # requesting 4-bit loading is appropriate when bnb + GPU available
+            base_model_kwargs["load_in_4bit"] = True
+            # you might also want to pass bnb-specific kwargs; leaving defaults
+            print("Will attempt to load base model in 4-bit (bitsandbytes + CUDA detected).")
+        else:
+            # explicitly avoid asking transformers to use 4-bit
+            print("bitsandbytes not usable or no CUDA: loading model normally (no 4-bit).")

-        # 2) Load base model
-        device_map = "auto" if torch.cuda.is_available() else None
         try:
             self.llm_model = AutoModelForCausalLM.from_pretrained(
                 BASE_MODEL_ID,
-                torch_dtype=self.torch_dtype,
-                low_cpu_mem_usage=True,
-                device_map=device_map,
-                trust_remote_code=True,
+                **base_model_kwargs,
             )
+            print("Base model loaded from", BASE_MODEL_ID)
         except Exception as e:
             raise RuntimeError(
                 "Failed to load base model. Ensure the base model ID is correct and the HF_TOKEN has access if private. Error: "
                 + str(e)
             )

-        # 3) Load and apply PEFT adapter (adapter-only repo)
+        # 4) Load and apply PEFT adapter (adapter-only repo)
         try:
-            peft_config = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
-            self.llm_model = PeftModel.from_pretrained(
-                self.llm_model,
-                ADAPTER_REPO_ID,
+            # get peft config (optional use)
+            try:
+                peft_config = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
+                print("Loaded PEFT config from", ADAPTER_REPO_ID)
+            except Exception:
+                peft_config = None
+                print("Warning: could not load PeftConfig; continuing to attempt adapter load.")
+
+            # build kwargs for PeftModel.from_pretrained
+            peft_kwargs = dict(
                 device_map=device_map,
                 torch_dtype=self.torch_dtype,
                 low_cpu_mem_usage=True,
             )
+            # If we loaded base model in 4-bit, PeftModel should be able to attach to it.
+            # If not, just pass the usual kwargs (we avoid adding load_in_4bit here; it's taken care of above).
+            self.llm_model = PeftModel.from_pretrained(
+                self.llm_model,
+                ADAPTER_REPO_ID,
+                **peft_kwargs,
+            )
+            print("PEFT adapter applied from", ADAPTER_REPO_ID)
         except Exception as e:
             raise RuntimeError(
                 "Failed to load/apply PEFT adapter from adapter repo. Make sure adapter files are present and HF_TOKEN has access if private. Error: "
                 + str(e)
             )

-        # 4) Optional non-streaming pipeline (useful for small tests)
+        # 5) Optional non-streaming pipeline (useful for small tests)
         try:
             device_index = 0 if torch.cuda.is_available() else -1
             self.llm_pipeline = pipeline(
@@ -105,6 +166,7 @@ class WeeboAssistant:
                 device=device_index,
                 model_kwargs={"torch_dtype": self.torch_dtype},
             )
+            print("Created text-generation pipeline (non-streaming).")
         except Exception as e:
             print("Warning: could not create text-generation pipeline. Streaming generate will still work. Error:", e)
             self.llm_pipeline = None
@@ -155,6 +217,7 @@ assistant = WeeboAssistant()
 # -------------------- Gradio pipelines --------------------
 def t2t_pipeline(text_input, chat_history):
     # Append the user's message and stream the assistant reply
+    chat_history = chat_history or []
     chat_history.append((text_input, ""))
     yield chat_history
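Note on the new is_package_installed helper: it checks distribution metadata via importlib.metadata and falls back to an import attempt. The same hunk also adds import importlib.util, which the helper never uses; a lighter check based on importlib.util.find_spec would cover the same need. The sketch below is not part of the commit, and is_module_available is a hypothetical name:

# Sketch only, not part of the commit: availability check via
# importlib.util.find_spec, which locates a top-level module without importing it.
import importlib.util


def is_module_available(name: str) -> bool:  # hypothetical helper name
    # find_spec returns None when the module cannot be located
    return importlib.util.find_spec(name) is not None


print("bitsandbytes available:", is_module_available("bitsandbytes"))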
 
 
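Note on the device_map fallback: device_map = {"": "cpu"} uses the dict form of device_map, where keys are module-name prefixes and the empty string matches the whole model, so accelerate never has to iterate over None. A standalone sketch of the same pattern, assuming accelerate is installed and using a small public checkpoint purely as a placeholder for BASE_MODEL_ID:

# Sketch only, not part of the commit: the dict-style device_map the diff relies on.
# Requires accelerate; "sshleifer/tiny-gpt2" is just a small placeholder checkpoint.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "sshleifer/tiny-gpt2",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},  # empty-string key places every module on CPU
)
print(next(model.parameters()).device)  # expected: cpu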
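Note on the 4-bit path: when bitsandbytes and CUDA are both present, the change sets load_in_4bit=True directly in base_model_kwargs. Recent transformers releases express the same request through a BitsAndBytesConfig passed as quantization_config; a sketch of that variant follows (not part of the commit; BASE_MODEL_ID is a placeholder for the value defined in app.py):

# Sketch only, not part of the commit: 4-bit loading expressed with an explicit
# BitsAndBytesConfig instead of the bare load_in_4bit kwarg.
# Assumes bitsandbytes, accelerate and a CUDA GPU are available.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

BASE_MODEL_ID = "your-base-model-id"  # placeholder; app.py defines the real value

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # matches the dtype chosen on CUDA
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)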