Spaces:

OthnnyEL
/

DimChi

Sleeping

App Files Files Community

EYEDOL committed on Oct 18

Commit

d6fe098

verified ·

1 Parent(s): 00b9e10

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -23

app.py CHANGED Viewed

@@ -2,21 +2,18 @@
 """
 Refactored Salama Assistant: text-only chatbot (STT and TTS removed)
 Drop this file into your Hugging Face Space (replace existing app.py) or run locally.
-Requirements:
-- transformers
-- peft
-- gradio
-- huggingface_hub
-- torch
-Notes:
-- Set HF_TOKEN in env for private models or use Spaces secret.
-- This keeps the LLM + PEFT adapter loading and streaming text responses into the Gradio chat UI.
 """
 import os
 import threading
 import gradio as gr
 import torch
 from huggingface_hub import login
 from transformers import (
@@ -42,6 +39,26 @@ else:
     print("Warning: HF_TOKEN not found in env. Private repos may fail to load.")
 class WeeboAssistant:
     def __init__(self):
         self.SYSTEM_PROMPT = (
@@ -53,49 +70,93 @@ class WeeboAssistant:
     def _init_models(self):
         print("Initializing models...")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
-        print(f"Using device: {self.device}")
         # 1) Tokenizer (prefer base tokenizer)
         try:
             self.llm_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
         except Exception as e:
             print("Warning: could not load base tokenizer, falling back to adapter tokenizer. Error:", e)
             self.llm_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
-        # 2) Load base model
-        device_map = "auto" if torch.cuda.is_available() else None
         try:
             self.llm_model = AutoModelForCausalLM.from_pretrained(
                 BASE_MODEL_ID,
-                torch_dtype=self.torch_dtype,
-                low_cpu_mem_usage=True,
-                device_map=device_map,
-                trust_remote_code=True,
             )
         except Exception as e:
             raise RuntimeError(
                 "Failed to load base model. Ensure the base model ID is correct and the HF_TOKEN has access if private. Error: "
                 + str(e)
             )
-        # 3) Load and apply PEFT adapter (adapter-only repo)
         try:
-            peft_config = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
-            self.llm_model = PeftModel.from_pretrained(
-                self.llm_model,
-                ADAPTER_REPO_ID,
                 device_map=device_map,
                 torch_dtype=self.torch_dtype,
                 low_cpu_mem_usage=True,
             )
         except Exception as e:
             raise RuntimeError(
                 "Failed to load/apply PEFT adapter from adapter repo. Make sure adapter files are present and HF_TOKEN has access if private. Error: "
                 + str(e)
             )
-        # 4) Optional non-streaming pipeline (useful for small tests)
         try:
             device_index = 0 if torch.cuda.is_available() else -1
             self.llm_pipeline = pipeline(
@@ -105,6 +166,7 @@ class WeeboAssistant:
                 device=device_index,
                 model_kwargs={"torch_dtype": self.torch_dtype},
             )
         except Exception as e:
             print("Warning: could not create text-generation pipeline. Streaming generate will still work. Error:", e)
             self.llm_pipeline = None
@@ -155,6 +217,7 @@ assistant = WeeboAssistant()
 # -------------------- Gradio pipelines --------------------
 def t2t_pipeline(text_input, chat_history):
     # Append the user's message and stream the assistant reply
     chat_history.append((text_input, ""))
     yield chat_history

 """
 Refactored Salama Assistant: text-only chatbot (STT and TTS removed)
 Drop this file into your Hugging Face Space (replace existing app.py) or run locally.
+This version:
+- Never passes device_map=None (avoids TypeError in accelerate)
+- Detects bitsandbytes availability and only requests 4-bit loading when safe
+- Keeps streaming responses into Gradio chat UI
 """
 import os
 import threading
 import gradio as gr
+import importlib
+import importlib.util
 import torch
 from huggingface_hub import login
 from transformers import (
     print("Warning: HF_TOKEN not found in env. Private repos may fail to load.")
+def is_package_installed(name: str) -> bool:
+    """Return True if installed (distribution metadata present)."""
+    # Two-stage probe: look up distribution metadata first, and fall back to
+    # actually importing the module only if importlib.metadata is unavailable.
+    try:
+        # prefer importlib.metadata.distribution if available
+        import importlib.metadata as md
+        try:
+            # Only the return/raise outcome matters; the result is discarded.
+            md.distribution(name)
+            return True
+        except Exception:
+            # Any failure during the metadata lookup is reported as "not installed".
+            return False
+    except Exception:
+        # fallback: try import
+        # Reached only when the `import importlib.metadata` statement itself
+        # fails, because the inner try already handles every Exception raised
+        # by the lookup and returns.
+        # NOTE(review): on this path `name` is treated as an importable module
+        # name, while the path above treats it as a distribution name; the two
+        # can differ for some packages -- confirm callers pass a name valid for both.
+        try:
+            # Unlike the metadata lookup, this executes the module's import-time code.
+            importlib.import_module(name)
+            return True
+        except Exception:
+            return False
 class WeeboAssistant:
     def __init__(self):
         self.SYSTEM_PROMPT = (
     def _init_models(self):
         print("Initializing models...")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # choose dtype: bfloat16 usually for newer GPUs; keep float32 on CPU
         self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
+        print(f"Using device: {self.device}, torch_dtype: {self.torch_dtype}")
+        # check bitsandbytes presence (used for 4-bit quant)
+        BNB_AVAILABLE = is_package_installed("bitsandbytes")
+        print("bitsandbytes available:", BNB_AVAILABLE)
         # 1) Tokenizer (prefer base tokenizer)
         try:
             self.llm_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
+            print("Loaded tokenizer from BASE_MODEL_ID")
         except Exception as e:
             print("Warning: could not load base tokenizer, falling back to adapter tokenizer. Error:", e)
             self.llm_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
+            print("Loaded tokenizer from ADAPTER_REPO_ID")
+        # 2) prepare device_map (never None)
+        if torch.cuda.is_available():
+            device_map = "auto"
+        else:
+            # Force the entire model onto CPU (prevents accelerate from iterating a None)
+            device_map = {"": "cpu"}
+        print("device_map being used for model load:", device_map)
+        # 3) Load base model with conditional kwargs to avoid probing bitsandbytes when missing
+        base_model_kwargs = dict(
+            torch_dtype=self.torch_dtype,
+            low_cpu_mem_usage=True,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
+        # If bitsandbytes is available and we're on CUDA, we can attempt 4-bit loading.
+        # Otherwise do not request load_in_4bit to avoid import checks inside transformers.
+        if BNB_AVAILABLE and torch.cuda.is_available():
+            # requesting 4-bit loading is appropriate when bnb + GPU available
+            base_model_kwargs["load_in_4bit"] = True
+            # you might also want to pass bnb-specific kwargs; leaving defaults
+            print("Will attempt to load base model in 4-bit (bitsandbytes + CUDA detected).")
+        else:
+            # explicitly avoid asking transformers to use 4-bit
+            print("bitsandbytes not usable or no CUDA: loading model normally (no 4-bit).")
         try:
             self.llm_model = AutoModelForCausalLM.from_pretrained(
                 BASE_MODEL_ID,
+                **base_model_kwargs,
             )
+            print("Base model loaded from", BASE_MODEL_ID)
         except Exception as e:
             raise RuntimeError(
                 "Failed to load base model. Ensure the base model ID is correct and the HF_TOKEN has access if private. Error: "
                 + str(e)
             )
+        # 4) Load and apply PEFT adapter (adapter-only repo)
         try:
+            # get peft config (optional use)
+            try:
+                peft_config = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
+                print("Loaded PEFT config from", ADAPTER_REPO_ID)
+            except Exception:
+                peft_config = None
+                print("Warning: could not load PeftConfig; continuing to attempt adapter load.")
+            # build kwargs for PeftModel.from_pretrained
+            peft_kwargs = dict(
                 device_map=device_map,
                 torch_dtype=self.torch_dtype,
                 low_cpu_mem_usage=True,
             )
+            # If we loaded base model in 4-bit, PeftModel should be able to attach to it.
+            # If not, just pass the usual kwargs (we avoid adding load_in_4bit here; it's taken care of above).
+            self.llm_model = PeftModel.from_pretrained(
+                self.llm_model,
+                ADAPTER_REPO_ID,
+                **peft_kwargs,
+            )
+            print("PEFT adapter applied from", ADAPTER_REPO_ID)
         except Exception as e:
             raise RuntimeError(
                 "Failed to load/apply PEFT adapter from adapter repo. Make sure adapter files are present and HF_TOKEN has access if private. Error: "
                 + str(e)
             )
+        # 5) Optional non-streaming pipeline (useful for small tests)
         try:
             device_index = 0 if torch.cuda.is_available() else -1
             self.llm_pipeline = pipeline(
                 device=device_index,
                 model_kwargs={"torch_dtype": self.torch_dtype},
             )
+            print("Created text-generation pipeline (non-streaming).")
         except Exception as e:
             print("Warning: could not create text-generation pipeline. Streaming generate will still work. Error:", e)
             self.llm_pipeline = None
 # -------------------- Gradio pipelines --------------------
 def t2t_pipeline(text_input, chat_history):
     # Append the user's message and stream the assistant reply
+    chat_history = chat_history or []
     chat_history.append((text_input, ""))
     yield chat_history