EYEDOL committed · verified
Commit 813792b · 1 Parent(s): 485e894

Update app.py

Files changed (1):
  app.py  (+92 -30)
app.py CHANGED
@@ -1,14 +1,12 @@
 # -*- coding: utf-8 -*-
 """
-Refactored Salama Assistant: text-only chatbot (STT and TTS removed)
+YOUR FOIA CHAT ASSISTANCE - Text-only chatbot (STT and TTS removed)
 Drop this file into your Hugging Face Space (replace existing app.py) or run locally.
 
-Performance-focused tweaks:
-- lower max_new_tokens
-- use greedy decoding (do_sample=False) for speed
-- call generate() under torch.no_grad()
-- set model.config.use_cache = True
-- other minor safe optimizations
+Notes:
+- Dark UI via custom CSS (works even if Gradio theme API differs)
+- Performance-focused: greedy generation, lower max_new_tokens, use_cache, no_grad, streaming
+- Keeps bitsandbytes / 4-bit logic intact when available
 """
 
 import os
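For reference, the decoding setup described in the new docstring (greedy decoding, use_cache, torch.no_grad, a capped max_new_tokens budget) corresponds to plain transformers usage along these lines. This is a minimal sketch only; "gpt2" is a stand-in model id, not the Space's actual base model.

# Sketch only: greedy, cache-enabled generation under no_grad ("gpt2" is a placeholder model id).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.use_cache = True   # reuse past key/values between decoding steps
model.eval()

inputs = tokenizer("FOIA stands for", return_tensors="pt")
with torch.no_grad():           # inference only, no gradients
    output_ids = model.generate(
        **inputs,
        max_new_tokens=32,      # small budget keeps latency low
        do_sample=False,        # greedy decoding
        num_beams=1,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))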
@@ -61,14 +59,15 @@ def is_package_installed(name: str) -> bool:
 
 class WeeboAssistant:
     def __init__(self):
+        # system prompt instructs the assistant to answer concisely in English
         self.SYSTEM_PROMPT = (
             "You are an intelligent assistant. Answer questions briefly and accurately. "
             "Respond only in English. No long answers.\n"
         )
-        # set sensible defaults for generation speed
+        # generation defaults tuned for speed (adjust if you need different behavior)
         self.MAX_NEW_TOKENS = 256  # lowered from 512 for speed
-        self.DO_SAMPLE = False  # greedy = faster; set True if you need randomness
-        self.NUM_BEAMS = 1  # keep 1 for greedy; increase for beam search (slower)
+        self.DO_SAMPLE = False  # greedy = faster; set True if you want sampling
+        self.NUM_BEAMS = 1  # keep 1 for greedy (increase >1 for beam search)
         self._init_models()
 
     def _init_models(self):
@@ -80,6 +79,7 @@ class WeeboAssistant:
         BNB_AVAILABLE = is_package_installed("bitsandbytes")
         print("bitsandbytes available:", BNB_AVAILABLE)
 
+        # load tokenizer (prefer base tokenizer)
         try:
             self.llm_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
             print("Loaded tokenizer from BASE_MODEL_ID")
@@ -88,15 +88,15 @@ class WeeboAssistant:
             self.llm_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
             print("Loaded tokenizer from ADAPTER_REPO_ID")
 
-        # ensure tokenizer has pad_token_id (some HF models lack it)
+        # ensure tokenizer has pad_token_id to avoid generation stalls
         if getattr(self.llm_tokenizer, "pad_token_id", None) is None:
-            # try to set eos_token_id as pad if pad missing
            if getattr(self.llm_tokenizer, "eos_token_id", None) is not None:
                self.llm_tokenizer.pad_token_id = self.llm_tokenizer.eos_token_id
            else:
-                # fallback to 0 (not ideal but prevents crashes)
+                # fallback to 0 to prevent crashes (not ideal but safe)
                self.llm_tokenizer.pad_token_id = 0
 
+        # decide device_map (never pass None)
         if torch.cuda.is_available():
             device_map = "auto"
         else:
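The pad-token fallback in this hunk exists because generate() needs a valid pad_token_id and some tokenizers ship without one. The same logic in isolation, as a sketch; "gpt2" is only a placeholder tokenizer id:

# Sketch of the pad_token_id fallback shown above ("gpt2" is a placeholder).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id  # common, safe default
    else:
        tokenizer.pad_token_id = 0  # last resort; avoids crashes but is not a real pad token
print("pad_token_id:", tokenizer.pad_token_id)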
@@ -121,7 +121,7 @@ class WeeboAssistant:
                 BASE_MODEL_ID,
                 **base_model_kwargs,
             )
-            # make sure use_cache is enabled for faster autoregressive generation
+            # ensure use_cache set for faster autoregressive generation
             try:
                 self.llm_model.config.use_cache = True
             except Exception:
@@ -133,6 +133,7 @@ class WeeboAssistant:
                 + str(e)
             )
 
+        # load and apply PEFT adapter
         try:
             try:
                 peft_config = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
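For context, attaching a LoRA/PEFT adapter to a base causal LM usually follows the pattern below. A minimal sketch; the repo ids are placeholders, not the Space's BASE_MODEL_ID / ADAPTER_REPO_ID:

# Sketch only: wrap a base model with a PEFT adapter (placeholder repo ids).
from transformers import AutoModelForCausalLM
from peft import PeftConfig, PeftModel

ADAPTER_ID = "some-user/some-lora-adapter"           # placeholder
peft_config = PeftConfig.from_pretrained(ADAPTER_ID)
base = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, ADAPTER_ID)  # exposes generate() like the base model
model.eval()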
@@ -164,6 +165,7 @@ class WeeboAssistant:
                 + str(e)
             )
 
+        # optional non-streaming pipeline (useful for quick tests)
         try:
             device_index = 0 if torch.cuda.is_available() else -1
             self.llm_pipeline = pipeline(
@@ -181,6 +183,7 @@ class WeeboAssistant:
         print("LLM base + adapter loaded successfully.")
 
     def get_llm_response(self, chat_history):
+        # Build prompt (system + conversation)
         prompt_lines = [self.SYSTEM_PROMPT]
         for user_msg, assistant_msg in chat_history:
             if user_msg:
@@ -190,7 +193,7 @@ class WeeboAssistant:
         prompt_lines.append("Assistant: ")
         prompt = "\n".join(prompt_lines)
 
-        # Tokenize
+        # Tokenize inputs
         inputs = self.llm_tokenizer(prompt, return_tensors="pt", padding=False)
         try:
             model_device = next(self.llm_model.parameters()).device
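The prompt builder above prepends the system prompt and closes with an open "Assistant: " line. The per-turn lines appended inside the loop fall outside the diff context, so the "User:"/"Assistant:" formatting in this standalone sketch is an assumption:

# Hypothetical reconstruction of the prompt format; the exact per-turn lines are not shown in the diff.
SYSTEM_PROMPT = "You are an intelligent assistant. Answer questions briefly and accurately.\n"

def build_prompt(chat_history):
    lines = [SYSTEM_PROMPT]
    for user_msg, assistant_msg in chat_history:
        if user_msg:
            lines.append(f"User: {user_msg}")            # assumed format
        if assistant_msg:
            lines.append(f"Assistant: {assistant_msg}")  # assumed format
    lines.append("Assistant: ")
    return "\n".join(lines)

print(build_prompt([("What does FOIA stand for?", "")]))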
@@ -198,10 +201,10 @@ class WeeboAssistant:
             model_device = torch.device("cpu")
         inputs = {k: v.to(model_device) for k, v in inputs.items()}
 
-        # Streamer unchanged (still yields chunks)
+        # Use TextIteratorStreamer for streaming outputs to Gradio
         streamer = TextIteratorStreamer(self.llm_tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-        # Prefill some generation kwargs optimized for speed
+        # Prefill generation kwargs optimized for speed
         input_len = inputs["input_ids"].shape[1]
         max_new = self.MAX_NEW_TOKENS
         max_length = input_len + max_new
@@ -209,10 +212,10 @@ class WeeboAssistant:
         generation_kwargs = dict(
             input_ids=inputs["input_ids"],
             attention_mask=inputs.get("attention_mask", None),
-            max_length=max_length,  # prefer max_length = input_len + max_new_tokens
-            max_new_tokens=max_new,  # kept for clarity / compatibility
+            max_length=max_length,  # input_len + max_new
+            max_new_tokens=max_new,  # explicit
             do_sample=self.DO_SAMPLE,  # greedy if False -> faster
-            num_beams=self.NUM_BEAMS,  # beam search >1 slows down; keep 1 for speed
+            num_beams=self.NUM_BEAMS,  # keep 1 for speed
             streamer=streamer,
             eos_token_id=getattr(self.llm_tokenizer, "eos_token_id", None),
             pad_token_id=getattr(self.llm_tokenizer, "pad_token_id", None),
@@ -220,15 +223,12 @@ class WeeboAssistant:
             early_stopping=True,
         )
 
-        # Run generate under no_grad for speed / memory
+        # Run generate under no_grad to save memory and time
        def _generate_thread():
            with torch.no_grad():
                try:
-                    # call generate on model (PEFT-wrapped)
                    self.llm_model.generate(**generation_kwargs)
                except Exception as e:
-                    # if streaming fails, put an error chunk into streamer by raising
-                    # streamer does not provide a direct API to inject text; print to log
                    print("Generation error:", e)
 
        gen_thread = threading.Thread(target=_generate_thread, daemon=True)
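This is the standard threaded-streaming pattern: generate() runs on a daemon thread while the TextIteratorStreamer is consumed as an iterator of decoded text chunks (and note that max_length here is simply input_len + max_new_tokens, so both kwargs express the same budget). A minimal sketch; "gpt2" is a stand-in model id:

# Sketch of threaded streaming generation ("gpt2" is a placeholder model id).
import threading
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("User: What is FOIA?\nAssistant: ", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def _generate():
    with torch.no_grad():
        model.generate(**inputs, streamer=streamer, max_new_tokens=32,
                       do_sample=False, pad_token_id=tokenizer.eos_token_id)

threading.Thread(target=_generate, daemon=True).start()

reply = ""
for chunk in streamer:      # yields decoded text pieces as they are generated
    reply += chunk
print(reply)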
@@ -237,12 +237,14 @@ class WeeboAssistant:
         return streamer
 
 
+# create assistant instance (loads model once at startup)
 assistant = WeeboAssistant()
 
 
+# -------------------- Gradio pipeline functions --------------------
 def t2t_pipeline(text_input, chat_history):
     chat_history = chat_history or []
-    chat_history.append((text_input, ""))
+    chat_history.append((text_input, ""))  # placeholder for assistant reply
     yield chat_history
 
     response_stream = assistant.get_llm_response(chat_history)
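t2t_pipeline is a generator: each yield re-renders the Chatbot, which is what produces the incremental typing effect. A self-contained sketch of the same pattern with a fake streamer in place of the model, assuming the tuple-style chat history used elsewhere in this file (Gradio 3.x-style API):

# Sketch of the streaming Gradio callback pattern; fake_stream stands in for the LLM streamer.
import time
import gradio as gr

def fake_stream():
    for piece in ("This ", "is ", "a ", "streamed ", "reply."):
        time.sleep(0.1)
        yield piece

def chat_fn(message, history):
    history = history or []
    history.append((message, ""))                      # placeholder for the assistant reply
    yield history
    for chunk in fake_stream():
        history[-1] = (message, history[-1][1] + chunk)
        yield history                                  # each yield updates the Chatbot

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    box = gr.Textbox()
    box.submit(chat_fn, inputs=[box, chatbot], outputs=chatbot)

demo.queue().launch()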
@@ -257,12 +259,71 @@ def clear_textbox():
     return gr.Textbox.update(value="")
 
 
-# -------------------- English UI --------------------
-with gr.Blocks(theme=gr.themes.Soft(), title="Swahili Assistant - Text Chat") as demo:
-    gr.Markdown("# 🤖 Swahili Assistant (Text Chat)")
-    gr.Markdown("Chat (text-based) with the assistant in English. Use the box below to type your question.")
+# -------------------- Dark UI CSS --------------------
+DARK_CSS = """
+/* Base background & text */
+body, .gradio-container {
+    background: linear-gradient(180deg, #04060a 0%, #0b1220 100%) !important;
+    color: #E6EEF8 !important;
+}
+
+/* Header / Markdown text */
+h1, h2, h3, .markdown {
+    color: #E6EEF8 !important;
+}
+
+/* Card backgrounds */
+.gr-block, .gr-box, .gr-row, .gr-column, .gradio-container .container {
+    background-color: transparent !important;
+}
+
+/* Chatbot area */
+.gr-chatbot {
+    background: rgba(10, 14, 22, 0.6) !important;
+    border: 1px solid rgba(255,255,255,0.04) !important;
+    color: #E6EEF8 !important;
+}
+
+/* Chat messages - user and assistant bubbles */
+.gr-chatbot .message.user, .gr-chatbot .message.user p {
+    background: linear-gradient(180deg, #0f1724, #0b1220) !important;
+    color: #CFE7FF !important;
+    border: 1px solid rgba(255,255,255,0.04) !important;
+}
+.gr-chatbot .message.bot, .gr-chatbot .message.bot p {
+    background: linear-gradient(180deg, #071126, #081426) !important;
+    color: #E6EEF8 !important;
+    border: 1px solid rgba(255,255,255,0.03) !important;
+}
+
+/* Input textbox and button */
+.gr-textbox, .gr-textbox textarea {
+    background: #071226 !important;
+    color: #E6EEF8 !important;
+    border: 1px solid rgba(255,255,255,0.04) !important;
+}
+.gr-button, .gr-button:hover {
+    background: linear-gradient(180deg, #0b63ff, #0a4ad6) !important;
+    color: white !important;
+    border: none !important;
+    box-shadow: 0 6px 18px rgba(6, 18, 55, 0.5) !important;
+}
+
+/* Small UI tweaks */
+footer, .footer {
+    display: none;
+}
+.gradio-container * {
+    font-family: Inter, ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial;
+}
+"""
+
+# -------------------- Gradio UI (dark) --------------------
+with gr.Blocks(css=DARK_CSS, title="YOUR FOIA CHAT ASSISTANCE") as demo:
+    gr.Markdown("# YOUR FOIA CHAT ASSISTANCE")
+    gr.Markdown("Chat (text-based) with the FOIA assistant. Use the box below to type your question.")
 
-    t2t_chatbot = gr.Chatbot(label="Conversation", bubble_full_width=False, height=500)
+    t2t_chatbot = gr.Chatbot(label="Conversation", bubble_full_width=False, height=520)
     with gr.Row():
         t2t_text_in = gr.Textbox(show_label=False, placeholder="Type your message here...", scale=4, container=False)
         t2t_submit_btn = gr.Button("Send", variant="primary", scale=1)
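Passing css= to gr.Blocks injects the stylesheet into the page, which is how the dark theme above is applied. One caveat: internal class names such as .gr-chatbot or .gr-button are not a stable public API and change between Gradio versions, so these selectors may need adjusting. A minimal sketch:

# Sketch only: custom CSS on a Blocks app; selectors are illustrative and version-dependent.
import gradio as gr

CSS = """
body, .gradio-container { background: #0b1220 !important; color: #E6EEF8 !important; }
"""

with gr.Blocks(css=CSS, title="CSS demo") as demo:
    gr.Markdown("# Dark-styled demo")
    gr.Textbox(label="Type here")

demo.launch()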
@@ -289,4 +350,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Swahili Assistant - Text Chat") as demo:
         outputs=t2t_text_in,
     )
 
+# launch
 demo.queue().launch(debug=True)
 