EYEDOL committed
Commit b300f26 · verified · 1 Parent(s): 830c848

Update app.py

Files changed (1)
app.py +172 -130
app.py CHANGED
@@ -1,42 +1,55 @@
  # app.py
- # Streamlit Chat UI with robust model + PEFT loading (English interface)
  # Requirements:
  # pip install streamlit torch transformers peft accelerate safetensors huggingface_hub

  import os
- import threading
  import time
  import streamlit as st
  import torch
- import importlib
  from huggingface_hub import login
- from transformers import (
-     AutoTokenizer,
-     AutoModelForCausalLM,
-     TextIteratorStreamer,
- )
  from peft import PeftModel, PeftConfig

- # -------------------- Configuration --------------------
- # Edit these to the model/adapter you want. Adapter repo can be adapter-only (PEFT).
  BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "unsloth/Llama-3.2-3B-Instruct-bnb-4bit")
- ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", None)  # set to adapter repo id or leave None
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("hugface")

  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", 256))
  TEMPERATURE = float(os.environ.get("TEMP", 0.6))
  TOP_P = float(os.environ.get("TOP_P", 0.9))

- # -------------------- Helpers --------------------
  def is_package_installed(name: str) -> bool:
      """Return True if distribution metadata or importable."""
      try:
          import importlib.metadata as md
          try:
              md.distribution(name)
              return True
          except Exception:
-             return False
      except Exception:
          try:
              importlib.import_module(name)
@@ -46,191 +59,223 @@ def is_package_installed(name: str) -> bool:

  def try_login_hf(token: str):
      if not token:
-         st.info("HF_TOKEN not provided — private models may fail.")
          return
      try:
          login(token=token)
-         st.success("Logged into Hugging Face Hub")
      except Exception as e:
          st.warning(f"Hugging Face login failed: {e}")

- # -------------------- Streamlit Page --------------------
  st.set_page_config(page_title="AI Chatbot Assistant", page_icon="🤖", layout="wide")
  st.title("🤖 AI Chatbot Assistant")
- st.write("Type your message in English and get a response from the AI model. Keep messages short for better results.")

- # Sidebar for status/config
  with st.sidebar:
-     st.header("Model / Environment")
      st.text(f"BASE_MODEL_ID: {BASE_MODEL_ID}")
      st.text(f"ADAPTER_REPO_ID: {ADAPTER_REPO_ID or 'None'}")
-     st.text(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
      st.text(f"bitsandbytes installed: {is_package_installed('bitsandbytes')}")

- # Attempt HF login (for private repos)
  try_login_hf(HF_TOKEN)

- # -------------------- Model loader (cached) --------------------
  @st.cache_resource(show_spinner=False)
- def load_models():
-     """Loads tokenizer, base model, and optional adapter; returns (tokenizer, model, device)."""
      device = "cuda" if torch.cuda.is_available() else "cpu"
-     torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32

      BNB_AVAILABLE = is_package_installed("bitsandbytes")
-     st.write(f"bitsandbytes available: {BNB_AVAILABLE}")
-
      # Load tokenizer (prefer base)
      try:
          tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
          st.write("Tokenizer loaded from base model.")
      except Exception as e:
-         st.write(f"Warning: failed to load tokenizer from base: {e}")
          if ADAPTER_REPO_ID:
              tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
              st.write("Tokenizer loaded from adapter repo.")
          else:
-             raise RuntimeError("Failed to load tokenizer from base and no adapter set.")

-     # Prepare device_map (never None)
      if torch.cuda.is_available():
          device_map = "auto"
      else:
-         device_map = {"": "cpu"}  # force all weights on CPU to avoid NoneType iteration
      st.write(f"Using device_map = {device_map}")

-     # Build kwargs for from_pretrained
      base_kwargs = dict(
-         torch_dtype=torch_dtype,
          low_cpu_mem_usage=True,
          device_map=device_map,
          trust_remote_code=True,
      )

-     # Only request load_in_4bit if bitsandbytes present and CUDA available
-     if BNB_AVAILABLE and torch.cuda.is_available():
-         base_kwargs["load_in_4bit"] = True
-         st.write("Attempting to load base model in 4-bit (bitsandbytes + CUDA detected).")
-     else:
-         st.write("Not using 4-bit load (either no CUDA or bitsandbytes not available).")
-
-     # Load base model
      try:
          model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **base_kwargs)
-         st.write("Base model loaded.")
      except Exception as e:
-         raise RuntimeError(f"Failed to load base model {BASE_MODEL_ID}: {e}")

-     # If adapter specified, load PEFT
      if ADAPTER_REPO_ID:
          try:
-             # attempt to read peft config (optional)
              try:
                  _ = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
                  st.write("PEFT config loaded from adapter repo.")
              except Exception:
-                 st.write("Warning: could not load PeftConfig (continuing to attempt adapter load).")
-
-             model = PeftModel.from_pretrained(
-                 model,
-                 ADAPTER_REPO_ID,
-                 device_map=device_map,
-                 torch_dtype=torch_dtype,
-                 low_cpu_mem_usage=True,
-             )
              st.write("PEFT adapter loaded and applied.")
          except Exception as e:
              raise RuntimeError(f"Failed to load/apply PEFT adapter from {ADAPTER_REPO_ID}: {e}")

      return tokenizer, model, device

- # Load models (blocking; shows spinner)
- with st.spinner("Loading model(s), this may take a minute..."):
      try:
-         tokenizer, model, device = load_models()
      except Exception as e:
-         st.error(f"Model loading failed: {e}")
          st.stop()

- # -------------------- Chat state --------------------
  if "chat_history" not in st.session_state:
-     # list of tuples (user, assistant)
-     st.session_state.chat_history = []

- # Input area
- user_input = st.text_area("Your message (English):", height=120, key="user_input")
- col1, col2 = st.columns([1, 1])
- with col1:
-     send_btn = st.button("Send")
- with col2:
-     clear_btn = st.button("Clear chat")
-
- # Chat display container
- chat_container = st.container()

- def stream_generate_and_stream_to_ui(prompt, tokenizer, model, max_new_tokens=MAX_NEW_TOKENS):
      """
-     Uses TextIteratorStreamer and a thread to stream tokens to the UI.
-     Returns the final generated string.
      """
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(
-         input_ids=prompt["input_ids"].to(next(model.parameters()).device),
-         attention_mask=prompt.get("attention_mask", None),
          max_new_tokens=max_new_tokens,
          do_sample=True,
-         temperature=TEMPERATURE,
-         top_p=TOP_P,
-         streamer=streamer,
          eos_token_id=getattr(tokenizer, "eos_token_id", None),
      )

-     # start generation in background thread
-     gen_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs, daemon=True)
-     gen_thread.start()

-     # stream into UI
-     output_text = ""
-     placeholder = chat_container.empty()
-     # show current conversation and streaming answer
      while True:
-         try:
-             token = next(streamer)
-         except StopIteration:
              break
-         output_text += token
-         # Display chat history with the current streaming token appended
          with placeholder:
-             for user_msg, assistant_msg in st.session_state.chat_history[:-1]:
-                 st.markdown(f"**🧑 You:** {user_msg}")
-                 st.markdown(f"**🤖 Assistant:** {assistant_msg}")
-             # Current user (last) and streaming assistant
              last_user, _ = st.session_state.chat_history[-1]
              st.markdown(f"**🧑 You:** {last_user}")
-             st.markdown(f"**🤖 Assistant:** {output_text}")
-         # small sleep to allow UI update
-         time.sleep(0.01)
-
-     # finish: ensure final display
-     with chat_container:
-         for user_msg, assistant_msg in st.session_state.chat_history[:-1]:
-             st.markdown(f"**🧑 You:** {user_msg}")
-             st.markdown(f"**🤖 Assistant:** {assistant_msg}")
          last_user, _ = st.session_state.chat_history[-1]
          st.markdown(f"**🧑 You:** {last_user}")
-         st.markdown(f"**🤖 Assistant:** {output_text}")

-     return output_text

- # Handle Send
  if send_btn:
      if not user_input or not user_input.strip():
          st.warning("Please type a message before sending.")
      else:
-         # Add user message and placeholder assistant reply
-         st.session_state.chat_history.append((user_input.strip(), ""))

-         # Build prompt from history (system prompt + conversation)
          system_prompt = "You are a helpful assistant. Answer briefly and accurately in English."
          prompt_lines = [system_prompt]
          for u, a in st.session_state.chat_history:
@@ -241,36 +286,33 @@ if send_btn:
          prompt_lines.append("Assistant: ")
          final_prompt = "\n".join(prompt_lines)

-         # tokenize
          inputs = tokenizer(final_prompt, return_tensors="pt")
-         # move to model device
-         model_device = next(model.parameters()).device
-         inputs = {k: v.to(model_device) for k, v in inputs.items()}

-         # Stream generate and update UI
          try:
-             reply_text = stream_generate_and_stream_to_ui(inputs, tokenizer, model, max_new_tokens=MAX_NEW_TOKENS)
          except Exception as e:
              st.error(f"Generation failed: {e}")
              reply_text = "Error generating response."

-         # replace the last placeholder assistant reply
-         st.session_state.chat_history[-1] = (user_input.strip(), reply_text)
-         # clear input box
          st.session_state.user_input = ""

- # Handle Clear
  if clear_btn:
      st.session_state.chat_history = []
      st.experimental_rerun()

- # If there is chat history but user didn't just send (page load), display it
- if st.session_state.chat_history and not send_btn:
-     with chat_container:
-         for user_msg, assistant_msg in st.session_state.chat_history:
-             st.markdown(f"**🧑 You:** {user_msg}")
-             st.markdown(f"**🤖 Assistant:** {assistant_msg}")

- # Footer / tips
  st.markdown("---")
- st.caption("Tip: Keep prompts short. If model loading fails, check HF_TOKEN, CUDA availability and install bitsandbytes for 4-bit models.")
 
  # app.py
+ # Full Streamlit chat app (English interface)
+ # - Safe device_map handling (no device_map=None)
+ # - Uses queue-based streaming so Streamlit UI is only updated from main thread
+ # - Detects bitsandbytes and attempts 4-bit only when safe (CUDA + bitsandbytes)
+ # - Supports optional PEFT adapter repo (set ADAPTER_REPO_ID env)
+ # - Uses `dtype=` for from_pretrained where supported (silences deprecation)
+ #
  # Requirements:
  # pip install streamlit torch transformers peft accelerate safetensors huggingface_hub
+ # (if using 4-bit on GPU: pip install bitsandbytes matched to your CUDA)
+ #
+ # Run:
+ # streamlit run app.py --server.headless true --server.port 8501

  import os
  import time
+ import threading
+ import queue
+ import importlib
+ import importlib.util
  import streamlit as st
  import torch
  from huggingface_hub import login
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
  from peft import PeftModel, PeftConfig

+ # -------------------- Configuration (via env or defaults) --------------------
  BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "unsloth/Llama-3.2-3B-Instruct-bnb-4bit")
+ ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", None)  # e.g. "EYEDOL/FOIA" or None
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("hugface")

  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", 256))
  TEMPERATURE = float(os.environ.get("TEMP", 0.6))
  TOP_P = float(os.environ.get("TOP_P", 0.9))

+ # -------------------- Utilities --------------------
  def is_package_installed(name: str) -> bool:
      """Return True if distribution metadata or importable."""
      try:
          import importlib.metadata as md
          try:
+             # this checks for distribution metadata (preferred)
              md.distribution(name)
              return True
          except Exception:
+             # fallback to plain import
+             try:
+                 importlib.import_module(name)
+                 return True
+             except Exception:
+                 return False
      except Exception:
          try:
              importlib.import_module(name)
 

  def try_login_hf(token: str):
      if not token:
+         st.info("HF_TOKEN not provided — private models may fail to load.")
          return
      try:
          login(token=token)
+         st.success("Logged into Hugging Face Hub.")
      except Exception as e:
          st.warning(f"Hugging Face login failed: {e}")

+ # -------------------- Streamlit UI Setup --------------------
  st.set_page_config(page_title="AI Chatbot Assistant", page_icon="🤖", layout="wide")
  st.title("🤖 AI Chatbot Assistant")
+ st.write("Type your message in English and get a brief, accurate response from the AI model.")

  with st.sidebar:
+     st.header("Settings & Environment")
+     st.write("Configure these via environment variables before starting the app.")
      st.text(f"BASE_MODEL_ID: {BASE_MODEL_ID}")
      st.text(f"ADAPTER_REPO_ID: {ADAPTER_REPO_ID or 'None'}")
+     st.text(f"Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
      st.text(f"bitsandbytes installed: {is_package_installed('bitsandbytes')}")
+     st.markdown("---")
+     st.caption("Run with: streamlit run app.py")

+ # Attempt HF login for private repos
  try_login_hf(HF_TOKEN)

+ # -------------------- Model Loading (cached) --------------------
  @st.cache_resource(show_spinner=False)
+ def load_tokenizer_and_model():
+     """
+     Loads tokenizer and model (plus optional PEFT adapter).
+     Returns (tokenizer, model, device).
+     """
      device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = torch.bfloat16 if device == "cuda" else torch.float32  # use dtype param where supported

      BNB_AVAILABLE = is_package_installed("bitsandbytes")
      # Load tokenizer (prefer base)
      try:
          tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
          st.write("Tokenizer loaded from base model.")
      except Exception as e:
+         st.warning(f"Failed to load tokenizer from base model: {e}")
          if ADAPTER_REPO_ID:
              tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, use_fast=True)
              st.write("Tokenizer loaded from adapter repo.")
          else:
+             raise RuntimeError("Failed to load tokenizer and no adapter repo provided.")

+     # prepare device_map (never None)
      if torch.cuda.is_available():
          device_map = "auto"
      else:
+         device_map = {"": "cpu"}  # forces entire model to CPU (avoids NoneType iteration)
      st.write(f"Using device_map = {device_map}")

      base_kwargs = dict(
          low_cpu_mem_usage=True,
          device_map=device_map,
          trust_remote_code=True,
      )

+     # Use dtype param if supported: Transformers accepts dtype or torch_dtype depending on version.
+     # Try dtype first; if it fails, fall back to torch_dtype in the exception handler.
+     tried_dtype = False
      try:
+         base_kwargs["dtype"] = dtype
+         if BNB_AVAILABLE and torch.cuda.is_available():
+             base_kwargs["load_in_4bit"] = True
+             st.write("Attempting 4-bit load (bitsandbytes + CUDA detected).")
+         st.write("Loading base model (attempt using dtype)...")
          model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **base_kwargs)
+         tried_dtype = True
+     except TypeError as e:
+         # Older transformers versions might not accept the dtype kwarg; fall back to torch_dtype
+         st.write("dtype param not accepted by this transformers version; falling back to torch_dtype.")
+         base_kwargs.pop("dtype", None)
+         base_kwargs["torch_dtype"] = dtype
+         try:
+             if BNB_AVAILABLE and torch.cuda.is_available():
+                 base_kwargs["load_in_4bit"] = True
+                 st.write("Attempting 4-bit load (bitsandbytes + CUDA detected).")
+             st.write("Loading base model (fallback using torch_dtype)...")
+             model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **base_kwargs)
+         except Exception as e2:
+             raise RuntimeError(f"Failed to load base model (both dtype and torch_dtype attempts failed): {e2}")
      except Exception as e:
+         raise RuntimeError(f"Failed to load base model: {e}")
+
+     st.write("Base model loaded successfully.")

+     # Optional: apply PEFT adapter
      if ADAPTER_REPO_ID:
          try:
              try:
                  _ = PeftConfig.from_pretrained(ADAPTER_REPO_ID)
                  st.write("PEFT config loaded from adapter repo.")
              except Exception:
+                 st.write("Note: could not load PeftConfig (continuing to attempt adapter load).")
+
+             peft_kwargs = dict(device_map=device_map, low_cpu_mem_usage=True)
+             # The dtype chosen earlier already applies to the underlying model weights.
+             model = PeftModel.from_pretrained(model, ADAPTER_REPO_ID, **peft_kwargs)
              st.write("PEFT adapter loaded and applied.")
          except Exception as e:
              raise RuntimeError(f"Failed to load/apply PEFT adapter from {ADAPTER_REPO_ID}: {e}")

      return tokenizer, model, device

+ # Show spinner while loading (this call is cached)
+ with st.spinner("Loading tokenizer and model (may take a while)..."):
      try:
+         tokenizer, model, device = load_tokenizer_and_model()
      except Exception as e:
+         st.error(f"Model load failed: {e}")
          st.stop()

+ # -------------------- Chat State --------------------
  if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []  # list of (user, assistant)

+ # -------------------- Streaming generation using queue (safe for Streamlit) --------------------
+ def generation_worker(model, gen_kwargs, token_queue):
+     """
+     Worker runs in a background thread. Creates a TextIteratorStreamer and puts tokens into token_queue.
+     Does NOT call any Streamlit functions.
+     """
+     try:
+         # The TextIteratorStreamer yields text chunks
+         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+         gen_kwargs_local = gen_kwargs.copy()
+         gen_kwargs_local["streamer"] = streamer
+
+         # Run generation in its own thread so the streamer can be drained while tokens arrive
+         gen_thread = threading.Thread(target=model.generate, kwargs=gen_kwargs_local, daemon=True)
+         gen_thread.start()
+         # Forward tokens from the streamer into the queue as they are produced
+         for chunk in streamer:
+             token_queue.put({"token": chunk})
+         gen_thread.join()
+     except Exception as e:
+         token_queue.put({"error": str(e)})
+     finally:
+         # sentinel to mark completion
+         token_queue.put(None)

+ def stream_generate_and_update_ui(inputs, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, top_p=TOP_P):
      """
+     Starts a generation_worker thread and reads its output via a queue,
+     updating the Streamlit UI from the main thread only.
+     Returns the final generated text (string).
      """
+     token_queue = queue.Queue()
+     model_device = next(model.parameters()).device
+     gen_kwargs = dict(
+         input_ids=inputs["input_ids"].to(model_device),
+         attention_mask=inputs["attention_mask"].to(model_device) if "attention_mask" in inputs else None,
          max_new_tokens=max_new_tokens,
          do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
          eos_token_id=getattr(tokenizer, "eos_token_id", None),
      )

+     worker = threading.Thread(target=generation_worker, args=(model, gen_kwargs, token_queue), daemon=True)
+     worker.start()

+     # UI placeholder for streaming
+     placeholder = st.empty()
+     streamed_text = ""
      while True:
+         item = token_queue.get()  # blocking
+         if item is None:
              break
+         if "error" in item:
+             # Show error once and return what we have
+             with placeholder:
+                 st.error("Generation error: " + item["error"])
+             return streamed_text
+         token = item.get("token", "")
+         streamed_text += token
+         # Update UI: render whole conversation with streaming assistant reply appended
          with placeholder:
+             for u_msg, a_msg in st.session_state.chat_history[:-1]:
+                 st.markdown(f"**🧑 You:** {u_msg}")
+                 st.markdown(f"**🤖 Assistant:** {a_msg}")
+             # last user is placeholder with streaming assistant text
              last_user, _ = st.session_state.chat_history[-1]
              st.markdown(f"**🧑 You:** {last_user}")
+             st.markdown(f"**🤖 Assistant:** {streamed_text}")
+     # final display (ensures final content shown)
+     with placeholder:
+         for u_msg, a_msg in st.session_state.chat_history[:-1]:
+             st.markdown(f"**🧑 You:** {u_msg}")
+             st.markdown(f"**🤖 Assistant:** {a_msg}")
          last_user, _ = st.session_state.chat_history[-1]
          st.markdown(f"**🧑 You:** {last_user}")
+         st.markdown(f"**🤖 Assistant:** {streamed_text}")
+
+     return streamed_text

+ # -------------------- Input / Buttons --------------------
+ user_input = st.text_area("Your message (English):", height=120, key="user_input")
+ col1, col2 = st.columns([1, 1])
+ with col1:
+     send_btn = st.button("Send")
+ with col2:
+     clear_btn = st.button("Clear chat")

+ # -------------------- Handlers --------------------
  if send_btn:
      if not user_input or not user_input.strip():
          st.warning("Please type a message before sending.")
      else:
+         user_text = user_input.strip()
+         # append placeholder for assistant reply
+         st.session_state.chat_history.append((user_text, ""))

+         # Build prompt from history
          system_prompt = "You are a helpful assistant. Answer briefly and accurately in English."
          prompt_lines = [system_prompt]
          for u, a in st.session_state.chat_history:

          prompt_lines.append("Assistant: ")
          final_prompt = "\n".join(prompt_lines)

+         # tokenize; tensors are moved to the model device inside the stream function
          inputs = tokenizer(final_prompt, return_tensors="pt")

+         # Stream generate and update UI in main thread
          try:
+             reply_text = stream_generate_and_update_ui(inputs, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, top_p=TOP_P)
          except Exception as e:
              st.error(f"Generation failed: {e}")
              reply_text = "Error generating response."

+         # replace last placeholder assistant reply with final reply_text
+         st.session_state.chat_history[-1] = (user_text, reply_text)
+         # clear the input
          st.session_state.user_input = ""
+         # Rerun to refresh state display
+         st.experimental_rerun()

  if clear_btn:
      st.session_state.chat_history = []
      st.experimental_rerun()

+ # -------------------- Display chat history (static on page load) --------------------
+ if st.session_state.chat_history:
+     for u_msg, a_msg in st.session_state.chat_history:
+         st.markdown(f"**🧑 You:** {u_msg}")
+         st.markdown(f"**🤖 Assistant:** {a_msg}")

+ # -------------------- Footer / Tips --------------------
  st.markdown("---")
+ st.caption("Tips: Run the app with `streamlit run app.py`. If using a 4-bit model (model name ends with '-bnb-4bit'), install bitsandbytes and run on a CUDA-enabled GPU. If model loading fails, check HF_TOKEN and adapter repo access.")