"""Streamlit app: generate sentence embeddings with BERT variants or ELMo.

The user picks a model, types a sentence and/or uploads a CSV containing a
``sentence`` column, and gets back one embedding vector per sentence, shown
as a preview table and offered as a CSV download.
"""
import io

import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
import tensorflow_hub as hub
import torch
from transformers import BertModel, BertTokenizer

# ---------------- Model options ----------------
# Display name -> Hugging Face model id (BERT entries) or TF-Hub URL (ELMo).
models = {
    "BERT Base Uncased": "bert-base-uncased",
    "BERT Base Cased": "bert-base-cased",
    "BERT Large Uncased": "bert-large-uncased",
    "BERT Large Cased": "bert-large-cased",
    "ELMo": "https://tfhub.dev/google/elmo/3",
}
keys = list(models.keys())

# Number of sentences pushed through BERT per forward pass. Bounds peak memory
# when a large CSV is uploaded.
BATCH_SIZE = 32

# ---------------- Streamlit UI ----------------
st.title("📝 Generate Sentence Embeddings (BERT + ELMo)")

choice = st.selectbox("Choose Model:", options=keys, index=0)

st.markdown("### Enter a sentence (or upload CSV below):")
# The markdown heading above acts as the visible label; the widget still gets
# a non-empty label (hidden) so Streamlit does not warn about an empty one.
text = st.text_area("Enter a sentence", height=150, label_visibility="collapsed")

st.markdown("---")
file = st.file_uploader(
    "📂 Upload a CSV with a 'sentence' column for bulk analysis",
    type=["csv"],
)


# ---------------- Cached loaders ----------------
@st.cache_resource
def load_bert(model_name: str):
    """Load and cache the tokenizer and model for *model_name*.

    Args:
        model_name: Hugging Face model identifier (a value from ``models``).

    Returns:
        A ``(tokenizer, model)`` tuple; the model is switched to eval mode.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()
    return tokenizer, model


@st.cache_resource
def load_elmo():
    """Load and cache the ELMo TF-Hub module as a non-trainable Keras layer."""
    return hub.KerasLayer(models["ELMo"], trainable=False)


# ---------------- Embedding helpers ----------------
def _masked_mean_pool(hidden: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Average token vectors over real (non-padding) positions only.

    Args:
        hidden: Token embeddings; the sequence axis is ``dim=1``.
        mask: Tokenizer attention mask, 1 for real tokens and 0 for padding,
            with one entry per token position in *hidden*.

    Returns:
        One pooled vector per input row.
    """
    weights = mask.unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * weights).sum(dim=1)
    # Guard against division by zero for a (theoretical) all-padding row.
    counts = weights.sum(dim=1).clamp(min=1.0)
    return summed / counts


def _embed_with_bert(sentences, bert_tokenizer, bert_model, batch_size: int = BATCH_SIZE) -> np.ndarray:
    """Return one mean-pooled BERT embedding per sentence.

    Sentences are processed in batches of *batch_size*. Pooling ignores
    padding positions, so a sentence's embedding does not depend on which
    other sentences share its batch.

    Args:
        sentences: Non-empty list of strings.
        bert_tokenizer: Tokenizer returned by ``load_bert``.
        bert_model: Model returned by ``load_bert``.
        batch_size: Maximum number of sentences per forward pass.

    Returns:
        A 2-D array with one row per sentence.
    """
    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = bert_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = bert_model(**inputs)
            pooled = _masked_mean_pool(
                outputs.last_hidden_state, inputs["attention_mask"]
            )
        pooled_batches.append(pooled.numpy())
    return np.concatenate(pooled_batches, axis=0)


# ---------------- Load selected model ----------------
if "BERT" in choice:
    tokenizer, model = load_bert(models[choice])
    st.write(f"✅ Loaded {choice}")
elif choice == "ELMo":
    elmo = load_elmo()
    st.write("✅ Loaded ELMo")

# ---------------- Analyze ----------------
clicked = st.button("🔍 Generate Embeddings")
has_input = bool(text.strip() or file)

if clicked and not has_input:
    # Previously a click without input was silently ignored.
    st.warning("Please enter a sentence or upload a CSV first.")

if clicked and has_input:
    sentences = []
    sources = []

    # Single sentence input
    if text.strip():
        sentences.append(text.strip())
        sources.append("Single sentence")

    # CSV input
    if file:
        try:
            df = pd.read_csv(file)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ) as exc:
            st.error(f"Could not read CSV: {exc}")
            st.stop()
        if "sentence" in df.columns:
            csv_sentences = df["sentence"].dropna().astype(str).tolist()
            sentences.extend(csv_sentences)
            sources.extend(["CSV"] * len(csv_sentences))
        else:
            st.error("CSV must have a 'sentence' column")
            st.stop()

    # A CSV whose 'sentence' column is entirely empty leaves nothing to embed;
    # the tokenizer would otherwise fail on an empty list.
    if not sentences:
        st.error("No sentences found to embed.")
        st.stop()

    embeddings = []

    # --------- Generate embeddings ---------
    if "BERT" in choice:
        embeddings = _embed_with_bert(sentences, tokenizer, model)
        st.success(
            f"✅ Generated embeddings for {len(sentences)} sentence(s) with {choice}"
        )
    elif choice == "ELMo":
        inputs_tf = tf.convert_to_tensor(sentences, dtype=tf.string)
        emb_tensor = elmo(inputs_tf)
        # The layer may return a dict of outputs; use its "default" entry.
        if isinstance(emb_tensor, dict):
            emb_tensor = emb_tensor["default"]
        embeddings = emb_tensor.numpy()
        st.success(
            f"✅ Generated embeddings for {len(sentences)} sentence(s) with ELMo"
        )

    embeddings = np.array(embeddings)

    # --------- Show summary ---------
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # summary text is stable between runs (a set is not).
    st.write(f"**Source:** {', '.join(dict.fromkeys(sources))}")
    st.write(f"**Model:** {choice}")
    st.write(f"**Embeddings shape:** {embeddings.shape}")

    # --------- Build the result table once; reuse for preview + download ---------
    df_emb = pd.DataFrame(embeddings)
    df_emb.insert(0, "sentence", sentences)
    df_emb.insert(1, "source", sources)

    # --------- Preview embeddings (first 5 rows) ---------
    st.markdown("### 🔹 Preview of Embeddings (first 5 rows)")
    st.dataframe(df_emb.head())

    # --------- Prepare CSV for download (UTF-8) ---------
    # to_csv() without a path returns a str (its encoding argument has no
    # effect there), so encode explicitly to hand the browser UTF-8 bytes.
    csv_data = df_emb.to_csv(index=False).encode("utf-8")

    st.download_button(
        label="💾 Download Sentences + Embeddings",
        data=csv_data,
        file_name="sentences_embeddings.csv",
        mime="text/csv",
    )