import streamlit as st
from transformers import BertTokenizer, BertModel
import tensorflow_hub as hub
import torch
import tensorflow as tf
import pandas as pd
import numpy as np
import io

# ---------------- Model options ----------------
# Display label -> model identifier. The BERT identifiers are handed to
# `from_pretrained`; the ELMo identifier is the URL given to `hub.KerasLayer`.
models = dict(
    [
        ("BERT Base Uncased", "bert-base-uncased"),
        ("BERT Base Cased", "bert-base-cased"),
        ("BERT Large Uncased", "bert-large-uncased"),
        ("BERT Large Cased", "bert-large-cased"),
        ("ELMo", "https://tfhub.dev/google/elmo/3"),
    ]
)

# Display labels in insertion order, used as the dropdown options.
keys = [*models]

# ---------------- Streamlit UI ----------------
st.title("📝 Generate Sentence Embeddings (BERT + ELMo)")

# Model picker; the first entry of `keys` is preselected.
choice = st.selectbox("Choose Model:", options=keys, index=0)

st.markdown("### Enter a sentence (or upload CSV below):")
# The markdown heading above serves as the visible caption, so the widget's
# own label is hidden instead of being left empty: Streamlit discourages
# empty labels (accessibility) and emits a warning for them.
text = st.text_area("Sentence", height=150, label_visibility="collapsed")

st.markdown("---")
# Optional bulk input; only the 'sentence' column of the upload is used.
file = st.file_uploader("📂 Upload a CSV with a 'sentence' column for bulk analysis", type=["csv"])

# ---------------- Cached loaders ----------------
@st.cache_resource
def load_bert(model_name: str):
    """Load the BERT tokenizer and encoder for ``model_name``.

    The encoder is switched to evaluation mode before being returned.
    The function is wrapped in ``st.cache_resource``, so the loaded pair
    is reused instead of being reloaded for the same ``model_name``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
    # `eval()` returns the module itself, so the call can be chained.
    bert_encoder = BertModel.from_pretrained(model_name).eval()
    return bert_tokenizer, bert_encoder

@st.cache_resource
def load_elmo():
    """Build a non-trainable TF Hub Keras layer from the ELMo URL in ``models``.

    Wrapped in ``st.cache_resource`` so the layer is created once and reused.
    """
    elmo_url = models["ELMo"]
    return hub.KerasLayer(elmo_url, trainable=False)

# ---------------- Load selected model ----------------
# Only the backend matching the dropdown selection is loaded. The two
# conditions are mutually exclusive, so the order of the checks is irrelevant.
if choice == "ELMo":
    elmo = load_elmo()
    st.write("✅ Loaded ELMo")
elif "BERT" in choice:
    tokenizer, model = load_bert(models[choice])
    st.write(f"✅ Loaded {choice}")

# ---------------- Analyze ----------------
# Sentences per BERT forward pass; keeps peak memory bounded on large uploads.
_BERT_BATCH_SIZE = 32


def _masked_mean_pool(hidden_states, attention_mask):
    """Average token vectors per sentence, ignoring padding positions.

    Args:
        hidden_states: Float tensor indexed as (sentence, token, feature).
        attention_mask: Tensor indexed as (sentence, token); 1 marks a real
            token and 0 marks padding.

    Returns:
        Tensor holding one pooled vector per sentence.
    """
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * weights).sum(dim=1)
    # Guard against division by zero should a row contain no real tokens.
    token_count = weights.sum(dim=1).clamp(min=1.0)
    return token_sum / token_count


def _embed_with_bert(sentences, bert_tokenizer, bert_model):
    """Return a 2-D NumPy array with one BERT embedding row per sentence.

    Sentences are encoded in batches of ``_BERT_BATCH_SIZE``. Pooling uses the
    attention mask, so a sentence's vector does not depend on how much padding
    its batch neighbours forced onto it.
    """
    pooled_batches = []
    for start in range(0, len(sentences), _BERT_BATCH_SIZE):
        batch = sentences[start:start + _BERT_BATCH_SIZE]
        inputs = bert_tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        pooled = _masked_mean_pool(outputs.last_hidden_state, inputs["attention_mask"])
        pooled_batches.append(pooled.numpy())
    return np.concatenate(pooled_batches, axis=0)


def _embed_with_elmo(sentences, elmo_layer):
    """Return the ELMo layer's output for ``sentences`` as a NumPy array."""
    inputs_tf = tf.convert_to_tensor(sentences, dtype=tf.string)
    emb_tensor = elmo_layer(inputs_tf)
    # Some layer configurations return a dict of outputs; use the default one.
    if isinstance(emb_tensor, dict):
        emb_tensor = emb_tensor["default"]
    return emb_tensor.numpy()


def _read_csv_sentences(uploaded_file):
    """Return the non-blank entries of the upload's 'sentence' column.

    Shows an ``st.error`` message and halts the script run with ``st.stop``
    when the file cannot be parsed or lacks a 'sentence' column.
    """
    try:
        frame = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV: {exc}")
        st.stop()
    if "sentence" not in frame.columns:
        st.error("CSV must have a 'sentence' column")
        st.stop()
    cleaned = frame["sentence"].dropna().astype(str).str.strip()
    # Blank rows carry no text to embed, so they are dropped.
    return [sentence for sentence in cleaned if sentence]


if st.button("🔍 Generate Embeddings"):
    typed_sentence = text.strip()

    if not typed_sentence and file is None:
        # A click without any input used to be ignored silently.
        st.warning("Please enter a sentence or upload a CSV first.")
    else:
        sentences = []
        sources = []

        # Single sentence input
        if typed_sentence:
            sentences.append(typed_sentence)
            sources.append("Single sentence")

        # CSV input
        if file is not None:
            csv_sentences = _read_csv_sentences(file)
            sentences.extend(csv_sentences)
            sources.extend(["CSV"] * len(csv_sentences))

        if not sentences:
            # Reached when the CSV's 'sentence' column has no usable rows.
            st.error("No non-empty sentences found in the input.")
            st.stop()

        # --------- Generate embeddings ---------
        if "BERT" in choice:
            embeddings = _embed_with_bert(sentences, tokenizer, model)
            st.success(f"✅ Generated embeddings for {len(sentences)} sentence(s) with {choice}")
        elif choice == "ELMo":
            embeddings = _embed_with_elmo(sentences, elmo)
            st.success(f"✅ Generated embeddings for {len(sentences)} sentence(s) with ELMo")
        else:
            st.error(f"Unsupported model: {choice}")
            st.stop()

        embeddings = np.asarray(embeddings)

        # --------- Show summary ---------
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label is stable between runs (iterating a set is not).
        unique_sources = list(dict.fromkeys(sources))
        st.write(f"**Source:** {', '.join(unique_sources)}")
        st.write(f"**Model:** {choice}")
        st.write(f"**Embeddings shape:** {embeddings.shape}")

        # --------- Build the result table once; reuse for preview + download ---------
        df_emb = pd.DataFrame(embeddings)
        df_emb.insert(0, "sentence", sentences)
        df_emb.insert(1, "source", sources)

        st.markdown("### 🔹 Preview of Embeddings (first 5 rows)")
        st.dataframe(df_emb.head())

        # --------- Prepare CSV for download (UTF-8) ---------
        # `to_csv` returns a str when no path is given, so the encoding is
        # applied explicitly here to hand the browser UTF-8 bytes.
        csv_data = df_emb.to_csv(index=False).encode("utf-8")

        # NOTE(review): clicking the download button likely triggers a rerun
        # in which st.button() is False, hiding this section; persist results
        # in st.session_state if they should stay visible - confirm.
        st.download_button(
            label="💾 Download Sentences + Embeddings",
            data=csv_data,
            file_name="sentences_embeddings.csv",
            mime="text/csv"
        )