# Install these first if needed:
# pip install gradio bert-score transformers

import gradio as gr
from bert_score import score


def compute_bertscore(candidate, reference, model_type, lang, rescale_with_baseline):
    """Compute BERTScore precision, recall, and F1 for one candidate/reference pair."""
    if not candidate.strip() or not reference.strip():
        # Leave the score boxes blank and show a hint in the F1 field instead.
        return "—", "—", "Please enter both reference and candidate text."

    # BERTScore expects lists of strings
    cands = [candidate]
    refs = [reference]

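    # score() returns three tensors (precision, recall, F1) with one entry per
    # candidate/reference pair. The first call downloads the selected model from
    # the Hugging Face Hub, so it can take a while; later calls reuse the cache.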
    P, R, F1 = score(
        cands,
        refs,
        lang=lang,
        model_type=model_type,
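        # Rescaling spreads the raw similarity scores over a more interpretable
        # range; it uses a precomputed baseline that bert-score ships for the
        # chosen model/language pair.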
        rescale_with_baseline=rescale_with_baseline,
    )

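    # Each returned tensor has one entry per input pair; here there is just one.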
    precision = f"{P[0].item():.4f}"
    recall = f"{R[0].item():.4f}"
    f1 = f"{F1[0].item():.4f}"

    return precision, recall, f1


with gr.Blocks() as demo:
    gr.Markdown(
        """
    # πŸ” BERTScore Demo

    **BERTScore** evaluates the quality of generated text by comparing contextual
    embeddings of the candidate and reference texts, computed with models like BERT.

    Unlike n-gram metrics (e.g., BLEU), BERTScore focuses on **semantic similarity**  
    and is often better at capturing whether **meaning is preserved**.

    1. Enter a **reference** text (ground truth).
    2. Enter a **candidate** text (model output or paraphrase).
    3. Click **Compute BERTScore**.
    """
    )

    with gr.Row():
        reference_input = gr.Textbox(
            label="Reference Text (Ground Truth)",
            lines=5,
            placeholder="e.g., The quick brown fox jumps over the lazy dog.",
        )
        candidate_input = gr.Textbox(
            label="Candidate Text (Generated/Paraphrased)",
            lines=5,
            placeholder="e.g., A fast brown fox leaps over a sleepy dog.",
        )

    with gr.Row():
        model_type = gr.Dropdown(
            label="Embedding Model",
            choices=[
                "microsoft/deberta-large-mnli",
                "bert-base-uncased",
                "roberta-large",
            ],
            value="microsoft/deberta-large-mnli",
            info="Recommended: microsoft/deberta-large-mnli for English",
        )

        lang = gr.Dropdown(
            label="Language Code",
            choices=["en", "de", "fr", "es", "zh"],
            value="en",
            info="Language of the texts (ISO code).",
        )

        rescale_with_baseline = gr.Checkbox(
            label="Rescale with Baseline (recommended for comparing scores)",
            value=True,
        )

    compute_button = gr.Button("Compute BERTScore", variant="primary")

    with gr.Row():
        precision_output = gr.Textbox(
            label="Precision", interactive=False, value="—"
        )
        recall_output = gr.Textbox(
            label="Recall", interactive=False, value="—"
        )
        f1_output = gr.Textbox(
            label="F1 (Main BERTScore Metric)", interactive=False, value="—"
        )

    compute_button.click(
        fn=compute_bertscore,
        inputs=[candidate_input, reference_input, model_type, lang, rescale_with_baseline],
        outputs=[precision_output, recall_output, f1_output],
    )

if __name__ == "__main__":
    demo.launch()