Spaces:

UMCU
/

MLMtester

Sleeping

App Files Files Community

UMCU committed on Jun 22

Commit

4e88cbb

verified ·

1 Parent(s): e76a11a

Upload 4 files

Browse files

Files changed (3) hide show

app.py +94 -0
poetry.lock +0 -0
pyproject.toml +21 -0

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import gradio as gr
+from transformers import pipeline
+from transformers import AutoTokenizer
+# Cache for loaded pipelines to avoid reloading
+pipeline_cache = {}
+# List of available masked language models
+def get_model_choices():
+    return [
+        "UMCU/CardioMedRoBERTa.nl",
+        "UMCU/CardioBERTa_base.nl",
+        "UMCU/CardioBERTa.nl_clinical",
+        "UMCU/CardioDeBERTa.nl",
+        "UMCU/CardioDeBERTa.nl_clinical",
+        #"UMCU/CardioBigBird_base.nl",
+        "CLTL/MedRoBERTa.nl",
+        "DTAI-KULeuven/robbert-2023-dutch-base",
+        "DTAI-KULeuven/robbert-2023-dutch-large",
+        "joeranbosma/dragon-bert-base-mixed-domain",
+        "joeranbosma/dragon-bert-base-domain-specific",
+        "joeranbosma/dragon-roberta-base-mixed-domain",
+        "joeranbosma/dragon-roberta-large-mixed-domain",
+        "joeranbosma/dragon-roberta-base-domain-specific",
+        "joeranbosma/dragon-roberta-large-domain-specific",
+        "joeranbosma/dragon-longformer-base-mixed-domain",
+        "joeranbosma/dragon-longformer-large-mixed-domain",
+        "joeranbosma/dragon-longformer-base-domain-specific",
+        "joeranbosma/dragon-longformer-large-domain-specific"
+    ]
+# Define the prediction function with top-k parameter
+def fill_masked(text: str, model_name: str, top_k: int):
+    """
+    Takes text with [MASK] tokens, a model name, and top_k, returns top predictions.
+    """
+    # Load the pipeline if not already cached
+    if model_name not in pipeline_cache:
+        pipeline_cache[model_name] = pipeline(
+            "fill-mask",
+            model=model_name
+        )
+    fill_mask = pipeline_cache[model_name]
+    # Get top_k predictions
+    # make sure the mask format is correct
+    # [MASK] for BERT and DeBERTa
+    # <mask> for BigBird, LongFormer, RoBERTa and XLM-RoBERTa
+    #
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    mask_token = tokenizer.mask_token
+    text = text.replace("[MASK]", mask_token)
+    results = fill_mask(text, top_k=top_k)
+    # Format results for display
+    formatted = []
+    for res in results:
+        formatted.append({
+            "sequence": res["sequence"],
+            "score": round(res["score"], 4),
+            "token": res["token_str"]
+        })
+    return formatted
+# Build the Gradio interface with a slider for top-k
+iface = gr.Interface(
+    fn=fill_masked,
+    inputs=[
+        gr.Textbox(
+            lines=2,
+            placeholder="Type text with [MASK] tokens here...",
+            label="Masked Text"
+        ),
+        gr.Dropdown(
+            choices=get_model_choices(),
+            value="bert-base-uncased",
+            label="Model"
+        ),
+        gr.Slider(
+            minimum=1,
+            maximum=20,
+            step=1,
+            value=5,
+            label="Top K Predictions"
+        )
+    ],
+    outputs=gr.JSON(label="Predictions"),
+    title="Masked Language Model tester",
+    description="Enter a sentence with [MASK] tokens, select a model, and choose how many top predictions to return."
+)
+if __name__ == "__main__":
+    iface.launch()

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,21 @@

+[project]
+name = "mlmtester"
+version = "0.1.0"
+description = ""
+authors = [
+    {name = "Bram van Es",email = "[email protected]"}
+]
+license = {text = "gpl-3"}
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "transformers (>=4.52.4,<5.0.0)",
+    "gradio (>=5.34.2,<6.0.0)",
+    "torch (>=2.7.1,<3.0.0)",
+    "protobuf (>=6.31.1,<7.0.0)"
+]
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"