Initial commit of WimBERT-synth-v0
Browse files
- .gitattributes +1 -0
- MODEL_CARD.md +121 -0
- config.json +46 -0
- dual_head_state.pt +3 -0
- inference_mmbert_hf_example.py +61 -0
- label_names.json +128 -0
- model.safetensors +3 -0
- requirements.txt +93 -0
- special_tokens_map.json +55 -0
- tokenizer.json +3 -0
- tokenizer_config.json +2018 -0
- train/rd_dataset_loader.py +138 -0
- train/train_mmbert_dual_soft_f1_simplified.py +953 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
MODEL_CARD.md
ADDED
@@ -0,0 +1,121 @@
# WimBERT v0

WimBERT is a dual-head, multi-label classifier for Dutch municipal complaint messages.
The model uses a shared mmBERT-base encoder with two MLP heads:
- Onderwerp (topics): 96 labels
- Beleving (experience): 26 labels

Trained with a combined objective: alpha · (1 − Soft-F1) + (1 − alpha) · BCE.

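For intuition, the Soft-F1 term can be read as one minus a differentiable per-class F1 computed from sigmoid probabilities. A minimal sketch, not the training script; in particular, applying the temperature as a multiplier on the logits is an assumption:

```python
import torch
import torch.nn.functional as F

def soft_f1_loss(logits: torch.Tensor, targets: torch.Tensor,
                 temperature: float = 2.0, eps: float = 1e-8) -> torch.Tensor:
    """1 - mean per-class soft F1, computed from soft counts over the batch."""
    probs = torch.sigmoid(logits * temperature)  # temperature placement is an assumption
    tp = (probs * targets).sum(dim=0)            # soft true positives per class
    fp = (probs * (1 - targets)).sum(dim=0)      # soft false positives per class
    fn = ((1 - probs) * targets).sum(dim=0)      # soft false negatives per class
    soft_f1 = 2 * tp / (2 * tp + fp + fn + eps)
    return 1 - soft_f1.mean()

def combined_loss(logits: torch.Tensor, targets: torch.Tensor, alpha: float = 0.15) -> torch.Tensor:
    """alpha · (1 − Soft-F1) + (1 − alpha) · BCE, as in the model card."""
    bce = F.binary_cross_entropy_with_logits(logits, targets)
    return alpha * soft_f1_loss(logits, targets) + (1 - alpha) * bce
```
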
## Overview

- Encoder: mmBERT-base (multilingual)
- Heads: 2× MLP (Linear → Dropout → ReLU → Linear)
- Labels: 96 onderwerp, 26 beleving
- Task: Multi-label classification (sigmoid per class)
- Thresholds: Disabled (fixed 0.5 used for evaluation/inference; see the sketch below)

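Concretely, each head outputs one independent probability per class, and a label is assigned whenever that probability clears the fixed cutoff. A minimal sketch (the names are illustrative):

```python
import torch

def predict_labels(probs: torch.Tensor, names: list[str], threshold: float = 0.5) -> list[str]:
    """Return the label names whose per-class probability clears the cutoff."""
    return [names[i] for i, p in enumerate(probs.tolist()) if p >= threshold]
```
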
## Intended Use

- Classify incoming Dutch complaint messages into topical (onderwerp) and experiential (beleving) labels.
- Useful for analytics, routing, and trend insights. Not intended for legal or benefit decisions without human review.

## Training Data

- Source: `UWV/wim-synthetic-data-rd` (train split)
- Samples: 9,351
- Labels: 96 onderwerp, 26 beleving
- Avg labels per sample: onderwerp 1.75, beleving 1.89
- Shapes: onderwerp (9351, 96), beleving (9351, 26)
- Train/Val split: 7,480 / 1,871 (80/20)

## Training Setup

- Date: 2025-10-20
- Hardware: NVIDIA A100 GPU
- Epochs: 15
- Batch size: 16
- Sequence length: 1,408 tokens
- Optimizer: AdamW
- Scheduler: linear warmup (10%) → cosine annealing, `min_lr=1e-6` (sketched below)
- Gradient clipping: max_norm 1.0
- Random seed: 42

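A rough sketch of that optimizer/schedule pairing using PyTorch's built-in schedulers; a single parameter group at the encoder peak LR is an assumption, and the actual training script may treat encoder and heads differently:

```python
import torch

def build_optimizer_and_scheduler(model, total_steps: int, peak_lr: float = 8e-5,
                                  warmup_ratio: float = 0.1, min_lr: float = 1e-6):
    # AdamW over all parameters (per-group LRs for the heads are not given in this card).
    optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr)
    warmup_steps = int(total_steps * warmup_ratio)
    # Linear warmup to the peak LR, then cosine annealing down to min_lr.
    warmup = torch.optim.lr_scheduler.LinearLR(
        optimizer, start_factor=0.01, end_factor=1.0, total_iters=warmup_steps)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=total_steps - warmup_steps, eta_min=min_lr)
    scheduler = torch.optim.lr_scheduler.SequentialLR(
        optimizer, schedulers=[warmup, cosine], milestones=[warmup_steps])
    return optimizer, scheduler
```
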
### Hyperparameters

- alpha (F1 weight): 0.15
- dropout: 0.20
- encoder peak LR: 8e-5
- temperature (Soft-F1): 2.0
- learnable thresholds: false
- initial_threshold: 0.565 (not used; thresholds disabled)
- threshold LR mult: 5.0 (not used; thresholds disabled)

## Metrics

Final validation (500 samples):

- Onderwerp:
  - Accuracy: 99.8%
  - Precision: 0.960
  - Recall: 0.905
  - F1: 0.932
- Beleving:
  - Accuracy: 97.1%
  - Precision: 0.859
  - Recall: 0.730
  - F1: 0.789
- Combined:
  - Average Accuracy: 98.4%
  - Average F1: 0.861

## Saved Artifacts

- HF-compatible files:
  - `model.safetensors` — encoder weights
  - `config.json` — encoder config
  - `tokenizer.json`, `tokenizer_config.json`, `special_tokens_map.json` — tokenizer
- `dual_head_state.pt` — classification heads + metadata (no thresholds included when disabled)
- `label_names.json` — label names for both heads
- `inference_mmbert_hf_example.py` — example inference script (CLI)

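For reference, a hypothetical sketch of how such a bundle could be written; the key names match what the inference code below reads, while the function itself is illustrative:

```python
import torch

def save_dual_head_state(path, onderwerp_head, beleving_head,
                         n_on=96, n_be=26, dropout=0.20, max_length=1408):
    # Heads plus metadata in one file; thresholds omitted because they are disabled.
    torch.save({
        "num_onderwerp": n_on,
        "num_beleving": n_be,
        "dropout": dropout,
        "max_length": max_length,
        "onderwerp_head_state": onderwerp_head.state_dict(),
        "beleving_head_state": beleving_head.state_dict(),
    }, path)
```
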
## Inference

Quick start (script):

- `python inference_mmbert_hf_example.py [model_dir] "Uw voorbeeldzin hier"` (`model_dir` defaults to `.`)

Minimal code (probabilities + top-k):

```python
import os, json, torch, torch.nn as nn
from transformers import AutoModel, AutoTokenizer

model_dir = "."
tok = AutoTokenizer.from_pretrained(model_dir)
enc = AutoModel.from_pretrained(model_dir).eval()
state = torch.load(os.path.join(model_dir, "dual_head_state.pt"), map_location="cpu")
with open(os.path.join(model_dir, "label_names.json")) as f:
    labels = json.load(f)

hidden = enc.config.hidden_size
drop = float(state.get("dropout", 0.1))
n_on, n_be = int(state["num_onderwerp"]), int(state["num_beleving"])
on_head = nn.Sequential(nn.Linear(hidden, hidden), nn.Dropout(drop), nn.ReLU(), nn.Linear(hidden, n_on)).eval()
be_head = nn.Sequential(nn.Linear(hidden, hidden), nn.Dropout(drop), nn.ReLU(), nn.Linear(hidden, n_be)).eval()
on_head.load_state_dict(state["onderwerp_head_state"])
be_head.load_state_dict(state["beleving_head_state"])

text = "Goedemiddag, ik heb al drie keer gebeld over mijn uitkering ..."
enc_inputs = tok(text, truncation=True, padding="max_length",
                 max_length=int(state.get("max_length", 512)), return_tensors="pt")
with torch.inference_mode():  # no gradients needed at inference time
    pooled = enc(**enc_inputs).last_hidden_state[:, 0, :]
    on_probs = torch.sigmoid(on_head(pooled))[0]
    be_probs = torch.sigmoid(be_head(pooled))[0]

topk = lambda p, names, k=5: [(names[i], float(p[i])) for i in torch.topk(p, k=min(k, len(p))).indices]
print("Onderwerp:", topk(on_probs, labels["onderwerp"]))
print("Beleving:", topk(be_probs, labels["beleving"]))
```

## Limitations & Risks

- Domain: Dutch complaint messages; performance may degrade out-of-domain or in other languages.
- Thresholding: no learned thresholds; the 0.5 cutoff is a simple heuristic.
- Label imbalance and multi-label ambiguity can affect precision/recall trade-offs.

## Reproduction

- Script: `train_mmbert_dual_soft_f1_simplified.py`
- Env: see `requirements.txt` (PyTorch, Transformers, Datasets, wandb)
- Key config: seed 42, batch size 16, epochs 13, max_length 1408, α=0.15, encoder_peak_lr=8e-5, warmup_ratio=0.1, min_lr=1e-6.

## Acknowledgements

- UWV WIM synthetic RD dataset
- Hugging Face Transformers/Datasets
config.json
ADDED
@@ -0,0 +1,46 @@
{
  "architectures": [
    "ModernBertModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 1,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "dtype": "float32",
  "embedding_dropout": 0.0,
  "eos_token_id": 1,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 160000,
  "mask_token_id": 4,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 0,
  "position_embedding_type": "sans_pos",
  "repad_logits_with_grad": false,
  "sep_token_id": 1,
  "sparse_pred_ignore_index": -100,
  "sparse_prediction": false,
  "transformers_version": "4.57.1",
  "vocab_size": 256000
}
dual_head_state.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1823023519e9bf2f730708ef03835144a46fd1624557c6c63af6efd305e9e818
size 5103997
inference_mmbert_hf_example.py
ADDED
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
"""
Concise inference: load HF bundle and predict on one text.
Usage:
  python inference_mmbert_hf_example.py [model_dir] [text]
Defaults:
  model_dir = .
  text = simple Dutch example
"""

import os, sys, json, torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer


def main():
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    text = sys.argv[2] if len(sys.argv) > 2 else (
        "Het is echt NIET te doen hier!!! Door dat hele filmfestival zijn er elke avond mensen aan het schreeuwen en harde muziek tot laat Ik kan gewoon niet meer slapen Hoe is dit ooit goedgekeurd zo vlak na de feestdagen?????? Heb al beelden gemaakt als bewijs, kan ik die ergens heen sturen?? Het moet toch snellre opgelost kunnen worden dan dit, het duurt allemaal veel te lang Kunnen jullie dr ff naar kijken????"
    )

    device = torch.device(
        "cuda" if torch.cuda.is_available()
        else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
    )

    # Load encoder + tokenizer + heads metadata
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    encoder = AutoModel.from_pretrained(model_dir).to(device).eval()
    state = torch.load(os.path.join(model_dir, "dual_head_state.pt"), map_location="cpu")
    with open(os.path.join(model_dir, "label_names.json")) as f:
        labels = json.load(f)

    hidden = encoder.config.hidden_size
    n_on, n_be = int(state["num_onderwerp"]), int(state["num_beleving"])
    drop = float(state.get("dropout", 0.1))
    max_len = int(state.get("max_length", 512))

    # Rebuild heads and load weights
    onderwerp_head = nn.Sequential(nn.Linear(hidden, hidden), nn.Dropout(drop), nn.ReLU(), nn.Linear(hidden, n_on)).to(device).eval()
    beleving_head = nn.Sequential(nn.Linear(hidden, hidden), nn.Dropout(drop), nn.ReLU(), nn.Linear(hidden, n_be)).to(device).eval()
    onderwerp_head.load_state_dict(state["onderwerp_head_state"], strict=True)
    beleving_head.load_state_dict(state["beleving_head_state"], strict=True)

    # Encode and predict
    with torch.inference_mode():
        enc = tokenizer(text, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
        input_ids, attn = enc["input_ids"].to(device), enc["attention_mask"].to(device)
        pooled = encoder(input_ids=input_ids, attention_mask=attn).last_hidden_state[:, 0, :]
        on_probs = torch.sigmoid(onderwerp_head(pooled))[0].cpu()
        be_probs = torch.sigmoid(beleving_head(pooled))[0].cpu()

    # Top-5 per head (probability)
    def topk(probs, names, k=5):
        idx = torch.topk(probs, k=min(k, len(probs))).indices.tolist()
        return [(names[i], float(probs[i])) for i in idx]

    print(f"Onderwerp top-5: {[f'{n}: {p:.3f}' for n, p in topk(on_probs, labels['onderwerp'])]}")
    print(f"Beleving top-5:  {[f'{n}: {p:.3f}' for n, p in topk(be_probs, labels['beleving'])]}")
    print(f"Device: {device} | max_length: {max_len} | model_dir: {model_dir}")


if __name__ == "__main__":
    main()
label_names.json
ADDED
@@ -0,0 +1,128 @@
{
  "onderwerp": [
    "Advies",
    "Algemene veiligheid",
    "Begeleiding",
    "Begeleiding bij- en naar werk",
    "Bereikbaarheid afvalcontainers",
    "Bereikbaarheid/beschikbaarheid parkeerplek",
    "Betalingsregeling",
    "Bijstand",
    "Bouwoverlast",
    "Bruikbaarheid/beschikbaarheid afvalcontainers",
    "Burgemeester",
    "Buurt ambiance",
    "COVID-19",
    "Criminaliteit",
    "Documentaanvraag",
    "Echtscheiding",
    "Energiekosten",
    "Evenementen, feestdagen en herdenkingen",
    "Financiële regelingen",
    "Gebedsoproepen",
    "Geboorte/erkenning",
    "Geluidsoverlast",
    "Gemeentelijke heffingen",
    "Geuroverlast",
    "Hangjongeren",
    "Huisdierenoverlast",
    "Huishoudelijke hulp",
    "Hulp aan dak- en thuislozen",
    "Hulpmiddelen en woonvoorzieningen",
    "Huur en verhuur",
    "Huwelijk/geregistreerd partnerschap",
    "Inbraak",
    "Infrastructuur",
    "Koop en verkoop",
    "Kwijtschelding",
    "Locatie: A20 richting Utrecht",
    "Locatie: Achterzijde Museum Boijmans",
    "Locatie: Afvalcontainer Schiedamseweg",
    "Locatie: Appartement Blaak",
    "Locatie: Appartement Oude Binnenweg",
    "Locatie: Beursplein uitgang parkeergarage",
    "Locatie: Bouwmarkt Alexandrium",
    "Locatie: Bouwmarkt Zuidplein",
    "Locatie: Eendrachtsplein",
    "Locatie: Erasmusbrug",
    "Locatie: Gemeente Rotterdam - Stadhuis",
    "Locatie: Grensovergang Hazeldonk",
    "Locatie: Maastunnel ingang noord",
    "Locatie: Marathon route - Coolsingel",
    "Locatie: Museum - Achteringang",
    "Locatie: Museumpark - Meerdere locaties",
    "Locatie: Oldegaarde 45-89",
    "Locatie: Oldegaarde complex",
    "Locatie: Parkeergarage Beursplein",
    "Locatie: Politiebureau Boezemsingel",
    "Locatie: Station Rotterdam Centraal",
    "Locatie: Tankstation Kleinpolderplein",
    "Locatie: Verhuurbedrijf Rotterdam Zuid",
    "Locatie: Voor Oldegaarde complex",
    "Locatie: Waalhaven Zuid - Industrieterrein",
    "Locatie: Westersingel - Tegenover museum",
    "Locatie: Westersingel/Museumpark",
    "Luchtkwaliteit",
    "Migratie",
    "Milieupark",
    "No subtopic found",
    "Omgevingsvergunning",
    "Onderhoud omgeving",
    "Ophalen huiscontainers/grofvuil",
    "Overlast personen",
    "Overlijden",
    "Parkeerapp",
    "Parkeerboetes",
    "Parkeergarage",
    "Parkeerkosten",
    "Parkeeroverlast",
    "Parkeerterminal",
    "Parkeervergunning/abonnement",
    "Parkeren bezoekers",
    "Rotterdampas",
    "Schade en claims",
    "Urgent - directe actie nodig",
    "Vaccinatie",
    "Verdacht voertuig",
    "Verdachte situatie",
    "Verhuizen",
    "Verkeersmaatregelen",
    "Verkeersovertreding",
    "Verkeersveiligheid",
    "Verlichting",
    "Verloren voorwerpen",
    "Vervoer",
    "Vervoersorganisaties",
    "Vuil/ongedierte overlast",
    "Wijkteam",
    "Wijzigen officiële gegevens"
  ],
  "beleving": [
    "Afspraakmogelijkheden",
    "Algemene ervaring",
    "Behulpzaamheid",
    "Bereikbaarheid van medewerker",
    "Bezwaar & bewijs",
    "Communicatie",
    "Duidelijkheid",
    "Efficiëntie van het proces",
    "Functionaliteiten web & app",
    "Gebruiksgemak web & app",
    "Gemak van het proces",
    "Informatievoorziening web & app",
    "Integriteit & Afspraken nakomen",
    "Juiste persoon te spreken krijgen",
    "Juistheid van afhandeling",
    "Kwaliteit v/d informatie",
    "Op de hoogte houden",
    "Oprechte interesse",
    "Persoonlijk",
    "Snelheid van afhandeling",
    "Statusinformatie",
    "Vragen student",
    "Vriendelijkheid",
    "Wachttijd",
    "ongerust",
    "verdacht"
  ]
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7a0c1d139dedea006dc8c862788a2ed33601dc7c2edcf1729ea2372bc1ea33f
size 1227771776
requirements.txt
ADDED
@@ -0,0 +1,93 @@
absl-py==2.3.1
accelerate==1.10.1
aiohappyeyeballs==2.6.1
aiohttp==3.13.1
aiosignal==1.4.0
annotated-types==0.7.0
anyio==4.11.0
attrs==25.4.0
certifi==2025.10.5
charset-normalizer==3.4.4
click==8.3.0
datasets==4.2.0
dill==0.4.0
filelock==3.19.1
frozenlist==1.8.0
fsspec==2025.9.0
gitdb==4.0.12
GitPython==3.1.45
grpcio==1.75.1
h11==0.16.0
hf-xet==1.1.10
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.35.3
idna==3.11
Jinja2==3.1.6
Markdown==3.9
MarkupSafe==2.1.5
mpmath==1.3.0
multidict==6.7.0
multiprocess==0.70.16
networkx==3.5
numpy==2.3.4
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-ml-py==13.580.82
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
nvitop==1.5.3
ollama==0.6.0
packaging==25.0
pandas==2.3.3
peft==0.17.1
pillow==12.0.0
platformdirs==4.5.0
propcache==0.4.1
protobuf==6.33.0
psutil==7.1.1
pyarrow==21.0.0
pydantic==2.12.3
pydantic_core==2.41.4
pyparsing==3.2.5
python-dateutil==2.9.0.post0
pytz==2025.2
PyYAML==6.0.3
rdflib==7.2.1
regex==2025.9.18
requests==2.32.5
safetensors==0.6.2
sentry-sdk==2.42.1
setuptools==70.2.0
six==1.17.0
smmap==5.0.2
sniffio==1.3.1
sympy==1.14.0
tensorboard==2.20.0
tensorboard-data-server==0.7.2
termcolor==3.1.0
tokenizers==0.22.1
torch==2.9.0+cu128
tqdm==4.67.1
transformers==4.57.1
triton==3.5.0
trl==0.24.0
typing-inspection==0.4.2
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
wandb==0.22.2
Werkzeug==3.1.3
xxhash==3.6.0
yarl==1.22.0
special_tokens_map.json
ADDED
@@ -0,0 +1,55 @@
{
  "additional_special_tokens": [
    "<start_of_turn>",
    "<end_of_turn>"
  ],
  "bos_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b5e0d8cfefee1eb9d19b4a6f301d5ae937cb866ff3d9a3af101595aab000c627
size 34363455
tokenizer_config.json
ADDED
@@ -0,0 +1,2018 @@
{
  "add_bos_token": true,
  "added_tokens_decoder": {
    "0": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "1": {"content": "<eos>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "2": {"content": "<bos>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "3": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "4": {"content": "<mask>", "lstrip": true, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "5": {"content": "<2mass>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "6": {"content": "[@BOS@]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "7": {"content": "<unused0>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "8": {"content": "<unused1>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "9": {"content": "<unused2>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "10": {"content": "<unused3>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "11": {"content": "<unused4>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "12": {"content": "<unused5>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "13": {"content": "<unused6>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "14": {"content": "<unused7>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "15": {"content": "<unused8>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "16": {"content": "<unused9>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "17": {"content": "<unused10>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "18": {"content": "<unused11>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "19": {"content": "<unused12>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "20": {"content": "<unused13>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "21": {"content": "<unused14>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "22": {"content": "<unused15>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "23": {"content": "<unused16>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "24": {"content": "<unused17>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "25": {"content": "<unused18>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "26": {"content": "<unused19>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "27": {"content": "<unused20>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "28": {"content": "<unused21>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "29": {"content": "<unused22>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "30": {"content": "<unused23>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "31": {"content": "<unused24>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "32": {"content": "<unused25>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "33": {"content": "<unused26>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "34": {"content": "<unused27>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "35": {"content": "<unused28>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "36": {"content": "<unused29>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "37": {"content": "<unused30>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "38": {"content": "<unused31>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "39": {"content": "<unused32>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "40": {"content": "<unused33>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "41": {"content": "<unused34>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "42": {"content": "<unused35>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "43": {"content": "<unused36>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "44": {"content": "<unused37>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "45": {"content": "<unused38>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "46": {"content": "<unused39>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "47": {"content": "<unused40>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "48": {"content": "<unused41>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "49": {"content": "<unused42>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "50": {"content": "<unused43>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "51": {"content": "<unused44>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "52": {"content": "<unused45>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "53": {"content": "<unused46>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "54": {"content": "<unused47>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "55": {"content": "<unused48>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "56": {"content": "<unused49>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "57": {"content": "<unused50>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "58": {"content": "<unused51>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "59": {"content": "<unused52>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "60": {"content": "<unused53>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "61": {"content": "<unused54>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "62": {"content": "<unused55>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "63": {"content": "<unused56>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "64": {"content": "<unused57>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "65": {"content": "<unused58>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "66": {"content": "<unused59>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "67": {"content": "<unused60>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "68": {"content": "<unused61>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "69": {"content": "<unused62>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "70": {"content": "<unused63>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "71": {"content": "<unused64>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "72": {"content": "<unused65>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "73": {"content": "<unused66>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "74": {"content": "<unused67>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "75": {"content": "<unused68>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "76": {"content": "<unused69>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "77": {"content": "<unused70>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "78": {"content": "<unused71>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "79": {"content": "<unused72>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "80": {"content": "<unused73>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "81": {"content": "<unused74>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "82": {"content": "<unused75>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "83": {"content": "<unused76>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "84": {"content": "<unused77>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "85": {"content": "<unused78>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "86": {"content": "<unused79>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "87": {"content": "<unused80>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "88": {"content": "<unused81>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "89": {"content": "<unused82>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "90": {"content": "<unused83>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "91": {"content": "<unused84>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "92": {"content": "<unused85>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "93": {"content": "<unused86>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "94": {"content": "<unused87>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "95": {"content": "<unused88>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "96": {"content": "<unused89>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "97": {"content": "<unused90>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "98": {"content": "<unused91>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "99": {"content": "<unused92>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "100": {"content": "<unused93>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "101": {"content": "<unused94>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "102": {"content": "<unused95>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "103": {"content": "<unused96>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
    "104": {"content": "<unused97>", "lstrip": false, "normalized": false, "rstrip": false,
|
| 841 |
+
"single_word": false,
|
| 842 |
+
"special": false
|
| 843 |
+
},
|
| 844 |
+
"105": {
|
| 845 |
+
"content": "<unused98>",
|
| 846 |
+
"lstrip": false,
|
| 847 |
+
"normalized": false,
|
| 848 |
+
"rstrip": false,
|
| 849 |
+
"single_word": false,
|
| 850 |
+
"special": false
|
| 851 |
+
},
|
| 852 |
+
"106": {
|
| 853 |
+
"content": "<start_of_turn>",
|
| 854 |
+
"lstrip": false,
|
| 855 |
+
"normalized": false,
|
| 856 |
+
"rstrip": false,
|
| 857 |
+
"single_word": false,
|
| 858 |
+
"special": true
|
| 859 |
+
},
|
| 860 |
+
"107": {
|
| 861 |
+
"content": "<end_of_turn>",
|
| 862 |
+
"lstrip": false,
|
| 863 |
+
"normalized": false,
|
| 864 |
+
"rstrip": false,
|
| 865 |
+
"single_word": false,
|
| 866 |
+
"special": true
|
| 867 |
+
},
|
| 868 |
+
"108": {
|
| 869 |
+
"content": "\n",
|
| 870 |
+
"lstrip": false,
|
| 871 |
+
"normalized": false,
|
| 872 |
+
"rstrip": false,
|
| 873 |
+
"single_word": false,
|
| 874 |
+
"special": false
|
| 875 |
+
},
|
| 876 |
+
"109": {
|
| 877 |
+
"content": "\n\n",
|
| 878 |
+
"lstrip": false,
|
| 879 |
+
"normalized": false,
|
| 880 |
+
"rstrip": false,
|
| 881 |
+
"single_word": false,
|
| 882 |
+
"special": false
|
| 883 |
+
},
|
| 884 |
+
"110": {
|
| 885 |
+
"content": "\n\n\n",
|
| 886 |
+
"lstrip": false,
|
| 887 |
+
"normalized": false,
|
| 888 |
+
"rstrip": false,
|
| 889 |
+
"single_word": false,
|
| 890 |
+
"special": false
|
| 891 |
+
},
|
| 892 |
+
"111": {
|
| 893 |
+
"content": "\n\n\n\n",
|
| 894 |
+
"lstrip": false,
|
| 895 |
+
"normalized": false,
|
| 896 |
+
"rstrip": false,
|
| 897 |
+
"single_word": false,
|
| 898 |
+
"special": false
|
| 899 |
+
},
|
| 900 |
+
"112": {
|
| 901 |
+
"content": "\n\n\n\n\n",
|
| 902 |
+
"lstrip": false,
|
| 903 |
+
"normalized": false,
|
| 904 |
+
"rstrip": false,
|
| 905 |
+
"single_word": false,
|
| 906 |
+
"special": false
|
| 907 |
+
},
|
| 908 |
+
"113": {
|
| 909 |
+
"content": "\n\n\n\n\n\n",
|
| 910 |
+
"lstrip": false,
|
| 911 |
+
"normalized": false,
|
| 912 |
+
"rstrip": false,
|
| 913 |
+
"single_word": false,
|
| 914 |
+
"special": false
|
| 915 |
+
},
|
| 916 |
+
"114": {
|
| 917 |
+
"content": "\n\n\n\n\n\n\n",
|
| 918 |
+
"lstrip": false,
|
| 919 |
+
"normalized": false,
|
| 920 |
+
"rstrip": false,
|
| 921 |
+
"single_word": false,
|
| 922 |
+
"special": false
|
| 923 |
+
},
|
| 924 |
+
"115": {
|
| 925 |
+
"content": "\n\n\n\n\n\n\n\n",
|
| 926 |
+
"lstrip": false,
|
| 927 |
+
"normalized": false,
|
| 928 |
+
"rstrip": false,
|
| 929 |
+
"single_word": false,
|
| 930 |
+
"special": false
|
| 931 |
+
},
|
| 932 |
+
"116": {
|
| 933 |
+
"content": "\n\n\n\n\n\n\n\n\n",
|
| 934 |
+
"lstrip": false,
|
| 935 |
+
"normalized": false,
|
| 936 |
+
"rstrip": false,
|
| 937 |
+
"single_word": false,
|
| 938 |
+
"special": false
|
| 939 |
+
},
|
| 940 |
+
"117": {
|
| 941 |
+
"content": "\n\n\n\n\n\n\n\n\n\n",
|
| 942 |
+
"lstrip": false,
|
| 943 |
+
"normalized": false,
|
| 944 |
+
"rstrip": false,
|
| 945 |
+
"single_word": false,
|
| 946 |
+
"special": false
|
| 947 |
+
},
|
| 948 |
+
"118": {
|
| 949 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n",
|
| 950 |
+
"lstrip": false,
|
| 951 |
+
"normalized": false,
|
| 952 |
+
"rstrip": false,
|
| 953 |
+
"single_word": false,
|
| 954 |
+
"special": false
|
| 955 |
+
},
|
| 956 |
+
"119": {
|
| 957 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 958 |
+
"lstrip": false,
|
| 959 |
+
"normalized": false,
|
| 960 |
+
"rstrip": false,
|
| 961 |
+
"single_word": false,
|
| 962 |
+
"special": false
|
| 963 |
+
},
|
| 964 |
+
"120": {
|
| 965 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 966 |
+
"lstrip": false,
|
| 967 |
+
"normalized": false,
|
| 968 |
+
"rstrip": false,
|
| 969 |
+
"single_word": false,
|
| 970 |
+
"special": false
|
| 971 |
+
},
|
| 972 |
+
"121": {
|
| 973 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 974 |
+
"lstrip": false,
|
| 975 |
+
"normalized": false,
|
| 976 |
+
"rstrip": false,
|
| 977 |
+
"single_word": false,
|
| 978 |
+
"special": false
|
| 979 |
+
},
|
| 980 |
+
"122": {
|
| 981 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 982 |
+
"lstrip": false,
|
| 983 |
+
"normalized": false,
|
| 984 |
+
"rstrip": false,
|
| 985 |
+
"single_word": false,
|
| 986 |
+
"special": false
|
| 987 |
+
},
|
| 988 |
+
"123": {
|
| 989 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 990 |
+
"lstrip": false,
|
| 991 |
+
"normalized": false,
|
| 992 |
+
"rstrip": false,
|
| 993 |
+
"single_word": false,
|
| 994 |
+
"special": false
|
| 995 |
+
},
|
| 996 |
+
"124": {
|
| 997 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 998 |
+
"lstrip": false,
|
| 999 |
+
"normalized": false,
|
| 1000 |
+
"rstrip": false,
|
| 1001 |
+
"single_word": false,
|
| 1002 |
+
"special": false
|
| 1003 |
+
},
|
| 1004 |
+
"125": {
|
| 1005 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1006 |
+
"lstrip": false,
|
| 1007 |
+
"normalized": false,
|
| 1008 |
+
"rstrip": false,
|
| 1009 |
+
"single_word": false,
|
| 1010 |
+
"special": false
|
| 1011 |
+
},
|
| 1012 |
+
"126": {
|
| 1013 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1014 |
+
"lstrip": false,
|
| 1015 |
+
"normalized": false,
|
| 1016 |
+
"rstrip": false,
|
| 1017 |
+
"single_word": false,
|
| 1018 |
+
"special": false
|
| 1019 |
+
},
|
| 1020 |
+
"127": {
|
| 1021 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1022 |
+
"lstrip": false,
|
| 1023 |
+
"normalized": false,
|
| 1024 |
+
"rstrip": false,
|
| 1025 |
+
"single_word": false,
|
| 1026 |
+
"special": false
|
| 1027 |
+
},
|
| 1028 |
+
"128": {
|
| 1029 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1030 |
+
"lstrip": false,
|
| 1031 |
+
"normalized": false,
|
| 1032 |
+
"rstrip": false,
|
| 1033 |
+
"single_word": false,
|
| 1034 |
+
"special": false
|
| 1035 |
+
},
|
| 1036 |
+
"129": {
|
| 1037 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1038 |
+
"lstrip": false,
|
| 1039 |
+
"normalized": false,
|
| 1040 |
+
"rstrip": false,
|
| 1041 |
+
"single_word": false,
|
| 1042 |
+
"special": false
|
| 1043 |
+
},
|
| 1044 |
+
"130": {
|
| 1045 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1046 |
+
"lstrip": false,
|
| 1047 |
+
"normalized": false,
|
| 1048 |
+
"rstrip": false,
|
| 1049 |
+
"single_word": false,
|
| 1050 |
+
"special": false
|
| 1051 |
+
},
|
| 1052 |
+
"131": {
|
| 1053 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1054 |
+
"lstrip": false,
|
| 1055 |
+
"normalized": false,
|
| 1056 |
+
"rstrip": false,
|
| 1057 |
+
"single_word": false,
|
| 1058 |
+
"special": false
|
| 1059 |
+
},
|
| 1060 |
+
"132": {
|
| 1061 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1062 |
+
"lstrip": false,
|
| 1063 |
+
"normalized": false,
|
| 1064 |
+
"rstrip": false,
|
| 1065 |
+
"single_word": false,
|
| 1066 |
+
"special": false
|
| 1067 |
+
},
|
| 1068 |
+
"133": {
|
| 1069 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1070 |
+
"lstrip": false,
|
| 1071 |
+
"normalized": false,
|
| 1072 |
+
"rstrip": false,
|
| 1073 |
+
"single_word": false,
|
| 1074 |
+
"special": false
|
| 1075 |
+
},
|
| 1076 |
+
"134": {
|
| 1077 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1078 |
+
"lstrip": false,
|
| 1079 |
+
"normalized": false,
|
| 1080 |
+
"rstrip": false,
|
| 1081 |
+
"single_word": false,
|
| 1082 |
+
"special": false
|
| 1083 |
+
},
|
| 1084 |
+
"135": {
|
| 1085 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1086 |
+
"lstrip": false,
|
| 1087 |
+
"normalized": false,
|
| 1088 |
+
"rstrip": false,
|
| 1089 |
+
"single_word": false,
|
| 1090 |
+
"special": false
|
| 1091 |
+
},
|
| 1092 |
+
"136": {
|
| 1093 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1094 |
+
"lstrip": false,
|
| 1095 |
+
"normalized": false,
|
| 1096 |
+
"rstrip": false,
|
| 1097 |
+
"single_word": false,
|
| 1098 |
+
"special": false
|
| 1099 |
+
},
|
| 1100 |
+
"137": {
|
| 1101 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1102 |
+
"lstrip": false,
|
| 1103 |
+
"normalized": false,
|
| 1104 |
+
"rstrip": false,
|
| 1105 |
+
"single_word": false,
|
| 1106 |
+
"special": false
|
| 1107 |
+
},
|
| 1108 |
+
"138": {
|
| 1109 |
+
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
|
| 1110 |
+
"lstrip": false,
|
| 1111 |
+
"normalized": false,
|
| 1112 |
+
"rstrip": false,
|
| 1113 |
+
"single_word": false,
|
| 1114 |
+
"special": false
|
| 1115 |
+
},
|
| 1116 |
+
"139": {
|
| 1117 |
+
"content": "▁▁",
|
| 1118 |
+
"lstrip": false,
|
| 1119 |
+
"normalized": false,
|
| 1120 |
+
"rstrip": false,
|
| 1121 |
+
"single_word": false,
|
| 1122 |
+
"special": false
|
| 1123 |
+
},
|
| 1124 |
+
"140": {
|
| 1125 |
+
"content": "▁▁▁",
|
| 1126 |
+
"lstrip": false,
|
| 1127 |
+
"normalized": false,
|
| 1128 |
+
"rstrip": false,
|
| 1129 |
+
"single_word": false,
|
| 1130 |
+
"special": false
|
| 1131 |
+
},
|
| 1132 |
+
"141": {
|
| 1133 |
+
"content": "▁▁▁▁",
|
| 1134 |
+
"lstrip": false,
|
| 1135 |
+
"normalized": false,
|
| 1136 |
+
"rstrip": false,
|
| 1137 |
+
"single_word": false,
|
| 1138 |
+
"special": false
|
| 1139 |
+
},
|
| 1140 |
+
"142": {
|
| 1141 |
+
"content": "▁▁▁▁▁",
|
| 1142 |
+
"lstrip": false,
|
| 1143 |
+
"normalized": false,
|
| 1144 |
+
"rstrip": false,
|
| 1145 |
+
"single_word": false,
|
| 1146 |
+
"special": false
|
| 1147 |
+
},
|
| 1148 |
+
"143": {
|
| 1149 |
+
"content": "▁▁▁▁▁▁",
|
| 1150 |
+
"lstrip": false,
|
| 1151 |
+
"normalized": false,
|
| 1152 |
+
"rstrip": false,
|
| 1153 |
+
"single_word": false,
|
| 1154 |
+
"special": false
|
| 1155 |
+
},
|
| 1156 |
+
"144": {
|
| 1157 |
+
"content": "▁▁▁▁▁▁▁",
|
| 1158 |
+
"lstrip": false,
|
| 1159 |
+
"normalized": false,
|
| 1160 |
+
"rstrip": false,
|
| 1161 |
+
"single_word": false,
|
| 1162 |
+
"special": false
|
| 1163 |
+
},
|
| 1164 |
+
"145": {
|
| 1165 |
+
"content": "▁▁▁▁▁▁▁▁",
|
| 1166 |
+
"lstrip": false,
|
| 1167 |
+
"normalized": false,
|
| 1168 |
+
"rstrip": false,
|
| 1169 |
+
"single_word": false,
|
| 1170 |
+
"special": false
|
| 1171 |
+
},
|
| 1172 |
+
"146": {
|
| 1173 |
+
"content": "▁▁▁▁▁▁▁▁▁",
|
| 1174 |
+
"lstrip": false,
|
| 1175 |
+
"normalized": false,
|
| 1176 |
+
"rstrip": false,
|
| 1177 |
+
"single_word": false,
|
| 1178 |
+
"special": false
|
| 1179 |
+
},
|
| 1180 |
+
"147": {
|
| 1181 |
+
"content": "▁▁▁▁▁▁▁▁▁▁",
|
| 1182 |
+
"lstrip": false,
|
| 1183 |
+
"normalized": false,
|
| 1184 |
+
"rstrip": false,
|
| 1185 |
+
"single_word": false,
|
| 1186 |
+
"special": false
|
| 1187 |
+
},
|
| 1188 |
+
"148": {
|
| 1189 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁",
|
| 1190 |
+
"lstrip": false,
|
| 1191 |
+
"normalized": false,
|
| 1192 |
+
"rstrip": false,
|
| 1193 |
+
"single_word": false,
|
| 1194 |
+
"special": false
|
| 1195 |
+
},
|
| 1196 |
+
"149": {
|
| 1197 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1198 |
+
"lstrip": false,
|
| 1199 |
+
"normalized": false,
|
| 1200 |
+
"rstrip": false,
|
| 1201 |
+
"single_word": false,
|
| 1202 |
+
"special": false
|
| 1203 |
+
},
|
| 1204 |
+
"150": {
|
| 1205 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1206 |
+
"lstrip": false,
|
| 1207 |
+
"normalized": false,
|
| 1208 |
+
"rstrip": false,
|
| 1209 |
+
"single_word": false,
|
| 1210 |
+
"special": false
|
| 1211 |
+
},
|
| 1212 |
+
"151": {
|
| 1213 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1214 |
+
"lstrip": false,
|
| 1215 |
+
"normalized": false,
|
| 1216 |
+
"rstrip": false,
|
| 1217 |
+
"single_word": false,
|
| 1218 |
+
"special": false
|
| 1219 |
+
},
|
| 1220 |
+
"152": {
|
| 1221 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1222 |
+
"lstrip": false,
|
| 1223 |
+
"normalized": false,
|
| 1224 |
+
"rstrip": false,
|
| 1225 |
+
"single_word": false,
|
| 1226 |
+
"special": false
|
| 1227 |
+
},
|
| 1228 |
+
"153": {
|
| 1229 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1230 |
+
"lstrip": false,
|
| 1231 |
+
"normalized": false,
|
| 1232 |
+
"rstrip": false,
|
| 1233 |
+
"single_word": false,
|
| 1234 |
+
"special": false
|
| 1235 |
+
},
|
| 1236 |
+
"154": {
|
| 1237 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1238 |
+
"lstrip": false,
|
| 1239 |
+
"normalized": false,
|
| 1240 |
+
"rstrip": false,
|
| 1241 |
+
"single_word": false,
|
| 1242 |
+
"special": false
|
| 1243 |
+
},
|
| 1244 |
+
"155": {
|
| 1245 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1246 |
+
"lstrip": false,
|
| 1247 |
+
"normalized": false,
|
| 1248 |
+
"rstrip": false,
|
| 1249 |
+
"single_word": false,
|
| 1250 |
+
"special": false
|
| 1251 |
+
},
|
| 1252 |
+
"156": {
|
| 1253 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1254 |
+
"lstrip": false,
|
| 1255 |
+
"normalized": false,
|
| 1256 |
+
"rstrip": false,
|
| 1257 |
+
"single_word": false,
|
| 1258 |
+
"special": false
|
| 1259 |
+
},
|
| 1260 |
+
"157": {
|
| 1261 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1262 |
+
"lstrip": false,
|
| 1263 |
+
"normalized": false,
|
| 1264 |
+
"rstrip": false,
|
| 1265 |
+
"single_word": false,
|
| 1266 |
+
"special": false
|
| 1267 |
+
},
|
| 1268 |
+
"158": {
|
| 1269 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1270 |
+
"lstrip": false,
|
| 1271 |
+
"normalized": false,
|
| 1272 |
+
"rstrip": false,
|
| 1273 |
+
"single_word": false,
|
| 1274 |
+
"special": false
|
| 1275 |
+
},
|
| 1276 |
+
"159": {
|
| 1277 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1278 |
+
"lstrip": false,
|
| 1279 |
+
"normalized": false,
|
| 1280 |
+
"rstrip": false,
|
| 1281 |
+
"single_word": false,
|
| 1282 |
+
"special": false
|
| 1283 |
+
},
|
| 1284 |
+
"160": {
|
| 1285 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1286 |
+
"lstrip": false,
|
| 1287 |
+
"normalized": false,
|
| 1288 |
+
"rstrip": false,
|
| 1289 |
+
"single_word": false,
|
| 1290 |
+
"special": false
|
| 1291 |
+
},
|
| 1292 |
+
"161": {
|
| 1293 |
+
"content": "▁▁▁���▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1294 |
+
"lstrip": false,
|
| 1295 |
+
"normalized": false,
|
| 1296 |
+
"rstrip": false,
|
| 1297 |
+
"single_word": false,
|
| 1298 |
+
"special": false
|
| 1299 |
+
},
|
| 1300 |
+
"162": {
|
| 1301 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1302 |
+
"lstrip": false,
|
| 1303 |
+
"normalized": false,
|
| 1304 |
+
"rstrip": false,
|
| 1305 |
+
"single_word": false,
|
| 1306 |
+
"special": false
|
| 1307 |
+
},
|
| 1308 |
+
"163": {
|
| 1309 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1310 |
+
"lstrip": false,
|
| 1311 |
+
"normalized": false,
|
| 1312 |
+
"rstrip": false,
|
| 1313 |
+
"single_word": false,
|
| 1314 |
+
"special": false
|
| 1315 |
+
},
|
| 1316 |
+
"164": {
|
| 1317 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1318 |
+
"lstrip": false,
|
| 1319 |
+
"normalized": false,
|
| 1320 |
+
"rstrip": false,
|
| 1321 |
+
"single_word": false,
|
| 1322 |
+
"special": false
|
| 1323 |
+
},
|
| 1324 |
+
"165": {
|
| 1325 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1326 |
+
"lstrip": false,
|
| 1327 |
+
"normalized": false,
|
| 1328 |
+
"rstrip": false,
|
| 1329 |
+
"single_word": false,
|
| 1330 |
+
"special": false
|
| 1331 |
+
},
|
| 1332 |
+
"166": {
|
| 1333 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1334 |
+
"lstrip": false,
|
| 1335 |
+
"normalized": false,
|
| 1336 |
+
"rstrip": false,
|
| 1337 |
+
"single_word": false,
|
| 1338 |
+
"special": false
|
| 1339 |
+
},
|
| 1340 |
+
"167": {
|
| 1341 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1342 |
+
"lstrip": false,
|
| 1343 |
+
"normalized": false,
|
| 1344 |
+
"rstrip": false,
|
| 1345 |
+
"single_word": false,
|
| 1346 |
+
"special": false
|
| 1347 |
+
},
|
| 1348 |
+
"168": {
|
| 1349 |
+
"content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
|
| 1350 |
+
"lstrip": false,
|
| 1351 |
+
"normalized": false,
|
| 1352 |
+
"rstrip": false,
|
| 1353 |
+
"single_word": false,
|
| 1354 |
+
"special": false
|
| 1355 |
+
},
|
| 1356 |
+
"169": {
|
| 1357 |
+
"content": "<table>",
|
| 1358 |
+
"lstrip": false,
|
| 1359 |
+
"normalized": false,
|
| 1360 |
+
"rstrip": false,
|
| 1361 |
+
"single_word": false,
|
| 1362 |
+
"special": false
|
| 1363 |
+
},
|
| 1364 |
+
"170": {
|
| 1365 |
+
"content": "<caption>",
|
| 1366 |
+
"lstrip": false,
|
| 1367 |
+
"normalized": false,
|
| 1368 |
+
"rstrip": false,
|
| 1369 |
+
"single_word": false,
|
| 1370 |
+
"special": false
|
| 1371 |
+
},
|
| 1372 |
+
"171": {
|
| 1373 |
+
"content": "<thead>",
|
| 1374 |
+
"lstrip": false,
|
| 1375 |
+
"normalized": false,
|
| 1376 |
+
"rstrip": false,
|
| 1377 |
+
"single_word": false,
|
| 1378 |
+
"special": false
|
| 1379 |
+
},
|
| 1380 |
+
"172": {
|
| 1381 |
+
"content": "<tbody>",
|
| 1382 |
+
"lstrip": false,
|
| 1383 |
+
"normalized": false,
|
| 1384 |
+
"rstrip": false,
|
| 1385 |
+
"single_word": false,
|
| 1386 |
+
"special": false
|
| 1387 |
+
},
|
| 1388 |
+
"173": {
|
| 1389 |
+
"content": "<tfoot>",
|
| 1390 |
+
"lstrip": false,
|
| 1391 |
+
"normalized": false,
|
| 1392 |
+
"rstrip": false,
|
| 1393 |
+
"single_word": false,
|
| 1394 |
+
"special": false
|
| 1395 |
+
},
|
| 1396 |
+
"174": {
|
| 1397 |
+
"content": "<tr>",
|
| 1398 |
+
"lstrip": false,
|
| 1399 |
+
"normalized": false,
|
| 1400 |
+
"rstrip": false,
|
| 1401 |
+
"single_word": false,
|
| 1402 |
+
"special": false
|
| 1403 |
+
},
|
| 1404 |
+
"175": {
|
| 1405 |
+
"content": "<th>",
|
| 1406 |
+
"lstrip": false,
|
| 1407 |
+
"normalized": false,
|
| 1408 |
+
"rstrip": false,
|
| 1409 |
+
"single_word": false,
|
| 1410 |
+
"special": false
|
| 1411 |
+
},
|
| 1412 |
+
"176": {
|
| 1413 |
+
"content": "<td>",
|
| 1414 |
+
"lstrip": false,
|
| 1415 |
+
"normalized": false,
|
| 1416 |
+
"rstrip": false,
|
| 1417 |
+
"single_word": false,
|
| 1418 |
+
"special": false
|
| 1419 |
+
},
|
| 1420 |
+
"177": {
|
| 1421 |
+
"content": "</table>",
|
| 1422 |
+
"lstrip": false,
|
| 1423 |
+
"normalized": false,
|
| 1424 |
+
"rstrip": false,
|
| 1425 |
+
"single_word": false,
|
| 1426 |
+
"special": false
|
| 1427 |
+
},
|
| 1428 |
+
"178": {
|
| 1429 |
+
"content": "</caption>",
|
| 1430 |
+
"lstrip": false,
|
| 1431 |
+
"normalized": false,
|
| 1432 |
+
"rstrip": false,
|
| 1433 |
+
"single_word": false,
|
| 1434 |
+
"special": false
|
| 1435 |
+
},
|
| 1436 |
+
"179": {
|
| 1437 |
+
"content": "</thead>",
|
| 1438 |
+
"lstrip": false,
|
| 1439 |
+
"normalized": false,
|
| 1440 |
+
"rstrip": false,
|
| 1441 |
+
"single_word": false,
|
| 1442 |
+
"special": false
|
| 1443 |
+
},
|
| 1444 |
+
"180": {
|
| 1445 |
+
"content": "</tbody>",
|
| 1446 |
+
"lstrip": false,
|
| 1447 |
+
"normalized": false,
|
| 1448 |
+
"rstrip": false,
|
| 1449 |
+
"single_word": false,
|
| 1450 |
+
"special": false
|
| 1451 |
+
},
|
| 1452 |
+
"181": {
|
| 1453 |
+
"content": "</tfoot>",
|
| 1454 |
+
"lstrip": false,
|
| 1455 |
+
"normalized": false,
|
| 1456 |
+
"rstrip": false,
|
| 1457 |
+
"single_word": false,
|
| 1458 |
+
"special": false
|
| 1459 |
+
},
|
| 1460 |
+
"182": {
|
| 1461 |
+
"content": "</tr>",
|
| 1462 |
+
"lstrip": false,
|
| 1463 |
+
"normalized": false,
|
| 1464 |
+
"rstrip": false,
|
| 1465 |
+
"single_word": false,
|
| 1466 |
+
"special": false
|
| 1467 |
+
},
|
| 1468 |
+
"183": {
|
| 1469 |
+
"content": "</th>",
|
| 1470 |
+
"lstrip": false,
|
| 1471 |
+
"normalized": false,
|
| 1472 |
+
"rstrip": false,
|
| 1473 |
+
"single_word": false,
|
| 1474 |
+
"special": false
|
| 1475 |
+
},
|
| 1476 |
+
"184": {
|
| 1477 |
+
"content": "</td>",
|
| 1478 |
+
"lstrip": false,
|
| 1479 |
+
"normalized": false,
|
| 1480 |
+
"rstrip": false,
|
| 1481 |
+
"single_word": false,
|
| 1482 |
+
"special": false
|
| 1483 |
+
},
|
| 1484 |
+
"185": {
|
| 1485 |
+
"content": "<h1>",
|
| 1486 |
+
"lstrip": false,
|
| 1487 |
+
"normalized": false,
|
| 1488 |
+
"rstrip": false,
|
| 1489 |
+
"single_word": false,
|
| 1490 |
+
"special": false
|
| 1491 |
+
},
|
| 1492 |
+
"186": {
|
| 1493 |
+
"content": "<h2>",
|
| 1494 |
+
"lstrip": false,
|
| 1495 |
+
"normalized": false,
|
| 1496 |
+
"rstrip": false,
|
| 1497 |
+
"single_word": false,
|
| 1498 |
+
"special": false
|
| 1499 |
+
},
|
| 1500 |
+
"187": {
|
| 1501 |
+
"content": "<h3>",
|
| 1502 |
+
"lstrip": false,
|
| 1503 |
+
"normalized": false,
|
| 1504 |
+
"rstrip": false,
|
| 1505 |
+
"single_word": false,
|
| 1506 |
+
"special": false
|
| 1507 |
+
},
|
| 1508 |
+
"188": {
|
| 1509 |
+
"content": "<h4>",
|
| 1510 |
+
"lstrip": false,
|
| 1511 |
+
"normalized": false,
|
| 1512 |
+
"rstrip": false,
|
| 1513 |
+
"single_word": false,
|
| 1514 |
+
"special": false
|
| 1515 |
+
},
|
| 1516 |
+
"189": {
|
| 1517 |
+
"content": "<h5>",
|
| 1518 |
+
"lstrip": false,
|
| 1519 |
+
"normalized": false,
|
| 1520 |
+
"rstrip": false,
|
| 1521 |
+
"single_word": false,
|
| 1522 |
+
"special": false
|
| 1523 |
+
},
|
| 1524 |
+
"190": {
|
| 1525 |
+
"content": "<h6>",
|
| 1526 |
+
"lstrip": false,
|
| 1527 |
+
"normalized": false,
|
| 1528 |
+
"rstrip": false,
|
| 1529 |
+
"single_word": false,
|
| 1530 |
+
"special": false
|
| 1531 |
+
},
|
| 1532 |
+
"191": {
|
| 1533 |
+
"content": "<blockquote>",
|
| 1534 |
+
"lstrip": false,
|
| 1535 |
+
"normalized": false,
|
| 1536 |
+
"rstrip": false,
|
| 1537 |
+
"single_word": false,
|
| 1538 |
+
"special": false
|
| 1539 |
+
},
|
| 1540 |
+
"192": {
|
| 1541 |
+
"content": "</h1>",
|
| 1542 |
+
"lstrip": false,
|
| 1543 |
+
"normalized": false,
|
| 1544 |
+
"rstrip": false,
|
| 1545 |
+
"single_word": false,
|
| 1546 |
+
"special": false
|
| 1547 |
+
},
|
| 1548 |
+
"193": {
|
| 1549 |
+
"content": "</h2>",
|
| 1550 |
+
"lstrip": false,
|
| 1551 |
+
"normalized": false,
|
| 1552 |
+
"rstrip": false,
|
| 1553 |
+
"single_word": false,
|
| 1554 |
+
"special": false
|
| 1555 |
+
},
|
| 1556 |
+
"194": {
|
| 1557 |
+
"content": "</h3>",
|
| 1558 |
+
"lstrip": false,
|
| 1559 |
+
"normalized": false,
|
| 1560 |
+
"rstrip": false,
|
| 1561 |
+
"single_word": false,
|
| 1562 |
+
"special": false
|
| 1563 |
+
},
|
| 1564 |
+
"195": {
|
| 1565 |
+
"content": "</h4>",
|
| 1566 |
+
"lstrip": false,
|
| 1567 |
+
"normalized": false,
|
| 1568 |
+
"rstrip": false,
|
| 1569 |
+
"single_word": false,
|
| 1570 |
+
"special": false
|
| 1571 |
+
},
|
| 1572 |
+
"196": {
|
| 1573 |
+
"content": "</h5>",
|
| 1574 |
+
"lstrip": false,
|
| 1575 |
+
"normalized": false,
|
| 1576 |
+
"rstrip": false,
|
| 1577 |
+
"single_word": false,
|
| 1578 |
+
"special": false
|
| 1579 |
+
},
|
| 1580 |
+
"197": {
|
| 1581 |
+
"content": "</h6>",
|
| 1582 |
+
"lstrip": false,
|
| 1583 |
+
"normalized": false,
|
| 1584 |
+
"rstrip": false,
|
| 1585 |
+
"single_word": false,
|
| 1586 |
+
"special": false
|
| 1587 |
+
},
|
| 1588 |
+
"198": {
|
| 1589 |
+
"content": "</blockquote>",
|
| 1590 |
+
"lstrip": false,
|
| 1591 |
+
"normalized": false,
|
| 1592 |
+
"rstrip": false,
|
| 1593 |
+
"single_word": false,
|
| 1594 |
+
"special": false
|
| 1595 |
+
},
|
| 1596 |
+
"199": {
|
| 1597 |
+
"content": "<strong>",
|
| 1598 |
+
"lstrip": false,
|
| 1599 |
+
"normalized": false,
|
| 1600 |
+
"rstrip": false,
|
| 1601 |
+
"single_word": false,
|
| 1602 |
+
"special": false
|
| 1603 |
+
},
|
| 1604 |
+
"200": {
|
| 1605 |
+
"content": "<em>",
|
| 1606 |
+
"lstrip": false,
|
| 1607 |
+
"normalized": false,
|
| 1608 |
+
"rstrip": false,
|
| 1609 |
+
"single_word": false,
|
| 1610 |
+
"special": false
|
| 1611 |
+
},
|
| 1612 |
+
"201": {
|
| 1613 |
+
"content": "<b>",
|
| 1614 |
+
"lstrip": false,
|
| 1615 |
+
"normalized": false,
|
| 1616 |
+
"rstrip": false,
|
| 1617 |
+
"single_word": false,
|
| 1618 |
+
"special": false
|
| 1619 |
+
},
|
| 1620 |
+
"202": {
|
| 1621 |
+
"content": "<i>",
|
| 1622 |
+
"lstrip": false,
|
| 1623 |
+
"normalized": false,
|
| 1624 |
+
"rstrip": false,
|
| 1625 |
+
"single_word": false,
|
| 1626 |
+
"special": false
|
| 1627 |
+
},
|
| 1628 |
+
"203": {
|
| 1629 |
+
"content": "<u>",
|
| 1630 |
+
"lstrip": false,
|
| 1631 |
+
"normalized": false,
|
| 1632 |
+
"rstrip": false,
|
| 1633 |
+
"single_word": false,
|
| 1634 |
+
"special": false
|
| 1635 |
+
},
|
| 1636 |
+
"204": {
|
| 1637 |
+
"content": "<s>",
|
| 1638 |
+
"lstrip": false,
|
| 1639 |
+
"normalized": false,
|
| 1640 |
+
"rstrip": false,
|
| 1641 |
+
"single_word": false,
|
| 1642 |
+
"special": false
|
| 1643 |
+
},
|
| 1644 |
+
"205": {
|
| 1645 |
+
"content": "<sub>",
|
| 1646 |
+
"lstrip": false,
|
| 1647 |
+
"normalized": false,
|
| 1648 |
+
"rstrip": false,
|
| 1649 |
+
"single_word": false,
|
| 1650 |
+
"special": false
|
| 1651 |
+
},
|
| 1652 |
+
"206": {
|
| 1653 |
+
"content": "<sup>",
|
| 1654 |
+
"lstrip": false,
|
| 1655 |
+
"normalized": false,
|
| 1656 |
+
"rstrip": false,
|
| 1657 |
+
"single_word": false,
|
| 1658 |
+
"special": false
|
| 1659 |
+
},
|
| 1660 |
+
"207": {
|
| 1661 |
+
"content": "<code>",
|
| 1662 |
+
"lstrip": false,
|
| 1663 |
+
"normalized": false,
|
| 1664 |
+
"rstrip": false,
|
| 1665 |
+
"single_word": false,
|
| 1666 |
+
"special": false
|
| 1667 |
+
},
|
| 1668 |
+
"208": {
|
| 1669 |
+
"content": "</strong>",
|
| 1670 |
+
"lstrip": false,
|
| 1671 |
+
"normalized": false,
|
| 1672 |
+
"rstrip": false,
|
| 1673 |
+
"single_word": false,
|
| 1674 |
+
"special": false
|
| 1675 |
+
},
|
| 1676 |
+
"209": {
|
| 1677 |
+
"content": "</em>",
|
| 1678 |
+
"lstrip": false,
|
| 1679 |
+
"normalized": false,
|
| 1680 |
+
"rstrip": false,
|
| 1681 |
+
"single_word": false,
|
| 1682 |
+
"special": false
|
| 1683 |
+
},
|
| 1684 |
+
"210": {
|
| 1685 |
+
"content": "</b>",
|
| 1686 |
+
"lstrip": false,
|
| 1687 |
+
"normalized": false,
|
| 1688 |
+
"rstrip": false,
|
| 1689 |
+
"single_word": false,
|
| 1690 |
+
"special": false
|
| 1691 |
+
},
|
| 1692 |
+
"211": {
|
| 1693 |
+
"content": "</i>",
|
| 1694 |
+
"lstrip": false,
|
| 1695 |
+
"normalized": false,
|
| 1696 |
+
"rstrip": false,
|
| 1697 |
+
"single_word": false,
|
| 1698 |
+
"special": false
|
| 1699 |
+
},
|
| 1700 |
+
"212": {
|
| 1701 |
+
"content": "</u>",
|
| 1702 |
+
"lstrip": false,
|
| 1703 |
+
"normalized": false,
|
| 1704 |
+
"rstrip": false,
|
| 1705 |
+
"single_word": false,
|
| 1706 |
+
"special": false
|
| 1707 |
+
},
|
| 1708 |
+
"213": {
|
| 1709 |
+
"content": "</s>",
|
| 1710 |
+
"lstrip": false,
|
| 1711 |
+
"normalized": false,
|
| 1712 |
+
"rstrip": false,
|
| 1713 |
+
"single_word": false,
|
| 1714 |
+
"special": false
|
| 1715 |
+
},
|
| 1716 |
+
"214": {
|
| 1717 |
+
"content": "</sub>",
|
| 1718 |
+
"lstrip": false,
|
| 1719 |
+
"normalized": false,
|
| 1720 |
+
"rstrip": false,
|
| 1721 |
+
"single_word": false,
|
| 1722 |
+
"special": false
|
| 1723 |
+
},
|
| 1724 |
+
"215": {
|
| 1725 |
+
"content": "</sup>",
|
| 1726 |
+
"lstrip": false,
|
| 1727 |
+
"normalized": false,
|
| 1728 |
+
"rstrip": false,
|
| 1729 |
+
"single_word": false,
|
| 1730 |
+
"special": false
|
| 1731 |
+
},
|
| 1732 |
+
"216": {
|
| 1733 |
+
"content": "</code>",
|
| 1734 |
+
"lstrip": false,
|
| 1735 |
+
"normalized": false,
|
| 1736 |
+
"rstrip": false,
|
| 1737 |
+
"single_word": false,
|
| 1738 |
+
"special": false
|
| 1739 |
+
},
|
| 1740 |
+
"255968": {
|
| 1741 |
+
"content": "[toxicity=0]",
|
| 1742 |
+
"lstrip": false,
|
| 1743 |
+
"normalized": false,
|
| 1744 |
+
"rstrip": false,
|
| 1745 |
+
"single_word": false,
|
| 1746 |
+
"special": false
|
| 1747 |
+
},
|
| 1748 |
+
"255969": {
|
| 1749 |
+
"content": "\t\t",
|
| 1750 |
+
"lstrip": false,
|
| 1751 |
+
"normalized": false,
|
| 1752 |
+
"rstrip": false,
|
| 1753 |
+
"single_word": false,
|
| 1754 |
+
"special": false
|
| 1755 |
+
},
|
| 1756 |
+
"255970": {
|
| 1757 |
+
"content": "\t\t\t",
|
| 1758 |
+
"lstrip": false,
|
| 1759 |
+
"normalized": false,
|
| 1760 |
+
"rstrip": false,
|
| 1761 |
+
"single_word": false,
|
| 1762 |
+
"special": false
|
| 1763 |
+
},
|
| 1764 |
+
"255971": {
|
| 1765 |
+
"content": "\t\t\t\t",
|
| 1766 |
+
"lstrip": false,
|
| 1767 |
+
"normalized": false,
|
| 1768 |
+
"rstrip": false,
|
| 1769 |
+
"single_word": false,
|
| 1770 |
+
"special": false
|
| 1771 |
+
},
|
| 1772 |
+
"255972": {
|
| 1773 |
+
"content": "\t\t\t\t\t",
|
| 1774 |
+
"lstrip": false,
|
| 1775 |
+
"normalized": false,
|
| 1776 |
+
"rstrip": false,
|
| 1777 |
+
"single_word": false,
|
| 1778 |
+
"special": false
|
| 1779 |
+
},
|
| 1780 |
+
"255973": {
|
| 1781 |
+
"content": "\t\t\t\t\t\t",
|
| 1782 |
+
"lstrip": false,
|
| 1783 |
+
"normalized": false,
|
| 1784 |
+
"rstrip": false,
|
| 1785 |
+
"single_word": false,
|
| 1786 |
+
"special": false
|
| 1787 |
+
},
|
| 1788 |
+
"255974": {
|
| 1789 |
+
"content": "\t\t\t\t\t\t\t",
|
| 1790 |
+
"lstrip": false,
|
| 1791 |
+
"normalized": false,
|
| 1792 |
+
"rstrip": false,
|
| 1793 |
+
"single_word": false,
|
| 1794 |
+
"special": false
|
| 1795 |
+
},
|
| 1796 |
+
"255975": {
|
| 1797 |
+
"content": "\t\t\t\t\t\t\t\t",
|
| 1798 |
+
"lstrip": false,
|
| 1799 |
+
"normalized": false,
|
| 1800 |
+
"rstrip": false,
|
| 1801 |
+
"single_word": false,
|
| 1802 |
+
"special": false
|
| 1803 |
+
},
|
| 1804 |
+
"255976": {
|
| 1805 |
+
"content": "\t\t\t\t\t\t\t\t\t",
|
| 1806 |
+
"lstrip": false,
|
| 1807 |
+
"normalized": false,
|
| 1808 |
+
"rstrip": false,
|
| 1809 |
+
"single_word": false,
|
| 1810 |
+
"special": false
|
| 1811 |
+
},
|
| 1812 |
+
"255977": {
|
| 1813 |
+
"content": "\t\t\t\t\t\t\t\t\t\t",
|
| 1814 |
+
"lstrip": false,
|
| 1815 |
+
"normalized": false,
|
| 1816 |
+
"rstrip": false,
|
| 1817 |
+
"single_word": false,
|
| 1818 |
+
"special": false
|
| 1819 |
+
},
|
| 1820 |
+
"255978": {
|
| 1821 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t",
|
| 1822 |
+
"lstrip": false,
|
| 1823 |
+
"normalized": false,
|
| 1824 |
+
"rstrip": false,
|
| 1825 |
+
"single_word": false,
|
| 1826 |
+
"special": false
|
| 1827 |
+
},
|
| 1828 |
+
"255979": {
|
| 1829 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1830 |
+
"lstrip": false,
|
| 1831 |
+
"normalized": false,
|
| 1832 |
+
"rstrip": false,
|
| 1833 |
+
"single_word": false,
|
| 1834 |
+
"special": false
|
| 1835 |
+
},
|
| 1836 |
+
"255980": {
|
| 1837 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1838 |
+
"lstrip": false,
|
| 1839 |
+
"normalized": false,
|
| 1840 |
+
"rstrip": false,
|
| 1841 |
+
"single_word": false,
|
| 1842 |
+
"special": false
|
| 1843 |
+
},
|
| 1844 |
+
"255981": {
|
| 1845 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1846 |
+
"lstrip": false,
|
| 1847 |
+
"normalized": false,
|
| 1848 |
+
"rstrip": false,
|
| 1849 |
+
"single_word": false,
|
| 1850 |
+
"special": false
|
| 1851 |
+
},
|
| 1852 |
+
"255982": {
|
| 1853 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1854 |
+
"lstrip": false,
|
| 1855 |
+
"normalized": false,
|
| 1856 |
+
"rstrip": false,
|
| 1857 |
+
"single_word": false,
|
| 1858 |
+
"special": false
|
| 1859 |
+
},
|
| 1860 |
+
"255983": {
|
| 1861 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1862 |
+
"lstrip": false,
|
| 1863 |
+
"normalized": false,
|
| 1864 |
+
"rstrip": false,
|
| 1865 |
+
"single_word": false,
|
| 1866 |
+
"special": false
|
| 1867 |
+
},
|
| 1868 |
+
"255984": {
|
| 1869 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1870 |
+
"lstrip": false,
|
| 1871 |
+
"normalized": false,
|
| 1872 |
+
"rstrip": false,
|
| 1873 |
+
"single_word": false,
|
| 1874 |
+
"special": false
|
| 1875 |
+
},
|
| 1876 |
+
"255985": {
|
| 1877 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1878 |
+
"lstrip": false,
|
| 1879 |
+
"normalized": false,
|
| 1880 |
+
"rstrip": false,
|
| 1881 |
+
"single_word": false,
|
| 1882 |
+
"special": false
|
| 1883 |
+
},
|
| 1884 |
+
"255986": {
|
| 1885 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1886 |
+
"lstrip": false,
|
| 1887 |
+
"normalized": false,
|
| 1888 |
+
"rstrip": false,
|
| 1889 |
+
"single_word": false,
|
| 1890 |
+
"special": false
|
| 1891 |
+
},
|
| 1892 |
+
"255987": {
|
| 1893 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1894 |
+
"lstrip": false,
|
| 1895 |
+
"normalized": false,
|
| 1896 |
+
"rstrip": false,
|
| 1897 |
+
"single_word": false,
|
| 1898 |
+
"special": false
|
| 1899 |
+
},
|
| 1900 |
+
"255988": {
|
| 1901 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1902 |
+
"lstrip": false,
|
| 1903 |
+
"normalized": false,
|
| 1904 |
+
"rstrip": false,
|
| 1905 |
+
"single_word": false,
|
| 1906 |
+
"special": false
|
| 1907 |
+
},
|
| 1908 |
+
"255989": {
|
| 1909 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1910 |
+
"lstrip": false,
|
| 1911 |
+
"normalized": false,
|
| 1912 |
+
"rstrip": false,
|
| 1913 |
+
"single_word": false,
|
| 1914 |
+
"special": false
|
| 1915 |
+
},
|
| 1916 |
+
"255990": {
|
| 1917 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1918 |
+
"lstrip": false,
|
| 1919 |
+
"normalized": false,
|
| 1920 |
+
"rstrip": false,
|
| 1921 |
+
"single_word": false,
|
| 1922 |
+
"special": false
|
| 1923 |
+
},
|
| 1924 |
+
"255991": {
|
| 1925 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1926 |
+
"lstrip": false,
|
| 1927 |
+
"normalized": false,
|
| 1928 |
+
"rstrip": false,
|
| 1929 |
+
"single_word": false,
|
| 1930 |
+
"special": false
|
| 1931 |
+
},
|
| 1932 |
+
"255992": {
|
| 1933 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1934 |
+
"lstrip": false,
|
| 1935 |
+
"normalized": false,
|
| 1936 |
+
"rstrip": false,
|
| 1937 |
+
"single_word": false,
|
| 1938 |
+
"special": false
|
| 1939 |
+
},
|
| 1940 |
+
"255993": {
|
| 1941 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1942 |
+
"lstrip": false,
|
| 1943 |
+
"normalized": false,
|
| 1944 |
+
"rstrip": false,
|
| 1945 |
+
"single_word": false,
|
| 1946 |
+
"special": false
|
| 1947 |
+
},
|
| 1948 |
+
"255994": {
|
| 1949 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1950 |
+
"lstrip": false,
|
| 1951 |
+
"normalized": false,
|
| 1952 |
+
"rstrip": false,
|
| 1953 |
+
"single_word": false,
|
| 1954 |
+
"special": false
|
| 1955 |
+
},
|
| 1956 |
+
"255995": {
|
| 1957 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1958 |
+
"lstrip": false,
|
| 1959 |
+
"normalized": false,
|
| 1960 |
+
"rstrip": false,
|
| 1961 |
+
"single_word": false,
|
| 1962 |
+
"special": false
|
| 1963 |
+
},
|
| 1964 |
+
"255996": {
|
| 1965 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1966 |
+
"lstrip": false,
|
| 1967 |
+
"normalized": false,
|
| 1968 |
+
"rstrip": false,
|
| 1969 |
+
"single_word": false,
|
| 1970 |
+
"special": false
|
| 1971 |
+
},
|
| 1972 |
+
"255997": {
|
| 1973 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1974 |
+
"lstrip": false,
|
| 1975 |
+
"normalized": false,
|
| 1976 |
+
"rstrip": false,
|
| 1977 |
+
"single_word": false,
|
| 1978 |
+
"special": false
|
| 1979 |
+
},
|
| 1980 |
+
"255998": {
|
| 1981 |
+
"content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
|
| 1982 |
+
"lstrip": false,
|
| 1983 |
+
"normalized": false,
|
| 1984 |
+
"rstrip": false,
|
| 1985 |
+
"single_word": false,
|
| 1986 |
+
"special": false
|
| 1987 |
+
},
|
| 1988 |
+
"255999": {
|
| 1989 |
+
"content": "<unused99>",
|
| 1990 |
+
"lstrip": false,
|
| 1991 |
+
"normalized": false,
|
| 1992 |
+
"rstrip": false,
|
| 1993 |
+
"single_word": false,
|
| 1994 |
+
"special": false
|
| 1995 |
+
}
|
| 1996 |
+
},
|
| 1997 |
+
"additional_special_tokens": [
|
| 1998 |
+
"<start_of_turn>",
|
| 1999 |
+
"<end_of_turn>"
|
| 2000 |
+
],
|
| 2001 |
+
"bos_token": "<bos>",
|
| 2002 |
+
"clean_up_tokenization_spaces": false,
|
| 2003 |
+
"cls_token": "<bos>",
|
| 2004 |
+
"eos_token": "<eos>",
|
| 2005 |
+
"extra_special_tokens": {},
|
| 2006 |
+
"mask_token": "<mask>",
|
| 2007 |
+
"model_input_names": [
|
| 2008 |
+
"input_ids",
|
| 2009 |
+
"attention_mask"
|
| 2010 |
+
],
|
| 2011 |
+
"model_max_length": 8192,
|
| 2012 |
+
"pad_token": "<pad>",
|
| 2013 |
+
"padding_side": "right",
|
| 2014 |
+
"sep_token": "<eos>",
|
| 2015 |
+
"spaces_between_special_tokens": false,
|
| 2016 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 2017 |
+
"unk_token": "<unk>"
|
| 2018 |
+
}
|
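
The config above is machine-written tokenizer metadata; the fields that matter at load time are the special-token remaps (cls/sep onto <bos>/<eos>), model_max_length of 8192, and right-side padding. A minimal loading sketch; the local path "." is illustrative and stands in for a checkout of this repository:

from transformers import AutoTokenizer

# Illustrative path: a local clone of this repository.
tok = AutoTokenizer.from_pretrained(".")

print(tok.model_max_length)          # 8192
print(tok.padding_side)              # "right"
print(tok.cls_token, tok.sep_token)  # "<bos>" "<eos>", as remapped above

# <start_of_turn>/<end_of_turn> are listed under additional_special_tokens,
# so they are kept as single ids rather than split by the underlying model.
ids = tok("<start_of_turn>Hallo<end_of_turn>").input_ids
print(tok.convert_ids_to_tokens(ids))
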
train/rd_dataset_loader.py
ADDED
|
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
UWV/wim-synthetic-data-rd dataset loader for multi-label classification.
Carmack-style: minimal abstraction, direct data flow, fast operations.
"""

import numpy as np
from datasets import load_dataset


def load_rd_wim_dataset(max_samples=None, split='train'):
    """
    Load UWV/wim-synthetic-data-rd dataset and encode multi-labels.

    Dataset contains Dutch municipal complaint conversations with two types of labels:
    - onderwerp: What the message is about (96 unique labels)
    - beleving: How the citizen experienced the interaction (26 unique labels)

    Args:
        max_samples: Limit number of samples (None = all 9,351 samples)
        split: Dataset split to load (default: 'train')

    Returns:
        texts: List of conversation strings
        onderwerp_encoded: numpy array [n_samples, 96] - multi-hot encoded topics
        beleving_encoded: numpy array [n_samples, 26] - multi-hot encoded experiences
        onderwerp_labels: List of 96 onderwerp label names (sorted alphabetically)
        beleving_labels: List of 26 beleving label names (sorted alphabetically)
    """

    # Load dataset from HuggingFace
    print(f"Loading UWV/wim-synthetic-data-rd dataset (split={split})...")
    ds = load_dataset('UWV/wim-synthetic-data-rd', split=split)

    # Limit samples if requested
    if max_samples is not None:
        ds = ds.select(range(min(max_samples, len(ds))))

    print(f"Loaded {len(ds)} samples")

    # Extract all unique labels from the entire dataset
    onderwerp_set = set()
    beleving_set = set()

    for sample in ds:
        for label in sample['onderwerp_labels']:
            onderwerp_set.add(label)
        for label in sample['beleving_labels']:
            beleving_set.add(label)

    # Sort labels alphabetically for consistent indexing across runs
    onderwerp_labels = sorted(onderwerp_set)
    beleving_labels = sorted(beleving_set)

    print(f"Found {len(onderwerp_labels)} unique onderwerp labels")
    print(f"Found {len(beleving_labels)} unique beleving labels")

    # Create label -> index mappings
    onderwerp_to_idx = {label: idx for idx, label in enumerate(onderwerp_labels)}
    beleving_to_idx = {label: idx for idx, label in enumerate(beleving_labels)}

    # Encode labels to multi-hot vectors
    n_samples = len(ds)
    n_onderwerp = len(onderwerp_labels)
    n_beleving = len(beleving_labels)

    # Preallocate arrays (faster than appending)
    texts = []
    onderwerp_encoded = np.zeros((n_samples, n_onderwerp), dtype=np.float32)
    beleving_encoded = np.zeros((n_samples, n_beleving), dtype=np.float32)

    # Fill arrays
    for i, sample in enumerate(ds):
        texts.append(sample['text'])

        # Encode onderwerp labels (multi-hot)
        for label in sample['onderwerp_labels']:
            idx = onderwerp_to_idx[label]
            onderwerp_encoded[i, idx] = 1.0

        # Encode beleving labels (multi-hot)
        for label in sample['beleving_labels']:
            idx = beleving_to_idx[label]
            beleving_encoded[i, idx] = 1.0

    print(f"Encoded {n_samples} samples")
    print(f"  onderwerp shape: {onderwerp_encoded.shape}")
    print(f"  beleving shape: {beleving_encoded.shape}")

    return texts, onderwerp_encoded, beleving_encoded, onderwerp_labels, beleving_labels
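
# Round-trip sketch (illustrative, not used elsewhere in this module): a multi-hot
# row maps back to label names through the sorted label lists, e.g.
#   active = [onderwerp_labels[i] for i in np.flatnonzero(onderwerp_encoded[0])]
# print_sample_info below does exactly this, plus formatting.
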
def print_sample_info(texts, onderwerp_encoded, beleving_encoded,
                      onderwerp_labels, beleving_labels, sample_idx=0):
    """
    Print information about a specific sample (useful for debugging).

    Args:
        All outputs from load_rd_wim_dataset()
        sample_idx: Which sample to print (default: 0)
    """
    print(f"\n{'='*60}")
    print(f"SAMPLE {sample_idx}")
    print(f"{'='*60}")
    print(f"Text: {texts[sample_idx][:200]}...")
    print()

    # Get active onderwerp labels
    onderwerp_active = [onderwerp_labels[i] for i, val in enumerate(onderwerp_encoded[sample_idx]) if val == 1]
    print(f"Onderwerp labels ({len(onderwerp_active)}):")
    for label in onderwerp_active:
        print(f"  - {label}")
    print()

    # Get active beleving labels
    beleving_active = [beleving_labels[i] for i, val in enumerate(beleving_encoded[sample_idx]) if val == 1]
    print(f"Beleving labels ({len(beleving_active)}):")
    for label in beleving_active:
        print(f"  - {label}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    # Test the loader
    print("Testing UWV dataset loader...\n")

    # Load small subset for testing
    texts, onderwerp, beleving, onderwerp_names, beleving_names = load_rd_wim_dataset(max_samples=10)

    # Print first sample
    print_sample_info(texts, onderwerp, beleving, onderwerp_names, beleving_names, sample_idx=0)

    # Print statistics
    print("\nDataset Statistics:")
    print(f"  Total samples: {len(texts)}")
    print(f"  Avg onderwerp labels per sample: {onderwerp.sum(axis=1).mean():.2f}")
    print(f"  Avg beleving labels per sample: {beleving.sum(axis=1).mean():.2f}")
    print(f"  Text length range: {min(len(t) for t in texts)} - {max(len(t) for t in texts)} chars")
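
For quick corpus inspection beyond the per-sample averages printed above, per-class positive counts fall directly out of the multi-hot arrays. A short sketch using only the loader's return values:

import numpy as np
from rd_dataset_loader import load_rd_wim_dataset

texts, onderwerp, beleving, onderwerp_names, beleving_names = load_rd_wim_dataset()

# Positive count per onderwerp class across the whole split
counts = onderwerp.sum(axis=0).astype(int)
for name, c in sorted(zip(onderwerp_names, counts), key=lambda t: t[1])[:5]:
    print(f"{name}: {c}")  # the five rarest topic classes
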
train/train_mmbert_dual_soft_f1_simplified.py
ADDED
|
@@ -0,0 +1,953 @@
#!/usr/bin/env python3
"""
Dual-head multi-label PyTorch training script for mmBERT-base.
Two classification heads: onderwerp (96 labels) and beleving (26 labels).
Uses combined F1+BCE loss with weight α (configurable balance).
Features: learnable thresholds, warmup + cosine LR, gradient clipping.
mmBERT: Modern multilingual encoder (1800+ languages, 2x faster than XLM-R).
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
from transformers import AutoTokenizer, AutoModel
import os
import json
import numpy as np
import random
import wandb
from rd_dataset_loader import load_rd_wim_dataset


# Threshold helpers: logit ↔ probability conversions
def prob_to_logit(p: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
    """Convert probabilities to logits (inverse sigmoid). Numerically stable."""
    p = torch.clamp(p, eps, 1 - eps)
    return torch.log(p / (1 - p))


def logit_to_prob(l: torch.Tensor) -> torch.Tensor:
    """Convert logits to probabilities using sigmoid."""
    return torch.sigmoid(l)
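
# Sanity note (illustrative values): the two helpers are exact inverses up to the
# clamp, e.g.
#   p = torch.tensor([0.1, 0.5, 0.9])
#   torch.allclose(logit_to_prob(prob_to_logit(p)), p, atol=1e-6)  # -> True
#   prob_to_logit(torch.tensor(0.5))                               # -> tensor(0.)
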
| 36 |
+
# Set device - MPS for Apple Silicon, fallback to CPU
|
| 37 |
+
def get_device():
|
| 38 |
+
if torch.backends.mps.is_available():
|
| 39 |
+
device = torch.device("mps")
|
| 40 |
+
print("Using MPS (Apple Silicon) for acceleration")
|
| 41 |
+
elif torch.cuda.is_available():
|
| 42 |
+
device = torch.device("cuda")
|
| 43 |
+
print("Using CUDA GPU")
|
| 44 |
+
else:
|
| 45 |
+
device = torch.device("cpu")
|
| 46 |
+
print("Using CPU")
|
| 47 |
+
return device
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def set_seed(seed):
|
| 51 |
+
"""Set random seeds for reproducibility across torch, numpy, and Python random."""
|
| 52 |
+
torch.manual_seed(seed)
|
| 53 |
+
np.random.seed(seed)
|
| 54 |
+
random.seed(seed)
|
| 55 |
+
if torch.cuda.is_available():
|
| 56 |
+
torch.cuda.manual_seed_all(seed)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class mmBERTDualHead(nn.Module):
|
| 60 |
+
"""
|
| 61 |
+
mmBERT with two classification heads for multi-task learning.
|
| 62 |
+
Shared encoder with separate heads for onderwerp and beleving.
|
| 63 |
+
Optionally includes learnable thresholds for each head.
|
| 64 |
+
"""
|
| 65 |
+
def __init__(self, model_name, num_onderwerp, num_beleving, dropout, initial_threshold, use_thresholds: bool = True):
|
| 66 |
+
super().__init__()
|
| 67 |
+
self.use_thresholds = use_thresholds
|
| 68 |
+
|
| 69 |
+
# Shared mmBERT encoder (22 layers, 768 hidden, supports up to 8192 tokens)
|
| 70 |
+
self.encoder = AutoModel.from_pretrained(model_name)
|
| 71 |
+
hidden_size = self.encoder.config.hidden_size # 768 for mmBERT-base
|
| 72 |
+
|
| 73 |
+
# Classification head for onderwerp (topics)
|
| 74 |
+
self.onderwerp_head = nn.Sequential(
|
| 75 |
+
nn.Linear(hidden_size, hidden_size),
|
| 76 |
+
nn.Dropout(dropout),
|
| 77 |
+
nn.ReLU(),
|
| 78 |
+
nn.Linear(hidden_size, num_onderwerp)
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
# Classification head for beleving (experiences)
|
| 82 |
+
self.beleving_head = nn.Sequential(
|
| 83 |
+
nn.Linear(hidden_size, hidden_size),
|
| 84 |
+
nn.Dropout(dropout),
|
| 85 |
+
nn.ReLU(),
|
| 86 |
+
nn.Linear(hidden_size, num_beleving)
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# Thresholds are optionally parameterized in **logit space** (tau_logit).
|
| 90 |
+
# Why: (1) avoids prob clamping and keeps grads healthy, (2) matches the space of logits,
|
| 91 |
+
# (3) lets Soft-F1 express per-class decision boundaries independent of BCE calibration.
|
| 92 |
+
self.onderwerp_tau_logit = None
|
| 93 |
+
self.beleving_tau_logit = None
|
| 94 |
+
if self.use_thresholds:
|
| 95 |
+
init_logit = prob_to_logit(torch.tensor(initial_threshold))
|
| 96 |
+
self.onderwerp_tau_logit = nn.Parameter(torch.full((num_onderwerp,), init_logit))
|
| 97 |
+
self.beleving_tau_logit = nn.Parameter(torch.full((num_beleving,), init_logit))
|
| 98 |
+
|
| 99 |
+
def forward(self, input_ids, attention_mask):
|
| 100 |
+
# Get shared representation from mmBERT encoder
|
| 101 |
+
outputs = self.encoder(
|
| 102 |
+
input_ids=input_ids,
|
| 103 |
+
attention_mask=attention_mask
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
# mmBERT has no pooler_output; use the [CLS] token from last_hidden_state
|
| 107 |
+
# Extract [CLS] token representation (first token in sequence)
|
| 108 |
+
pooled_output = outputs.last_hidden_state[:, 0, :]
|
| 109 |
+
|
| 110 |
+
# Generate predictions from both heads
|
| 111 |
+
onderwerp_logits = self.onderwerp_head(pooled_output)
|
| 112 |
+
beleving_logits = self.beleving_head(pooled_output)
|
| 113 |
+
|
| 114 |
+
return onderwerp_logits, beleving_logits
|
| 115 |
+
|
| 116 |
+
|
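
A minimal smoke test of the dual-head model, sketched under the model card's label counts (96 onderwerp, 26 beleving). It assumes hub access to `jhu-clsp/mmBERT-base` and feeds random token ids, so the outputs are meaningless; only the shapes matter.

```python
import torch

model = mmBERTDualHead(
    model_name="jhu-clsp/mmBERT-base",
    num_onderwerp=96, num_beleving=26,
    dropout=0.2, initial_threshold=0.565, use_thresholds=False,
)
input_ids = torch.randint(0, 1000, (2, 32))           # dummy token ids
attention_mask = torch.ones(2, 32, dtype=torch.long)
on_logits, be_logits = model(input_ids, attention_mask)
print(on_logits.shape, be_logits.shape)               # [2, 96] and [2, 26]
```
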
| 117 |
+
class DutchDualLabelDataset(Dataset):
|
| 118 |
+
"""Dataset for dual-label classification (onderwerp + beleving)."""
|
| 119 |
+
|
| 120 |
+
def __init__(self, texts, onderwerp_labels, beleving_labels, tokenizer, max_length):
|
| 121 |
+
self.texts = texts
|
| 122 |
+
self.onderwerp_labels = onderwerp_labels
|
| 123 |
+
self.beleving_labels = beleving_labels
|
| 124 |
+
self.tokenizer = tokenizer
|
| 125 |
+
self.max_length = max_length
|
| 126 |
+
|
| 127 |
+
def __len__(self):
|
| 128 |
+
return len(self.texts)
|
| 129 |
+
|
| 130 |
+
def __getitem__(self, idx):
|
| 131 |
+
text = self.texts[idx]
|
| 132 |
+
|
| 133 |
+
# Tokenize text
|
| 134 |
+
encoding = self.tokenizer(
|
| 135 |
+
text,
|
| 136 |
+
truncation=True,
|
| 137 |
+
padding='max_length',
|
| 138 |
+
max_length=self.max_length,
|
| 139 |
+
return_tensors='pt'
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
return {
|
| 143 |
+
'input_ids': encoding['input_ids'].squeeze(),
|
| 144 |
+
'attention_mask': encoding['attention_mask'].squeeze(),
|
| 145 |
+
'onderwerp_labels': torch.tensor(self.onderwerp_labels[idx], dtype=torch.float),
|
| 146 |
+
'beleving_labels': torch.tensor(self.beleving_labels[idx], dtype=torch.float)
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
|
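
A hedged sketch of how this dataset is consumed; the two Dutch texts and the label matrices are made-up stand-ins for the arrays that `main()` loads from the RD dataset.

```python
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

texts = ["Mijn uitkering is te laat betaald.", "De brief was onduidelijk."]
onderwerp = np.zeros((2, 96), dtype=np.float32); onderwerp[0, 3] = 1.0
beleving = np.zeros((2, 26), dtype=np.float32); beleving[1, 5] = 1.0

tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base")
ds = DutchDualLabelDataset(texts, onderwerp, beleving, tokenizer, max_length=64)
batch = next(iter(DataLoader(ds, batch_size=2)))
print(batch["input_ids"].shape)         # torch.Size([2, 64])
print(batch["onderwerp_labels"].shape)  # torch.Size([2, 96])
```
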
| 150 |
+
def calculate_soft_f1(logits, labels, logit_threshold=None, temperature=1.0):
|
| 151 |
+
"""
|
| 152 |
+
Calculate differentiable F1 score using sigmoid approximation.
|
| 153 |
+
|
| 154 |
+
If logit_threshold is None: y_soft = sigmoid(logits * T)
|
| 155 |
+
Else: y_soft = sigmoid((logits - logit_threshold) * T)
|
| 156 |
+
|
| 157 |
+
Rationale:
|
| 158 |
+
- With thresholds ON, Soft-F1 learns per-class decision boundaries in logit space.
|
| 159 |
+
- With thresholds OFF, we follow POLA (Principle of Least Astonishment): a single, obvious source (head logits).
|
| 160 |
+
|
| 161 |
+
Args:
|
| 162 |
+
logits: Model predictions (before sigmoid)
|
| 163 |
+
labels: True labels (multi-hot encoded)
|
| 164 |
+
logit_threshold: Optional decision threshold in LOGIT space (None = no shift)
|
| 165 |
+
temperature: Sharpness of sigmoid approximation
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
soft_f1: Differentiable F1 score
|
| 169 |
+
"""
|
| 170 |
+
# Compute shifted logits (or raw logits if threshold is None)
|
| 171 |
+
if logit_threshold is None:
|
| 172 |
+
shifted = logits * temperature
|
| 173 |
+
else:
|
| 174 |
+
shifted = (logits - logit_threshold) * temperature
|
| 175 |
+
|
| 176 |
+
# Soft predictions using sigmoid
|
| 177 |
+
y_pred_soft = torch.sigmoid(shifted)
|
| 178 |
+
|
| 179 |
+
# Soft confusion matrix elements
|
| 180 |
+
TP = (y_pred_soft * labels).sum(dim=-1) # True Positives
|
| 181 |
+
FP = (y_pred_soft * (1 - labels)).sum(dim=-1) # False Positives
|
| 182 |
+
FN = ((1 - y_pred_soft) * labels).sum(dim=-1) # False Negatives
|
| 183 |
+
|
| 184 |
+
# Differentiable F1 score
|
| 185 |
+
eps = 1e-8
|
| 186 |
+
precision = TP / (TP + FP + eps)
|
| 187 |
+
recall = TP / (TP + FN + eps)
|
| 188 |
+
f1 = 2 * precision * recall / (precision + recall + eps)
|
| 189 |
+
|
| 190 |
+
return f1.mean() # Average across batch
|
| 191 |
+
|
| 192 |
+
|
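
A tiny worked example (illustrative values) of why this matters: unlike hard F1, the soft version is differentiable, so `1 − soft_F1` can be backpropagated to the logits.

```python
import torch

logits = torch.tensor([[3.0, -3.0, 0.5]], requires_grad=True)
labels = torch.tensor([[1.0, 0.0, 1.0]])
f1 = calculate_soft_f1(logits, labels, logit_threshold=None, temperature=2.0)
(1 - f1).backward()          # the training loss uses 1 - soft_F1
print(round(f1.item(), 3))   # ≈ 0.927: predictions largely agree with labels
print(logits.grad)           # non-zero everywhere: a usable training signal
```
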
| 193 |
+
def evaluate(model, val_texts, val_onderwerp, val_beleving, tokenizer, device,
|
| 194 |
+
onderwerp_names, beleving_names, num_samples, max_length):
|
| 195 |
+
"""
|
| 196 |
+
Evaluate model on validation set and return metrics.
|
| 197 |
+
|
| 198 |
+
Args:
|
| 199 |
+
model: The trained model
|
| 200 |
+
val_texts: List of validation texts
|
| 201 |
+
val_onderwerp: Validation onderwerp labels
|
| 202 |
+
val_beleving: Validation beleving labels
|
| 203 |
+
tokenizer: Tokenizer for encoding text
|
| 204 |
+
device: Device to run evaluation on
|
| 205 |
+
onderwerp_names: List of onderwerp label names
|
| 206 |
+
beleving_names: List of beleving label names
|
| 207 |
+
num_samples: Number of samples to evaluate (None = all)
|
| 208 |
+
max_length: Max sequence length
|
| 209 |
+
|
| 210 |
+
Returns:
|
| 211 |
+
dict: Dictionary containing all evaluation metrics
|
| 212 |
+
"""
|
| 213 |
+
model.eval()
|
| 214 |
+
|
| 215 |
+
# Determine number of samples to evaluate
|
| 216 |
+
if num_samples is None:
|
| 217 |
+
num_samples = len(val_texts)
|
| 218 |
+
else:
|
| 219 |
+
num_samples = min(num_samples, len(val_texts))
|
| 220 |
+
|
| 221 |
+
# Track metrics
|
| 222 |
+
onderwerp_correct = np.zeros(len(onderwerp_names))
|
| 223 |
+
onderwerp_total = np.zeros(len(onderwerp_names))
|
| 224 |
+
beleving_correct = np.zeros(len(beleving_names))
|
| 225 |
+
beleving_total = np.zeros(len(beleving_names))
|
| 226 |
+
|
| 227 |
+
# Track F1 components
|
| 228 |
+
onderwerp_tp = 0
|
| 229 |
+
onderwerp_fp = 0
|
| 230 |
+
onderwerp_fn = 0
|
| 231 |
+
beleving_tp = 0
|
| 232 |
+
beleving_fp = 0
|
| 233 |
+
beleving_fn = 0
|
| 234 |
+
|
| 235 |
+
with torch.inference_mode():
|
| 236 |
+
for i in range(num_samples):
|
| 237 |
+
# Tokenize
|
| 238 |
+
encoding = tokenizer(
|
| 239 |
+
val_texts[i],
|
| 240 |
+
truncation=True,
|
| 241 |
+
padding='max_length',
|
| 242 |
+
max_length=max_length,
|
| 243 |
+
return_tensors='pt'
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
# Move to device
|
| 247 |
+
input_ids = encoding['input_ids'].to(device)
|
| 248 |
+
attention_mask = encoding['attention_mask'].to(device)
|
| 249 |
+
|
| 250 |
+
# Get predictions
|
| 251 |
+
onderwerp_logits, beleving_logits = model(input_ids, attention_mask)
|
| 252 |
+
|
| 253 |
+
# Convert to probabilities
|
| 254 |
+
onderwerp_probs = torch.sigmoid(onderwerp_logits)
|
| 255 |
+
beleving_probs = torch.sigmoid(beleving_logits)
|
| 256 |
+
|
| 257 |
+
# Apply learned per-class thresholds (if enabled) or fixed 0.5 cutoff
|
| 258 |
+
if model.use_thresholds:
|
| 259 |
+
tau_on = logit_to_prob(model.onderwerp_tau_logit) # [C1]
|
| 260 |
+
tau_be = logit_to_prob(model.beleving_tau_logit) # [C2]
|
| 261 |
+
else:
|
| 262 |
+
# Fixed probability cutoff (POLA-friendly)
|
| 263 |
+
tau_on = torch.full_like(onderwerp_probs[0], 0.5)
|
| 264 |
+
tau_be = torch.full_like(beleving_probs[0], 0.5)
|
| 265 |
+
|
| 266 |
+
onderwerp_pred = (onderwerp_probs > tau_on).squeeze().cpu().numpy()
|
| 267 |
+
beleving_pred = (beleving_probs > tau_be).squeeze().cpu().numpy()
|
| 268 |
+
|
| 269 |
+
# Get true labels
|
| 270 |
+
onderwerp_true = val_onderwerp[i]
|
| 271 |
+
beleving_true = val_beleving[i]
|
| 272 |
+
|
| 273 |
+
# Update F1 components
|
| 274 |
+
onderwerp_tp += ((onderwerp_pred == 1) & (onderwerp_true == 1)).sum()
|
| 275 |
+
onderwerp_fp += ((onderwerp_pred == 1) & (onderwerp_true == 0)).sum()
|
| 276 |
+
onderwerp_fn += ((onderwerp_pred == 0) & (onderwerp_true == 1)).sum()
|
| 277 |
+
|
| 278 |
+
beleving_tp += ((beleving_pred == 1) & (beleving_true == 1)).sum()
|
| 279 |
+
beleving_fp += ((beleving_pred == 1) & (beleving_true == 0)).sum()
|
| 280 |
+
beleving_fn += ((beleving_pred == 0) & (beleving_true == 1)).sum()
|
| 281 |
+
|
| 282 |
+
# Update accuracy metrics
|
| 283 |
+
for j in range(len(onderwerp_names)):
|
| 284 |
+
if onderwerp_pred[j] == onderwerp_true[j]:
|
| 285 |
+
onderwerp_correct[j] += 1
|
| 286 |
+
onderwerp_total[j] += 1
|
| 287 |
+
|
| 288 |
+
for j in range(len(beleving_names)):
|
| 289 |
+
if beleving_pred[j] == beleving_true[j]:
|
| 290 |
+
beleving_correct[j] += 1
|
| 291 |
+
beleving_total[j] += 1
|
| 292 |
+
|
| 293 |
+
# Calculate F1 scores
|
| 294 |
+
epsilon = 1e-8
|
| 295 |
+
onderwerp_precision = onderwerp_tp / (onderwerp_tp + onderwerp_fp + epsilon)
|
| 296 |
+
onderwerp_recall = onderwerp_tp / (onderwerp_tp + onderwerp_fn + epsilon)
|
| 297 |
+
onderwerp_f1_score = 2 * onderwerp_precision * onderwerp_recall / (onderwerp_precision + onderwerp_recall + epsilon)
|
| 298 |
+
|
| 299 |
+
beleving_precision = beleving_tp / (beleving_tp + beleving_fp + epsilon)
|
| 300 |
+
beleving_recall = beleving_tp / (beleving_tp + beleving_fn + epsilon)
|
| 301 |
+
beleving_f1_score = 2 * beleving_precision * beleving_recall / (beleving_precision + beleving_recall + epsilon)
|
| 302 |
+
|
| 303 |
+
# Calculate accuracies
|
| 304 |
+
onderwerp_acc = onderwerp_correct.sum() / onderwerp_total.sum()
|
| 305 |
+
beleving_acc = beleving_correct.sum() / beleving_total.sum()
|
| 306 |
+
|
| 307 |
+
# Get threshold statistics (convert to probability space for human readability)
|
| 308 |
+
if model.use_thresholds:
|
| 309 |
+
onderwerp_thresh_mean = logit_to_prob(model.onderwerp_tau_logit).mean().item()
|
| 310 |
+
onderwerp_thresh_min = logit_to_prob(model.onderwerp_tau_logit).min().item()
|
| 311 |
+
onderwerp_thresh_max = logit_to_prob(model.onderwerp_tau_logit).max().item()
|
| 312 |
+
onderwerp_thresh_std = logit_to_prob(model.onderwerp_tau_logit).std().item()
|
| 313 |
+
beleving_thresh_mean = logit_to_prob(model.beleving_tau_logit).mean().item()
|
| 314 |
+
beleving_thresh_min = logit_to_prob(model.beleving_tau_logit).min().item()
|
| 315 |
+
beleving_thresh_max = logit_to_prob(model.beleving_tau_logit).max().item()
|
| 316 |
+
beleving_thresh_std = logit_to_prob(model.beleving_tau_logit).std().item()
|
| 317 |
+
else:
|
| 318 |
+
# Fixed threshold values
|
| 319 |
+
onderwerp_thresh_mean = onderwerp_thresh_min = onderwerp_thresh_max = 0.5; onderwerp_thresh_std = 0.0  # fixed cutoff has zero spread
|
| 320 |
+
beleving_thresh_mean = beleving_thresh_min = beleving_thresh_max = 0.5; beleving_thresh_std = 0.0  # fixed cutoff has zero spread
|
| 321 |
+
|
| 322 |
+
# Return metrics dictionary
|
| 323 |
+
return {
|
| 324 |
+
'onderwerp_acc': onderwerp_acc,
|
| 325 |
+
'onderwerp_precision': onderwerp_precision,
|
| 326 |
+
'onderwerp_recall': onderwerp_recall,
|
| 327 |
+
'onderwerp_f1': onderwerp_f1_score,
|
| 328 |
+
'beleving_acc': beleving_acc,
|
| 329 |
+
'beleving_precision': beleving_precision,
|
| 330 |
+
'beleving_recall': beleving_recall,
|
| 331 |
+
'beleving_f1': beleving_f1_score,
|
| 332 |
+
'combined_acc': (onderwerp_acc + beleving_acc) / 2,
|
| 333 |
+
'combined_f1': (onderwerp_f1_score + beleving_f1_score) / 2,
|
| 334 |
+
'onderwerp_thresh_mean': onderwerp_thresh_mean,
|
| 335 |
+
'onderwerp_thresh_min': onderwerp_thresh_min,
|
| 336 |
+
'onderwerp_thresh_max': onderwerp_thresh_max,
|
| 337 |
+
'onderwerp_thresh_std': onderwerp_thresh_std,
|
| 338 |
+
'beleving_thresh_mean': beleving_thresh_mean,
|
| 339 |
+
'beleving_thresh_min': beleving_thresh_min,
|
| 340 |
+
'beleving_thresh_max': beleving_thresh_max,
|
| 341 |
+
'beleving_thresh_std': beleving_thresh_std,
|
| 342 |
+
'num_samples_evaluated': num_samples
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
|
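
Because TP/FP/FN are accumulated over all samples and classes before the final division, the F1 reported here is micro-averaged. A quick cross-check on toy predictions against scikit-learn (assumed available; not a dependency of this script):

```python
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 1]])
tp = ((y_pred == 1) & (y_true == 1)).sum()
fp = ((y_pred == 1) & (y_true == 0)).sum()
fn = ((y_pred == 0) & (y_true == 1)).sum()
manual = 2 * tp / (2 * tp + fp + fn)
print(manual, f1_score(y_true, y_pred, average="micro"))  # both 0.666...
```
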
| 346 |
+
def grad_l2_norm(params):
|
| 347 |
+
"""
|
| 348 |
+
Calculate L2 norm of gradients safely (avoids Python int→Tensor addition).
|
| 349 |
+
|
| 350 |
+
Args:
|
| 351 |
+
params: Iterator of parameters (e.g., model.parameters())
|
| 352 |
+
|
| 353 |
+
Returns:
|
| 354 |
+
float: L2 norm of all gradients, or 0.0 if no gradients exist
|
| 355 |
+
"""
|
| 356 |
+
sq_sum = None
|
| 357 |
+
for p in params:
|
| 358 |
+
if p.grad is None:
|
| 359 |
+
continue
|
| 360 |
+
g = p.grad
|
| 361 |
+
val = g.pow(2).sum()
|
| 362 |
+
sq_sum = val if sq_sum is None else (sq_sum + val)
|
| 363 |
+
if sq_sum is None:
|
| 364 |
+
return 0.0
|
| 365 |
+
return sq_sum.sqrt().item()
|
| 366 |
+
|
| 367 |
+
|
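
As a sanity check (illustrative), the value returned by `grad_l2_norm` should match the total norm that `clip_grad_norm_` reports later in the training loop, so the logged gradient norms describe exactly what gets clipped.

```python
import torch
import torch.nn as nn

m = nn.Linear(4, 2)
m(torch.randn(8, 4)).sum().backward()
ours = grad_l2_norm(m.parameters())
theirs = torch.nn.utils.clip_grad_norm_(m.parameters(), max_norm=float("inf"))
print(abs(ours - theirs.item()) < 1e-6)  # True: same L2 norm, no clipping at inf
```
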
| 368 |
+
def make_opt_sched(model, enc_lr, thr_lr, total_steps, warmup_ratio, eta_min):
|
| 369 |
+
"""
|
| 370 |
+
Create optimizer+scheduler for training.
|
| 371 |
+
Optimizer has 1-2 param groups: [0]=encoder+heads, [1]=thresholds (optional).
|
| 372 |
+
"""
|
| 373 |
+
# Group 0: encoder + heads
|
| 374 |
+
encoder_params = [p for n, p in model.named_parameters()
|
| 375 |
+
if not (model.use_thresholds and 'tau_logit' in n)]
|
| 376 |
+
param_groups = [{"params": encoder_params, "lr": enc_lr, "weight_decay": 0.0}]
|
| 377 |
+
|
| 378 |
+
# Group 1 (optional): thresholds
|
| 379 |
+
if model.use_thresholds:
|
| 380 |
+
thr_params = [model.onderwerp_tau_logit, model.beleving_tau_logit]
|
| 381 |
+
param_groups.append({"params": thr_params, "lr": thr_lr, "weight_decay": 0.0})
|
| 382 |
+
|
| 383 |
+
optimizer = torch.optim.AdamW(param_groups)
|
| 384 |
+
|
| 385 |
+
# Warmup → cosine schedule
|
| 386 |
+
warmup_steps = min(max(1, int(warmup_ratio * total_steps)), max(1, total_steps - 1))
|
| 387 |
+
warmup = LinearLR(optimizer, start_factor=1e-10, end_factor=1.0, total_iters=warmup_steps)
|
| 388 |
+
cosine = CosineAnnealingLR(optimizer, T_max=max(1, total_steps - warmup_steps), eta_min=eta_min)
|
| 389 |
+
scheduler = SequentialLR(optimizer, [warmup, cosine], milestones=[warmup_steps])
|
| 390 |
+
|
| 391 |
+
return optimizer, scheduler
|
| 392 |
+
|
| 393 |
+
|
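
To eyeball the warmup → cosine shape without loading the encoder, one can trace the schedule with a throwaway module; `_Stub` is a hypothetical stand-in exposing only what `make_opt_sched` touches.

```python
import torch.nn as nn

class _Stub(nn.Module):
    def __init__(self):
        super().__init__()
        self.use_thresholds = False   # single param group: encoder + heads
        self.lin = nn.Linear(4, 2)

opt, sched = make_opt_sched(_Stub(), enc_lr=8e-5, thr_lr=4e-4,
                            total_steps=100, warmup_ratio=0.1, eta_min=1e-6)
lrs = []
for _ in range(100):
    opt.step()                        # params have no grads; step is a no-op
    sched.step()
    lrs.append(sched.get_last_lr()[0])
print(f"peak={max(lrs):.2e} final={lrs[-1]:.2e}")  # peak ≈ 8e-5, final → 1e-6
```
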
| 394 |
+
def run_epochs(model, tokenizer, train_loader, val_texts, val_onderwerp, val_beleving,
|
| 395 |
+
onderwerp_names, beleving_names, device,
|
| 396 |
+
*, start_epoch, end_epoch, phase_name="train",
|
| 397 |
+
optimizer, scheduler, temperature, alpha,
|
| 398 |
+
max_length, global_step):
|
| 399 |
+
"""
|
| 400 |
+
Run training for a range of epochs.
|
| 401 |
+
|
| 402 |
+
Args:
|
| 403 |
+
model: The model to train
|
| 404 |
+
tokenizer: Tokenizer for text encoding
|
| 405 |
+
train_loader: DataLoader for training batches
|
| 406 |
+
val_texts, val_onderwerp, val_beleving: Validation data
|
| 407 |
+
onderwerp_names, beleving_names: Label names
|
| 408 |
+
device: Device to train on
|
| 409 |
+
start_epoch: Starting epoch (inclusive)
|
| 410 |
+
end_epoch: Ending epoch (exclusive)
|
| 411 |
+
phase_name: Name for logging (default: "train")
|
| 412 |
+
optimizer: Optimizer
|
| 413 |
+
scheduler: LR scheduler
|
| 414 |
+
temperature: Soft-F1 temperature
|
| 415 |
+
alpha: Loss weighting (F1 vs BCE)
|
| 416 |
+
max_length: Max sequence length
|
| 417 |
+
global_step: Starting global step counter
|
| 418 |
+
|
| 419 |
+
Returns:
|
| 420 |
+
Updated global_step
|
| 421 |
+
"""
|
| 422 |
+
num_epochs = end_epoch - start_epoch
|
| 423 |
+
phase_total_steps = max(1, len(train_loader) * num_epochs)
|
| 424 |
+
|
| 425 |
+
model.train()
|
| 426 |
+
|
| 427 |
+
for epoch in range(start_epoch, end_epoch):
|
| 428 |
+
total_loss = 0
|
| 429 |
+
total_onderwerp_f1 = 0
|
| 430 |
+
total_beleving_f1 = 0
|
| 431 |
+
total_bce_loss = 0
|
| 432 |
+
total_f1_loss = 0
|
| 433 |
+
num_batches = 0
|
| 434 |
+
|
| 435 |
+
print(f"\n[{phase_name.upper()}] Epoch {epoch + 1}/{end_epoch}")
|
| 436 |
+
print("-" * 40)
|
| 437 |
+
|
| 438 |
+
for batch_idx, batch in enumerate(train_loader):
|
| 439 |
+
# Move batch to device
|
| 440 |
+
input_ids = batch['input_ids'].to(device)
|
| 441 |
+
attention_mask = batch['attention_mask'].to(device)
|
| 442 |
+
onderwerp_labels = batch['onderwerp_labels'].to(device)
|
| 443 |
+
beleving_labels = batch['beleving_labels'].to(device)
|
| 444 |
+
|
| 445 |
+
# Zero gradients
|
| 446 |
+
optimizer.zero_grad()
|
| 447 |
+
|
| 448 |
+
# Forward pass
|
| 449 |
+
onderwerp_logits, beleving_logits = model(input_ids, attention_mask)
|
| 450 |
+
|
| 451 |
+
# Calculate Soft-F1 for both heads (conditionally pass thresholds)
|
| 452 |
+
onderwerp_f1 = calculate_soft_f1(
|
| 453 |
+
onderwerp_logits, onderwerp_labels,
|
| 454 |
+
model.onderwerp_tau_logit if model.use_thresholds else None,
|
| 455 |
+
temperature
|
| 456 |
+
)
|
| 457 |
+
beleving_f1 = calculate_soft_f1(
|
| 458 |
+
beleving_logits, beleving_labels,
|
| 459 |
+
model.beleving_tau_logit if model.use_thresholds else None,
|
| 460 |
+
temperature
|
| 461 |
+
)
|
| 462 |
+
|
| 463 |
+
# Calculate BCE loss
|
| 464 |
+
# Design choice (POLA):
|
| 465 |
+
# - BCE is computed on raw logits to maintain probability calibration.
|
| 466 |
+
# - Soft-F1 may use a shifted logit (if thresholds ON) to learn F1-friendly boundaries.
|
| 467 |
+
# - If thresholds OFF, Soft-F1 acts directly on logits; there is a single "source of truth".
|
| 468 |
+
# This keeps behavior unsurprising: either (A) calibrated logits + separate boundary learning,
|
| 469 |
+
# or (B) no extra threshold machinery; F1 and BCE both reference the same logits.
|
| 470 |
+
bce_onderwerp = F.binary_cross_entropy_with_logits(onderwerp_logits, onderwerp_labels)
|
| 471 |
+
bce_beleving = F.binary_cross_entropy_with_logits(beleving_logits, beleving_labels)
|
| 472 |
+
|
| 473 |
+
# Combined loss
|
| 474 |
+
f1_loss = (1 - onderwerp_f1) + (1 - beleving_f1)
|
| 475 |
+
bce_loss = bce_onderwerp + bce_beleving
|
| 476 |
+
loss = alpha * (f1_loss / 2) + (1 - alpha) * (bce_loss / 2)
|
| 477 |
+
|
| 478 |
+
# Periodic logging
|
| 479 |
+
if batch_idx % 20 == 0:
|
| 480 |
+
with torch.no_grad():
|
| 481 |
+
# Get predictions (convert thresholds from logit-space to prob-space if enabled)
|
| 482 |
+
onderwerp_probs = torch.sigmoid(onderwerp_logits)
|
| 483 |
+
beleving_probs = torch.sigmoid(beleving_logits)
|
| 484 |
+
if model.use_thresholds:
|
| 485 |
+
tau_on = logit_to_prob(model.onderwerp_tau_logit)
|
| 486 |
+
tau_be = logit_to_prob(model.beleving_tau_logit)
|
| 487 |
+
else:
|
| 488 |
+
tau_on = torch.full_like(onderwerp_probs[0], 0.5)
|
| 489 |
+
tau_be = torch.full_like(beleving_probs[0], 0.5)
|
| 490 |
+
onderwerp_pred = (onderwerp_probs > tau_on).float()
|
| 491 |
+
beleving_pred = (beleving_probs > tau_be).float()
|
| 492 |
+
|
| 493 |
+
# Log actual optimizer param group LRs
|
| 494 |
+
lrs = scheduler.get_last_lr()
|
| 495 |
+
encoder_head_lr = lrs[0] # Param group 0: encoder + heads
|
| 496 |
+
threshold_lr = lrs[1] if len(lrs) > 1 else None # Param group 1: thresholds (optional)
|
| 497 |
+
|
| 498 |
+
# Threshold statistics (convert to probability space for readability)
|
| 499 |
+
if model.use_thresholds:
|
| 500 |
+
onderwerp_thresh_mean = logit_to_prob(model.onderwerp_tau_logit).mean().item()
|
| 501 |
+
onderwerp_thresh_min = logit_to_prob(model.onderwerp_tau_logit).min().item()
|
| 502 |
+
onderwerp_thresh_max = logit_to_prob(model.onderwerp_tau_logit).max().item()
|
| 503 |
+
beleving_thresh_mean = logit_to_prob(model.beleving_tau_logit).mean().item()
|
| 504 |
+
beleving_thresh_min = logit_to_prob(model.beleving_tau_logit).min().item()
|
| 505 |
+
beleving_thresh_max = logit_to_prob(model.beleving_tau_logit).max().item()
|
| 506 |
+
else:
|
| 507 |
+
onderwerp_thresh_mean = onderwerp_thresh_min = onderwerp_thresh_max = 0.5
|
| 508 |
+
beleving_thresh_mean = beleving_thresh_min = beleving_thresh_max = 0.5
|
| 509 |
+
|
| 510 |
+
print(f" Batch {batch_idx + 1} | Step {global_step + 1}/{phase_total_steps}:")
|
| 511 |
+
if threshold_lr is not None:
|
| 512 |
+
print(f" Total loss: {loss.item():.4f} (α={alpha} F1 + {1-alpha} BCE) | LR: enc_head={encoder_head_lr:.2e} thresh={threshold_lr:.2e}")
|
| 513 |
+
else:
|
| 514 |
+
print(f" Total loss: {loss.item():.4f} (α={alpha} F1 + {1-alpha} BCE) | LR: enc_head={encoder_head_lr:.2e}")
|
| 515 |
+
print(f" F1 loss: {(f1_loss/2).item():.4f} | BCE loss: {(bce_loss/2).item():.4f}")
|
| 516 |
+
print(f" Onderwerp F1: {onderwerp_f1.item():.4f} | BCE: {bce_onderwerp.item():.4f} | Thresh: {onderwerp_thresh_mean:.3f} [{onderwerp_thresh_min:.3f}-{onderwerp_thresh_max:.3f}]")
|
| 517 |
+
print(f" Beleving F1: {beleving_f1.item():.4f} | BCE: {bce_beleving.item():.4f} | Thresh: {beleving_thresh_mean:.3f} [{beleving_thresh_min:.3f}-{beleving_thresh_max:.3f}]")
|
| 518 |
+
print(f" Onderwerp preds: {int(onderwerp_pred.sum())} / {int(onderwerp_labels.sum())} true")
|
| 519 |
+
print(f" Beleving preds: {int(beleving_pred.sum())} / {int(beleving_labels.sum())} true")
|
| 520 |
+
|
| 521 |
+
# Log to wandb
|
| 522 |
+
log_dict = {
|
| 523 |
+
"phase": phase_name,
|
| 524 |
+
"train/loss": loss.item(),
|
| 525 |
+
"train/f1_loss": (f1_loss / 2).item(),
|
| 526 |
+
"train/bce_loss": (bce_loss / 2).item(),
|
| 527 |
+
"train/onderwerp_f1": onderwerp_f1.item(),
|
| 528 |
+
"train/onderwerp_bce": bce_onderwerp.item(),
|
| 529 |
+
"train/beleving_f1": beleving_f1.item(),
|
| 530 |
+
"train/beleving_bce": bce_beleving.item(),
|
| 531 |
+
"train/encoder_head_lr": encoder_head_lr,
|
| 532 |
+
"train/onderwerp_threshold_mean": onderwerp_thresh_mean,
|
| 533 |
+
"train/onderwerp_threshold_min": onderwerp_thresh_min,
|
| 534 |
+
"train/onderwerp_threshold_max": onderwerp_thresh_max,
|
| 535 |
+
"train/beleving_threshold_mean": beleving_thresh_mean,
|
| 536 |
+
"train/beleving_threshold_min": beleving_thresh_min,
|
| 537 |
+
"train/beleving_threshold_max": beleving_thresh_max,
|
| 538 |
+
}
|
| 539 |
+
if threshold_lr is not None:
|
| 540 |
+
log_dict["train/threshold_lr"] = threshold_lr
|
| 541 |
+
wandb.log(log_dict, step=global_step)
|
| 542 |
+
|
| 543 |
+
# Backward pass
|
| 544 |
+
loss.backward()
|
| 545 |
+
|
| 546 |
+
# Calculate gradient norms
|
| 547 |
+
with torch.no_grad():
|
| 548 |
+
onderwerp_thresh_grad = (model.onderwerp_tau_logit.grad.abs().mean().item()
|
| 549 |
+
if model.use_thresholds and model.onderwerp_tau_logit.grad is not None else 0.0)
|
| 550 |
+
beleving_thresh_grad = (model.beleving_tau_logit.grad.abs().mean().item()
|
| 551 |
+
if model.use_thresholds and model.beleving_tau_logit.grad is not None else 0.0)
|
| 552 |
+
|
| 553 |
+
encoder_grad_norm = grad_l2_norm(model.encoder.parameters())
|
| 554 |
+
onderwerp_head_grad_norm = grad_l2_norm(model.onderwerp_head.parameters())
|
| 555 |
+
beleving_head_grad_norm = grad_l2_norm(model.beleving_head.parameters())
|
| 556 |
+
global_grad_norm = grad_l2_norm(model.parameters())
|
| 557 |
+
|
| 558 |
+
# Log gradient norms
|
| 559 |
+
wandb.log({
|
| 560 |
+
"phase": phase_name,
|
| 561 |
+
"grads/threshold_onderwerp": onderwerp_thresh_grad,
|
| 562 |
+
"grads/threshold_beleving": beleving_thresh_grad,
|
| 563 |
+
"grads/encoder": encoder_grad_norm,
|
| 564 |
+
"grads/onderwerp_head": onderwerp_head_grad_norm,
|
| 565 |
+
"grads/beleving_head": beleving_head_grad_norm,
|
| 566 |
+
"grads/global_norm": global_grad_norm,
|
| 567 |
+
}, step=global_step)
|
| 568 |
+
|
| 569 |
+
# Gradient clipping
|
| 570 |
+
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
|
| 571 |
+
|
| 572 |
+
# Update weights and LR
|
| 573 |
+
optimizer.step()
|
| 574 |
+
scheduler.step()
|
| 575 |
+
|
| 576 |
+
# Update counters
|
| 577 |
+
global_step += 1
|
| 578 |
+
total_loss += loss.item()
|
| 579 |
+
total_onderwerp_f1 += onderwerp_f1.item()
|
| 580 |
+
total_beleving_f1 += beleving_f1.item()
|
| 581 |
+
total_f1_loss += (f1_loss / 2).item()
|
| 582 |
+
total_bce_loss += (bce_loss / 2).item()
|
| 583 |
+
num_batches += 1
|
| 584 |
+
|
| 585 |
+
# Epoch summary
|
| 586 |
+
avg_loss = total_loss / max(1, num_batches)
|
| 587 |
+
avg_onderwerp_f1 = total_onderwerp_f1 / max(1, num_batches)
|
| 588 |
+
avg_beleving_f1 = total_beleving_f1 / max(1, num_batches)
|
| 589 |
+
avg_f1_loss = total_f1_loss / max(1, num_batches)
|
| 590 |
+
avg_bce_loss = total_bce_loss / max(1, num_batches)
|
| 591 |
+
|
| 592 |
+
# Get current LR for summary
|
| 593 |
+
lrs = scheduler.get_last_lr()
|
| 594 |
+
current_lr = lrs[0] # Display first group LR
|
| 595 |
+
|
| 596 |
+
# Threshold statistics (convert to probability space for readability)
|
| 597 |
+
if model.use_thresholds:
|
| 598 |
+
onderwerp_thresh_mean = logit_to_prob(model.onderwerp_tau_logit).mean().item()
|
| 599 |
+
onderwerp_thresh_std = logit_to_prob(model.onderwerp_tau_logit).std().item()
|
| 600 |
+
beleving_thresh_mean = logit_to_prob(model.beleving_tau_logit).mean().item()
|
| 601 |
+
beleving_thresh_std = logit_to_prob(model.beleving_tau_logit).std().item()
|
| 602 |
+
else:
|
| 603 |
+
onderwerp_thresh_mean = 0.5; onderwerp_thresh_std = 0.0  # fixed cutoff has zero spread
|
| 604 |
+
beleving_thresh_mean = 0.5; beleving_thresh_std = 0.0  # fixed cutoff has zero spread
|
| 605 |
+
|
| 606 |
+
print(f"\n [{phase_name.upper()}] Epoch {epoch + 1} Summary:")
|
| 607 |
+
print(f" Average total loss: {avg_loss:.4f} (α={alpha} F1 + {1-alpha} BCE)")
|
| 608 |
+
print(f" Average F1 loss: {avg_f1_loss:.4f} | Average BCE loss: {avg_bce_loss:.4f}")
|
| 609 |
+
print(f" Average onderwerp F1: {avg_onderwerp_f1:.4f} | Threshold: {onderwerp_thresh_mean:.3f} (σ={onderwerp_thresh_std:.3f})")
|
| 610 |
+
print(f" Average beleving F1: {avg_beleving_f1:.4f} | Threshold: {beleving_thresh_mean:.3f} (σ={beleving_thresh_std:.3f})")
|
| 611 |
+
print(f" Average combined F1: {(avg_onderwerp_f1 + avg_beleving_f1) / 2:.4f}")
|
| 612 |
+
print(f" Current learning rate: {current_lr:.2e}")
|
| 613 |
+
|
| 614 |
+
# Per-epoch validation
|
| 615 |
+
print(f"\n Running validation on 200 samples...")
|
| 616 |
+
val_metrics = evaluate(
|
| 617 |
+
model, val_texts, val_onderwerp, val_beleving, tokenizer, device,
|
| 618 |
+
onderwerp_names, beleving_names, num_samples=200, max_length=max_length
|
| 619 |
+
)
|
| 620 |
+
|
| 621 |
+
# Log validation metrics
|
| 622 |
+
wandb.log({
|
| 623 |
+
"phase": phase_name,
|
| 624 |
+
"val/onderwerp_acc": val_metrics['onderwerp_acc'],
|
| 625 |
+
"val/onderwerp_precision": val_metrics['onderwerp_precision'],
|
| 626 |
+
"val/onderwerp_recall": val_metrics['onderwerp_recall'],
|
| 627 |
+
"val/onderwerp_f1": val_metrics['onderwerp_f1'],
|
| 628 |
+
"val/beleving_acc": val_metrics['beleving_acc'],
|
| 629 |
+
"val/beleving_precision": val_metrics['beleving_precision'],
|
| 630 |
+
"val/beleving_recall": val_metrics['beleving_recall'],
|
| 631 |
+
"val/beleving_f1": val_metrics['beleving_f1'],
|
| 632 |
+
"val/combined_acc": val_metrics['combined_acc'],
|
| 633 |
+
"val/combined_f1": val_metrics['combined_f1'],
|
| 634 |
+
"val/onderwerp_threshold_mean": val_metrics['onderwerp_thresh_mean'],
|
| 635 |
+
"val/beleving_threshold_mean": val_metrics['beleving_thresh_mean'],
|
| 636 |
+
"epoch": epoch + 1
|
| 637 |
+
}, step=global_step)
|
| 638 |
+
|
| 639 |
+
# Log threshold histograms (convert to probability space for readability)
|
| 640 |
+
if model.use_thresholds:
|
| 641 |
+
wandb.log({
|
| 642 |
+
"phase": phase_name,
|
| 643 |
+
"thresholds/onderwerp": wandb.Histogram(logit_to_prob(model.onderwerp_tau_logit).detach().cpu().numpy()),
|
| 644 |
+
"thresholds/beleving": wandb.Histogram(logit_to_prob(model.beleving_tau_logit).detach().cpu().numpy()),
|
| 645 |
+
"epoch": epoch + 1
|
| 646 |
+
}, step=global_step)
|
| 647 |
+
|
| 648 |
+
print(f" Val onderwerp F1: {val_metrics['onderwerp_f1']:.4f} | Val beleving F1: {val_metrics['beleving_f1']:.4f}")
|
| 649 |
+
print(f" Val combined F1: {val_metrics['combined_f1']:.4f}")
|
| 650 |
+
|
| 651 |
+
# Return to training mode
|
| 652 |
+
model.train()
|
| 653 |
+
|
| 654 |
+
return global_step
|
| 655 |
+
|
| 656 |
+
|
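
For concreteness, the weighting inside the loop above with the model card's α = 0.15 (illustrative loss values): BCE carries most of the gradient while Soft-F1 acts as a shaping term.

```python
alpha = 0.15
f1_loss, bce_loss = 0.40, 0.30   # per-head sums, made-up values
loss = alpha * (f1_loss / 2) + (1 - alpha) * (bce_loss / 2)
print(loss)                       # 0.15*0.20 + 0.85*0.15 = 0.1575
```
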
| 657 |
+
def main():
|
| 658 |
+
# Enable TensorFloat32 for better performance on modern NVIDIA GPUs
|
| 659 |
+
if torch.cuda.is_available():
|
| 660 |
+
torch.set_float32_matmul_precision('high')
|
| 661 |
+
|
| 662 |
+
# Initialize device
|
| 663 |
+
device = get_device()
|
| 664 |
+
|
| 665 |
+
# ============== CONFIGURATION FOR WANDB SWEEPS ==============
|
| 666 |
+
# Fixed model configuration (not swept)
|
| 667 |
+
model_name = "jhu-clsp/mmBERT-base"
|
| 668 |
+
|
| 669 |
+
# Sweepable hyperparameters with defaults
|
| 670 |
+
default_config = dict(
|
| 671 |
+
# Reproducibility
|
| 672 |
+
seed=42,
|
| 673 |
+
|
| 674 |
+
# Model architecture
|
| 675 |
+
dropout=0.2,
|
| 676 |
+
initial_threshold=0.565,
|
| 677 |
+
max_length=1408,
|
| 678 |
+
|
| 679 |
+
# Training switches
|
| 680 |
+
use_thresholds=False, # If False: no learnable thresholds; Soft-F1 uses raw logits
|
| 681 |
+
|
| 682 |
+
# Training
|
| 683 |
+
encoder_peak_lr=8e-5,
|
| 684 |
+
threshold_lr_mult=5.0, # Threshold LR = encoder_peak_lr * threshold_lr_mult
|
| 685 |
+
num_epochs=15,
|
| 686 |
+
batch_size=16,
|
| 687 |
+
|
| 688 |
+
# Loss function
|
| 689 |
+
alpha=0.15, # Weight for F1 loss in combined loss (0.5 = balanced)
|
| 690 |
+
temperature=2.0, # Sigmoid smoothing (lower = softer, higher = sharper)
|
| 691 |
+
|
| 692 |
+
# LR schedule
|
| 693 |
+
warmup_ratio=0.1, # 10% warmup
|
| 694 |
+
min_lr=1e-6,
|
| 695 |
+
)
|
| 696 |
+
|
| 697 |
+
# Initialize wandb and get config (allows sweep agent to override defaults)
|
| 698 |
+
wandb.init(project="wim-multilabel-mmbert", config=default_config)
|
| 699 |
+
cfg = wandb.config
|
| 700 |
+
|
| 701 |
+
# Set seed for reproducibility (before loading data)
|
| 702 |
+
set_seed(cfg.seed)
|
| 703 |
+
|
| 704 |
+
# Load RD dataset
|
| 705 |
+
print("\nLoading FULL RD dataset (9,351 samples)...")
|
| 706 |
+
texts, onderwerp, beleving, onderwerp_names, beleving_names = load_rd_wim_dataset(
|
| 707 |
+
max_samples=None # Using full dataset for better training
|
| 708 |
+
)
|
| 709 |
+
|
| 710 |
+
print(f"\nDataset loaded:")
|
| 711 |
+
print(f" Samples: {len(texts)}")
|
| 712 |
+
print(f" Onderwerp labels: {len(onderwerp_names)}")
|
| 713 |
+
print(f" Beleving labels: {len(beleving_names)}")
|
| 714 |
+
print(f" Avg onderwerp per sample: {onderwerp.sum(axis=1).mean():.2f}")
|
| 715 |
+
print(f" Avg beleving per sample: {beleving.sum(axis=1).mean():.2f}")
|
| 716 |
+
|
| 717 |
+
# Unpack hyperparameters from wandb.config
|
| 718 |
+
dropout = cfg.dropout
|
| 719 |
+
initial_threshold = cfg.initial_threshold
|
| 720 |
+
max_length = cfg.max_length
|
| 721 |
+
encoder_peak_lr = cfg.encoder_peak_lr
|
| 722 |
+
threshold_peak_lr = encoder_peak_lr * cfg.threshold_lr_mult # Derived from multiplier
|
| 723 |
+
num_epochs = cfg.num_epochs
|
| 724 |
+
batch_size = cfg.batch_size
|
| 725 |
+
alpha = cfg.alpha
|
| 726 |
+
temperature = cfg.temperature
|
| 727 |
+
warmup_ratio = cfg.warmup_ratio
|
| 728 |
+
min_lr = cfg.min_lr
|
| 729 |
+
# ================================================================
|
| 730 |
+
|
| 731 |
+
# Load tokenizer and create model
|
| 732 |
+
print("\nLoading mmBERT-base tokenizer and creating dual-head model...")
|
| 733 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 734 |
+
|
| 735 |
+
model = mmBERTDualHead(
|
| 736 |
+
model_name=model_name,
|
| 737 |
+
num_onderwerp=len(onderwerp_names),
|
| 738 |
+
num_beleving=len(beleving_names),
|
| 739 |
+
dropout=dropout,
|
| 740 |
+
initial_threshold=initial_threshold,
|
| 741 |
+
use_thresholds=cfg.use_thresholds
|
| 742 |
+
)
|
| 743 |
+
|
| 744 |
+
# Move model to device
|
| 745 |
+
model = model.to(device)
|
| 746 |
+
|
| 747 |
+
# Ensure thresholds match encoder dtype for mixed precision safety
|
| 748 |
+
encoder_dtype = next(model.encoder.parameters()).dtype
|
| 749 |
+
with torch.no_grad():
|
| 750 |
+
if model.use_thresholds:
|
| 751 |
+
model.onderwerp_tau_logit.data = model.onderwerp_tau_logit.data.to(encoder_dtype)
|
| 752 |
+
model.beleving_tau_logit.data = model.beleving_tau_logit.data.to(encoder_dtype)
|
| 753 |
+
|
| 754 |
+
print(f"Model loaded and moved to {device}")
|
| 755 |
+
print(f" Onderwerp head: {len(onderwerp_names)} outputs")
|
| 756 |
+
print(f" Beleving head: {len(beleving_names)} outputs")
|
| 757 |
+
|
| 758 |
+
# Split data into train/val (80/20)
|
| 759 |
+
split_idx = int(0.8 * len(texts))
|
| 760 |
+
train_texts = texts[:split_idx]
|
| 761 |
+
train_onderwerp = onderwerp[:split_idx]
|
| 762 |
+
train_beleving = beleving[:split_idx]
|
| 763 |
+
val_texts = texts[split_idx:]
|
| 764 |
+
val_onderwerp = onderwerp[split_idx:]
|
| 765 |
+
val_beleving = beleving[split_idx:]
|
| 766 |
+
|
| 767 |
+
print(f"\nData split:")
|
| 768 |
+
print(f" Train: {len(train_texts)} samples")
|
| 769 |
+
print(f" Val: {len(val_texts)} samples")
|
| 770 |
+
|
| 771 |
+
# Create training dataset and dataloader
|
| 772 |
+
train_dataset = DutchDualLabelDataset(
|
| 773 |
+
train_texts, train_onderwerp, train_beleving, tokenizer, max_length
|
| 774 |
+
)
|
| 775 |
+
|
| 776 |
+
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
|
| 777 |
+
|
| 778 |
+
steps_per_epoch = len(train_loader)
|
| 779 |
+
total_training_steps = steps_per_epoch * num_epochs
|
| 780 |
+
|
| 781 |
+
# Log derived/computed values to wandb (sweepable params already in config)
|
| 782 |
+
wandb.config.update({
|
| 783 |
+
# Fixed model configuration
|
| 784 |
+
"model_name": model_name,
|
| 785 |
+
"num_onderwerp": len(onderwerp_names),
|
| 786 |
+
"num_beleving": len(beleving_names),
|
| 787 |
+
|
| 788 |
+
# Derived training params
|
| 789 |
+
"threshold_peak_lr": threshold_peak_lr,
|
| 790 |
+
"total_training_steps": total_training_steps,
|
| 791 |
+
|
| 792 |
+
# Dataset info
|
| 793 |
+
"train_samples": len(train_texts),
|
| 794 |
+
"val_samples": len(val_texts),
|
| 795 |
+
"total_samples": len(texts),
|
| 796 |
+
"split_ratio": 0.8,
|
| 797 |
+
|
| 798 |
+
# Loss configuration (derived from alpha)
|
| 799 |
+
"loss_type": "combined_f1_bce",
|
| 800 |
+
"f1_weight": alpha,
|
| 801 |
+
"bce_weight": 1 - alpha,
|
| 802 |
+
|
| 803 |
+
# Fixed features
|
| 804 |
+
"learnable_thresholds": cfg.use_thresholds,
|
| 805 |
+
"per_class_thresholds": cfg.use_thresholds,
|
| 806 |
+
"gradient_clipping": True,
|
| 807 |
+
"max_grad_norm": 1.0,
|
| 808 |
+
}, allow_val_change=True)
|
| 809 |
+
|
| 810 |
+
# Print training info
|
| 811 |
+
print(f"\nStarting training for {num_epochs} total epochs with COMBINED F1+BCE LOSS...")
|
| 812 |
+
print(f"Loss formula: {alpha} * (1-F1) + {1-alpha} * BCE")
|
| 813 |
+
print(f"Temperature for Soft-F1: {temperature} | Initial thresholds: {initial_threshold}")
|
| 814 |
+
print(f"Batch size: {batch_size} | Total training batches: {steps_per_epoch}")
|
| 815 |
+
print(f"Learnable thresholds enabled for both onderwerp and beleving heads")
|
| 816 |
+
print("=" * 60)
|
| 817 |
+
|
| 818 |
+
# ===== SINGLE-PHASE TRAINING =====
|
| 819 |
+
print(f"\n{'='*60}")
|
| 820 |
+
print(f"TRAINING: {num_epochs} epoch(s)")
|
| 821 |
+
print(f"{'='*60}")
|
| 822 |
+
|
| 823 |
+
# Create optimizer and scheduler
|
| 824 |
+
optimizer, scheduler = make_opt_sched(
|
| 825 |
+
model,
|
| 826 |
+
enc_lr=encoder_peak_lr,
|
| 827 |
+
thr_lr=threshold_peak_lr,
|
| 828 |
+
total_steps=total_training_steps,
|
| 829 |
+
warmup_ratio=warmup_ratio,
|
| 830 |
+
eta_min=min_lr
|
| 831 |
+
)
|
| 832 |
+
|
| 833 |
+
# Run training
|
| 834 |
+
global_step = run_epochs(
|
| 835 |
+
model, tokenizer, train_loader,
|
| 836 |
+
val_texts, val_onderwerp, val_beleving,
|
| 837 |
+
onderwerp_names, beleving_names, device,
|
| 838 |
+
start_epoch=0, end_epoch=num_epochs,
|
| 839 |
+
phase_name="train",
|
| 840 |
+
optimizer=optimizer, scheduler=scheduler,
|
| 841 |
+
temperature=temperature, alpha=alpha,
|
| 842 |
+
max_length=max_length, global_step=0
|
| 843 |
+
)
|
| 844 |
+
|
| 845 |
+
# Training complete
|
| 846 |
+
print(f"\n{'='*60}")
|
| 847 |
+
print("TRAINING COMPLETE")
|
| 848 |
+
print(f"{'='*60}")
|
| 849 |
+
|
| 850 |
+
# Final evaluation on larger validation set
|
| 851 |
+
print("\n" + "=" * 60)
|
| 852 |
+
print("FINAL EVALUATION ON VALIDATION SET")
|
| 853 |
+
print("=" * 60)
|
| 854 |
+
|
| 855 |
+
print(f"\nEvaluating on 500 validation samples...")
|
| 856 |
+
final_metrics = evaluate(
|
| 857 |
+
model, val_texts, val_onderwerp, val_beleving, tokenizer, device,
|
| 858 |
+
onderwerp_names, beleving_names, num_samples=500, max_length=max_length
|
| 859 |
+
)
|
| 860 |
+
|
| 861 |
+
# Print overall metrics
|
| 862 |
+
print("\n" + "=" * 60)
|
| 863 |
+
print(f"FINAL METRICS (on {final_metrics['num_samples_evaluated']} validation samples)")
|
| 864 |
+
print("-" * 40)
|
| 865 |
+
|
| 866 |
+
print(f" Onderwerp:")
|
| 867 |
+
print(f" Accuracy: {final_metrics['onderwerp_acc']:.1%}")
|
| 868 |
+
print(f" Precision: {final_metrics['onderwerp_precision']:.3f}")
|
| 869 |
+
print(f" Recall: {final_metrics['onderwerp_recall']:.3f}")
|
| 870 |
+
print(f" F1 Score: {final_metrics['onderwerp_f1']:.3f}")
|
| 871 |
+
|
| 872 |
+
print(f"\n Beleving:")
|
| 873 |
+
print(f" Accuracy: {final_metrics['beleving_acc']:.1%}")
|
| 874 |
+
print(f" Precision: {final_metrics['beleving_precision']:.3f}")
|
| 875 |
+
print(f" Recall: {final_metrics['beleving_recall']:.3f}")
|
| 876 |
+
print(f" F1 Score: {final_metrics['beleving_f1']:.3f}")
|
| 877 |
+
|
| 878 |
+
print(f"\n Combined:")
|
| 879 |
+
print(f" Average Accuracy: {final_metrics['combined_acc']:.1%}")
|
| 880 |
+
print(f" Average F1: {final_metrics['combined_f1']:.3f}")
|
| 881 |
+
|
| 882 |
+
# Log final metrics to wandb
|
| 883 |
+
wandb.log({
|
| 884 |
+
"final/onderwerp_acc": final_metrics['onderwerp_acc'],
|
| 885 |
+
"final/onderwerp_precision": final_metrics['onderwerp_precision'],
|
| 886 |
+
"final/onderwerp_recall": final_metrics['onderwerp_recall'],
|
| 887 |
+
"final/onderwerp_f1": final_metrics['onderwerp_f1'],
|
| 888 |
+
"final/beleving_acc": final_metrics['beleving_acc'],
|
| 889 |
+
"final/beleving_precision": final_metrics['beleving_precision'],
|
| 890 |
+
"final/beleving_recall": final_metrics['beleving_recall'],
|
| 891 |
+
"final/beleving_f1": final_metrics['beleving_f1'],
|
| 892 |
+
"final/combined_acc": final_metrics['combined_acc'],
|
| 893 |
+
"final/combined_f1": final_metrics['combined_f1'],
|
| 894 |
+
}, step=global_step)
|
| 895 |
+
|
| 896 |
+
print("\n" + "=" * 60)
|
| 897 |
+
print("Training complete! 🎉")
|
| 898 |
+
print("mmBERT-base dual-head architecture with balanced F1+BCE loss")
|
| 899 |
+
print(f"Loss formula: {alpha} * (1-F1) + {1-alpha} * BCE")
|
| 900 |
+
print(f"Temperature: {temperature}")
|
| 901 |
+
if cfg.use_thresholds:
|
| 902 |
+
print(f"Learned per-class thresholds:")
|
| 903 |
+
print(f" Onderwerp ({len(onderwerp_names)} classes): mean={final_metrics['onderwerp_thresh_mean']:.3f} [{final_metrics['onderwerp_thresh_min']:.3f}-{final_metrics['onderwerp_thresh_max']:.3f}] σ={final_metrics['onderwerp_thresh_std']:.3f}")
|
| 904 |
+
print(f" Beleving ({len(beleving_names)} classes): mean={final_metrics['beleving_thresh_mean']:.3f} [{final_metrics['beleving_thresh_min']:.3f}-{final_metrics['beleving_thresh_max']:.3f}] σ={final_metrics['beleving_thresh_std']:.3f}")
|
| 905 |
+
else:
|
| 906 |
+
print("Thresholds disabled (fixed cutoff τ=0.5 for both heads).")
|
| 907 |
+
print(f"With gradient clipping (max_norm=1.0) and warmup LR schedule")
|
| 908 |
+
print(f"Full dataset: {len(texts)} samples | Batch size: {batch_size} | Epochs: {num_epochs}")
|
| 909 |
+
print(f"mmBERT: Modern multilingual encoder (1800+ languages, max_length: {max_length})")
|
| 910 |
+
|
| 911 |
+
# Save final model weights (minimal model saving)
|
| 912 |
+
save_path = "mmbert_dual_head_final.pt"
|
| 913 |
+
torch.save(model.state_dict(), save_path)
|
| 914 |
+
print(f"\nModel weights saved to {save_path}")
|
| 915 |
+
|
| 916 |
+
# Save Hugging Face-compatible checkpoint (encoder + tokenizer + custom heads)
|
| 917 |
+
hf_dir = "mmbert_dual_head_hf"
|
| 918 |
+
os.makedirs(hf_dir, exist_ok=True)
|
| 919 |
+
# Save base encoder and tokenizer in HF format
|
| 920 |
+
model.encoder.save_pretrained(hf_dir)
|
| 921 |
+
tokenizer.save_pretrained(hf_dir)
|
| 922 |
+
# Save custom heads and metadata alongside
|
| 923 |
+
head_state = {
|
| 924 |
+
"onderwerp_head_state": model.onderwerp_head.state_dict(),
|
| 925 |
+
"beleving_head_state": model.beleving_head.state_dict(),
|
| 926 |
+
"use_thresholds": model.use_thresholds,
|
| 927 |
+
"num_onderwerp": len(onderwerp_names),
|
| 928 |
+
"num_beleving": len(beleving_names),
|
| 929 |
+
"dropout": dropout,
|
| 930 |
+
"max_length": max_length,
|
| 931 |
+
"alpha": alpha,
|
| 932 |
+
"temperature": temperature,
|
| 933 |
+
"model_name": model_name,
|
| 934 |
+
}
|
| 935 |
+
if model.use_thresholds:
|
| 936 |
+
head_state["onderwerp_tau_logit"] = model.onderwerp_tau_logit.detach().cpu()
|
| 937 |
+
head_state["beleving_tau_logit"] = model.beleving_tau_logit.detach().cpu()
|
| 938 |
+
torch.save(head_state, os.path.join(hf_dir, "dual_head_state.pt"))
|
| 939 |
+
# Save label names for convenience
|
| 940 |
+
with open(os.path.join(hf_dir, "label_names.json"), "w") as f:
|
| 941 |
+
json.dump({
|
| 942 |
+
"onderwerp": list(map(str, onderwerp_names)),
|
| 943 |
+
"beleving": list(map(str, beleving_names))
|
| 944 |
+
}, f, ensure_ascii=False, indent=2)
|
| 945 |
+
print(f"HF-compatible checkpoint saved to '{hf_dir}' (encoder+tokenizer), with heads in dual_head_state.pt")
|
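
A hedged sketch of reloading this checkpoint for inference (the repo ships `inference_mmbert_hf_example.py` for the real thing; this simply mirrors the state-dict layout saved above):

```python
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

hf_dir = "mmbert_dual_head_hf"
encoder = AutoModel.from_pretrained(hf_dir)
tokenizer = AutoTokenizer.from_pretrained(hf_dir)
state = torch.load(f"{hf_dir}/dual_head_state.pt", map_location="cpu")

def make_head(num_out):
    h = encoder.config.hidden_size
    return nn.Sequential(nn.Linear(h, h), nn.Dropout(state["dropout"]),
                         nn.ReLU(), nn.Linear(h, num_out))

onderwerp_head = make_head(state["num_onderwerp"])
onderwerp_head.load_state_dict(state["onderwerp_head_state"])
beleving_head = make_head(state["num_beleving"])
beleving_head.load_state_dict(state["beleving_head_state"])
```
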
| 946 |
+
|
| 947 |
+
# Finish wandb run
|
| 948 |
+
wandb.finish()
|
| 949 |
+
print("\nWandB logging completed and run finished.")
|
| 950 |
+
|
| 951 |
+
|
| 952 |
+
if __name__ == "__main__":
|
| 953 |
+
main()
|
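
Since all hyperparameters flow through `wandb.config`, the script is sweep-ready as written. A hedged sketch of driving it from a sweep (search ranges are illustrative; the project name matches the `wandb.init` call in `main()`):

```python
import wandb

sweep_config = {
    "method": "bayes",
    "metric": {"name": "val/combined_f1", "goal": "maximize"},
    "parameters": {
        "alpha": {"min": 0.05, "max": 0.5},
        "encoder_peak_lr": {"min": 1e-5, "max": 2e-4},
        "temperature": {"values": [1.0, 2.0, 4.0]},
    },
}
sweep_id = wandb.sweep(sweep_config, project="wim-multilabel-mmbert")
wandb.agent(sweep_id, function=main, count=10)  # one main() run per trial
```
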