Add extra dataset, also filter no subtopic from beleving
train/rd_dataset_loader.py CHANGED (+47 -18)
@@ -5,21 +5,25 @@ Carmack-style: minimal abstraction, direct data flow, fast operations.
 """
 
 import numpy as np
-from datasets import load_dataset
+from datasets import load_dataset, concatenate_datasets
 
 
 def load_rd_wim_dataset(max_samples=None, split='train', filter_calamity=True):
     """
-    Load UWV/wim-synthetic-data-rd dataset and encode multi-labels.
-
+    Load combined UWV datasets and encode multi-labels.
+
+    Combines two datasets:
+    - UWV/wim-synthetic-data-rd: Original RD dataset
+    - UWV/wim_synthetic_data_for_testing_split_labels: Validated testing dataset
+
     Dataset contains Dutch municipal complaint conversations with two types of labels:
-    - onderwerp: What the message is about
-    - beleving: How the citizen experienced the interaction
+    - onderwerp: What the message is about
+    - beleving: How the citizen experienced the interaction
 
     Args:
         max_samples: Limit number of samples (None = all samples)
         split: Dataset split to load (default: 'train')
-        filter_calamity: If True, exclude samples with is_calamity=True (default: True)
+        filter_calamity: If True, exclude samples with is_calamity=True from RD dataset (default: True)
 
     Returns:
         texts: List of conversation strings
@@ -29,25 +33,50 @@ def load_rd_wim_dataset(max_samples=None, split='train', filter_calamity=True):
         beleving_labels: List of beleving label names (sorted alphabetically)
     """
 
-    # Load dataset
+    # Load RD dataset
     print(f"Loading UWV/wim-synthetic-data-rd dataset (split={split})...")
-    ds = load_dataset('UWV/wim-synthetic-data-rd', split=split)
-
+    ds_rd = load_dataset('UWV/wim-synthetic-data-rd', split=split)
+
     # Filter out calamity samples if requested
     if filter_calamity:
-        original_len = len(ds)
-        ds = ds.filter(lambda x: not x['is_calamity'])
-        filtered_len = len(ds)
+        original_len = len(ds_rd)
+        ds_rd = ds_rd.filter(lambda x: not x['is_calamity'])
+        filtered_len = len(ds_rd)
         print(f"Filtered out {original_len - filtered_len} calamity samples ({filtered_len} remaining)")
-
-    # Replace "No subtopic found" with empty list
-
+
+    # Keep only essential columns from RD dataset
+    ds_rd = ds_rd.select_columns(['text', 'onderwerp_labels', 'beleving_labels'])
+    print(f"RD dataset: {len(ds_rd)} samples")
+
+    # Load testing dataset
+    print(f"Loading UWV/wim_synthetic_data_for_testing_split_labels dataset (split={split})...")
+    ds_test = load_dataset('UWV/wim_synthetic_data_for_testing_split_labels', split=split)
+
+    # Rename columns to match RD dataset structure
+    ds_test = ds_test.map(lambda x: {
+        'text': x['Synthetic Text'],
+        'onderwerp_labels': x['validated_onderwerp_labels'],
+        'beleving_labels': x['validated_beleving_labels']
+    }, remove_columns=ds_test.column_names)
+    print(f"Testing dataset: {len(ds_test)} samples")
+
+    # Concatenate datasets
+    ds = concatenate_datasets([ds_rd, ds_test])
+    print(f"Combined dataset: {len(ds)} samples")
+
+    # Shuffle with fixed seed for reproducibility
+    ds = ds.shuffle(seed=42)
+    print(f"Shuffled combined dataset")
+
+    # Replace "No subtopic found" with empty list (for both onderwerp and beleving)
     ds = ds.map(lambda x: {
         **x,
-        'onderwerp_labels': [] if x['onderwerp_labels'] == ['No subtopic found'] else x['onderwerp_labels']
+        'onderwerp_labels': [] if x['onderwerp_labels'] == ['No subtopic found'] else x['onderwerp_labels'],
+        'beleving_labels': [] if x['beleving_labels'] == ['No subtopic found'] else x['beleving_labels']
    })
-    no_subtopic_count = sum(1 for sample in ds if len(sample['onderwerp_labels']) == 0)
-    print(f"Replaced 'No subtopic found' with empty list: {no_subtopic_count} samples")
+    no_onderwerp_count = sum(1 for sample in ds if len(sample['onderwerp_labels']) == 0)
+    no_beleving_count = sum(1 for sample in ds if len(sample['beleving_labels']) == 0)
+    print(f"Replaced 'No subtopic found' with empty list: {no_onderwerp_count} onderwerp, {no_beleving_count} beleving")
 
     # Limit samples if requested
    if max_samples is not None: