Replace "No subtopic found" label with an empty list
Browse files
train/rd_dataset_loader.py
CHANGED
|
@@ -40,6 +40,15 @@ def load_rd_wim_dataset(max_samples=None, split='train', filter_calamity=True):
|
|
| 40 |
filtered_len = len(ds)
|
| 41 |
print(f"Filtered out {original_len - filtered_len} calamity samples ({filtered_len} remaining)")
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Limit samples if requested
|
| 44 |
if max_samples is not None:
|
| 45 |
ds = ds.select(range(min(max_samples, len(ds))))
|
|
|
|
| 40 |
filtered_len = len(ds)
|
| 41 |
print(f"Filtered out {original_len - filtered_len} calamity samples ({filtered_len} remaining)")
|
| 42 |
|
| 43 |
+
# Replace "No subtopic found" with empty list
|
| 44 |
+
original_len_before_replacement = len(ds)
|
| 45 |
+
ds = ds.map(lambda x: {
|
| 46 |
+
**x,
|
| 47 |
+
'onderwerp_labels': [] if x['onderwerp_labels'] == ['No subtopic found'] else x['onderwerp_labels']
|
| 48 |
+
})
|
| 49 |
+
no_subtopic_count = sum(1 for sample in ds if len(sample['onderwerp_labels']) == 0)
|
| 50 |
+
print(f"Replaced 'No subtopic found' with empty list for samples with no valid subtopic ({no_subtopic_count} samples)")
|
| 51 |
+
|
| 52 |
# Limit samples if requested
|
| 53 |
if max_samples is not None:
|
| 54 |
ds = ds.select(range(min(max_samples, len(ds))))
|