yhavinga committed on
Commit
7a24f2a
·
1 Parent(s): 13d4fa0

Replace no subtopic found with empty list

Browse files
Files changed (1) hide show
  1. train/rd_dataset_loader.py +9 -0
train/rd_dataset_loader.py CHANGED
@@ -40,6 +40,15 @@ def load_rd_wim_dataset(max_samples=None, split='train', filter_calamity=True):
40
  filtered_len = len(ds)
41
  print(f"Filtered out {original_len - filtered_len} calamity samples ({filtered_len} remaining)")
42
 
 
 
 
 
 
 
 
 
 
43
  # Limit samples if requested
44
  if max_samples is not None:
45
  ds = ds.select(range(min(max_samples, len(ds))))
 
40
  filtered_len = len(ds)
41
  print(f"Filtered out {original_len - filtered_len} calamity samples ({filtered_len} remaining)")
42
 
43
+ # Replace "No subtopic found" with empty list
44
+ original_len_before_replacement = len(ds)
45
+ ds = ds.map(lambda x: {
46
+ **x,
47
+ 'onderwerp_labels': [] if x['onderwerp_labels'] == ['No subtopic found'] else x['onderwerp_labels']
48
+ })
49
+ no_subtopic_count = sum(1 for sample in ds if len(sample['onderwerp_labels']) == 0)
50
+ print(f"Replaced 'No subtopic found' with empty list for samples with no valid subtopic ({no_subtopic_count} samples)")
51
+
52
  # Limit samples if requested
53
  if max_samples is not None:
54
  ds = ds.select(range(min(max_samples, len(ds))))