| import pandas as pd | |
| import numpy as np | |
| import random | |
# Seed the global generator with a fixed value so that the random
# train/dev/test assignment below is repeatable from run to run.
random.seed(1996)

# Input corpora (CSV files).
CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
CORPUS_ALL = "data/migration/corpus_all.csv"

# Per-row probability of landing in the dev / test split; every row that
# falls in neither is assigned to train.
RATIO_DEV = 0.05
RATIO_TEST = 0.25
def preprocess_annotated():
    """Split the annotated corpus into train/dev/test CSV files.

    Reads ``CORPUS_ANNOTATED`` (decoded as latin1) and assigns every row
    independently to one split by drawing ``random.random()`` once per row:
    dev if the draw is below ``RATIO_DEV``, test if it is below
    ``RATIO_DEV + RATIO_TEST``, train otherwise. Split sizes are therefore
    only approximately proportional to the ratios.

    The three subsets are written (including the DataFrame index) to
    ``output/migration/preprocess/annotations_{train,dev,test}.csv``; the
    output directory is created if it does not exist yet.

    The draws come from the global ``random`` generator, so the resulting
    split depends on that generator's seed and on any draws made before
    this function is called.

    Returns:
        None. Progress is reported on stdout.
    """
    import os  # function-scope import: only needed to create the output dir

    out_dir = "output/migration/preprocess"

    print("Loading corpus...")
    df = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    print(f"\tfound {len(df)} annotated headlines")

    print("Making random train/dev/test split...")
    # Insertion order (train, dev, test) fixes the order of the report
    # lines and of the files written below.
    splits = {"train": [], "dev": [], "test": []}
    for position in range(len(df)):
        # Exactly one draw per row, in row order, so a given seed always
        # yields the same assignment.
        draw = random.random()
        if draw < RATIO_DEV:
            splits["dev"].append(position)
        elif draw < (RATIO_DEV + RATIO_TEST):
            splits["test"].append(position)
        else:
            splits["train"].append(position)

    for split_name, positions in splits.items():
        print(f"\tassigned {len(positions)} samples to {split_name}")

    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    for split_name, positions in splits.items():
        # Positions are 0-based row offsets, hence iloc rather than loc.
        df.iloc[positions].to_csv(f"{out_dir}/annotations_{split_name}.csv")
def preprocess_all():
    """Load a corpus and walk over its rows; no processing is implemented yet.

    Returns:
        None.
    """
    # NOTE(review): this loads CORPUS_ANNOTATED rather than CORPUS_ALL --
    # confirm which corpus is actually intended here.
    corpus = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    for _index, _record in corpus.iterrows():
        # Placeholder: nothing is done with the row.
        continue
if __name__ == "__main__":
    # Script entry point. The annotated train/dev/test split is currently
    # switched off; uncomment the call below to run it as well.
    # preprocess_annotated()
    preprocess_all()