| import random | |
| import datetime | |
| import pandas as pd | |
# Seed the global RNG once at import time so that the random main/dev
# assignment made in split_data() is the same on every run.
random.seed(1996)
# Probability with which a row is routed to the dev split (compared against
# random.random() for each row).
DEV_RATIO = 0.10
def choose_best_casing(orig, predicted):
    """Pick which casing of a title to keep.

    A title written mostly in capitals carries no usable casing information,
    so the truecased variant is used for it; otherwise the original casing is
    kept.

    Args:
        orig: Title as it appears in the source data.
        predicted: Truecased version of the same title.

    Returns:
        ``predicted`` when ``orig`` is empty or more than half of its
        characters are uppercase, otherwise ``orig``.
    """
    if not orig:
        # Nothing to judge the casing by; fall back to the truecased variant.
        return predicted
    # Count characters that really are uppercase in the *original* string.
    # Digits, whitespace and punctuation are not uppercase and so do not
    # count towards the threshold.
    num_upper = sum(1 for c in orig if c.isupper())
    if num_upper > 0.5 * len(orig):
        return predicted
    return orig
def _strip_provider_prefix(name, prefix="*T_"):
    """Return *name* with a single leading *prefix* removed, if present."""
    # str.lstrip() treats its argument as a set of characters and would also
    # eat leading letters of the real name (e.g. "*T_Tempo" -> "empo"), so the
    # prefix is sliced off explicitly instead.
    if name.startswith(prefix):
        return name[len(prefix):]
    return name


def _write_title_files(directory, idx, text_data):
    """Write the best/orig/truecase title variants of one text into *directory*."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{directory}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Randomly split the migration corpus into a main and a dev partition.

    Reads ``data/migration/corpus_all.csv`` (latin-1) together with the
    truecased titles file, whose lines are looked up by CSV row position.
    Each row is sent to the dev partition with probability ``DEV_RATIO`` and
    to the main partition otherwise.

    For every row, three text files (best, original and truecased title) are
    written into the partition's ``*_sep_txt_files`` directory. At the end,
    one events CSV and one texts-metadata CSV are written per partition under
    ``output/migration/split_data``. Missing output directories are created.

    Raises:
        IndexError: if the truecased titles file has fewer lines than the CSV
            has rows.
    """
    # Function-scope import so this block does not depend on a change to the
    # file-level import list.
    import os

    out_root = "output/migration/split_data"
    main_dir = f"{out_root}/split_main_sep_txt_files"
    dev_dir = f"{out_root}/split_dev10_sep_txt_files"
    for directory in (main_dir, dev_dir):
        os.makedirs(directory, exist_ok=True)

    events_main = []
    texts_main = []
    events_dev = []
    texts_dev = []

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")

    for idx, (_, row) in enumerate(df_all.iterrows()):
        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])
        title_orig = row["Titolo"]
        title_tc = titles_tc[idx]

        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(title_orig, title_tc),
            "title_truecased": title_tc,
            "title_orig": title_orig,
        }

        # Exactly one RNG draw per row, so the seeded split stays reproducible.
        if random.random() < DEV_RATIO:
            events, texts, directory = events_dev, texts_dev, dev_dir
        else:
            events, texts, directory = events_main, texts_main, main_dir

        events.append(event_data)
        texts.append(text_data)
        _write_title_files(directory, idx, text_data)

    pd.DataFrame(events_main).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts_main).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events_dev).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts_dev).to_csv(f"{out_root}/split_dev10.texts.meta.csv")
# Run the split only when the file is executed as a script, not on import.
if __name__ == "__main__":
    split_data()