import json
import os

from tqdm import tqdm
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Make sure the output directory exists before writing any shard.
os.makedirs('wikipedia', exist_ok=True)

# Each shard is a JSON list of rows; each row stores a sequence of GPT-2
# token ids under 'gpt2_token'. Decode every shard back to plain text,
# one line per row.
for i in tqdm(range(298)):
    with open(f'wikipedia_json_64_filtered/wikipedia.segmented.nltk.split.seq64.{i}.json', 'r') as f:
        rows = json.load(f)
    tokens = [row['gpt2_token'] for row in rows]
    texts = tokenizer.batch_decode(tokens)
    with open(f'wikipedia/{i}.txt', 'w') as f:
        for txt in texts:
            f.write(txt.strip() + '\n')
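
# Optional sanity check (a minimal sketch, assuming the same shard layout as
# above): decode a single row from the first shard and eyeball the result.
# The path and the 'gpt2_token' key come from the loop above; the slice is
# only for a quick look at the raw ids.
with open('wikipedia_json_64_filtered/wikipedia.segmented.nltk.split.seq64.0.json', 'r') as f:
    sample_rows = json.load(f)
print(sample_rows[0]['gpt2_token'][:10])               # first ten GPT-2 token ids
print(tokenizer.decode(sample_rows[0]['gpt2_token']))  # decoded text for one row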