"""Dump one identifier line per entry of a Newsroom data file.

Usage:
    python get_newsroom_train_ids.py PATH_TO_NEWSROOM_DATA [OUTPUT_PATH]

Every entry read from the (gzip-compressed) Newsroom JSONL file produces one
output line made of the entry's "archive" link and its "title", joined by a
tab, two pipe characters and another tab.
"""
import sys

# Default destination; relative to the current working directory.
DEFAULT_OUTPUT_PATH = "../data/newsroom_train.txt"

# Separator placed between the archive link and the title.
ID_SEPARATOR = "\t||\t"


def build_id_lines(entries):
    """Return the identifier string for each entry, in input order.

    Args:
        entries: iterable of mappings that provide the "archive" and
            "title" keys, both holding strings.

    Returns:
        A list of strings of the form archive + ID_SEPARATOR + title,
        without a trailing newline.

    Raises:
        KeyError: if an entry lacks "archive" or "title".
    """
    return [entry["archive"] + ID_SEPARATOR + entry["title"] for entry in entries]


def write_id_lines(lines, output_path):
    """Write every identifier on its own line to *output_path*.

    The file is encoded as UTF-8 regardless of the platform default so that
    titles containing non-ASCII characters are always writable.

    Args:
        lines: iterable of identifier strings (no trailing newline).
        output_path: path of the file to create or overwrite.
    """
    with open(output_path, "w", encoding="utf-8") as out:
        # Each record needs its own terminator; without it all identifiers
        # run together on a single line.
        out.writelines(line + "\n" for line in lines)


def main(argv=None):
    """Read the Newsroom file named on the command line and dump its ids.

    Args:
        argv: argument list without the program name; defaults to
            sys.argv[1:]. The first item is the Newsroom data path, the
            optional second item overrides DEFAULT_OUTPUT_PATH.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit(
            "usage: get_newsroom_train_ids.py PATH_TO_NEWSROOM_DATA [OUTPUT_PATH]"
        )

    # Imported here so the helpers above stay usable without the
    # third-party newsroom package installed.
    from newsroom import jsonl

    path_to_newsroom_data = args[0]
    output_path = args[1] if len(args) > 1 else DEFAULT_OUTPUT_PATH

    # Read everything before opening the output so that a failure while
    # reading never truncates an existing output file.
    with jsonl.open(path_to_newsroom_data, gzip=True) as file:
        lines_to_write = build_id_lines(file)

    write_id_lines(lines_to_write, output_path)


if __name__ == "__main__":
    main()