Commit 66634997 authored by born's avatar born
Browse files

Added script to get newsroom train ids

parent 969e78cb
Loading
Loading
Loading
Loading
+16 −0
Original line number Diff line number Diff line
import sys
from newsroom import jsonl

# Collect one identifier line per entry of a Newsroom data file and write
# them to ../data/newsroom_train.txt.
#
# Usage: python <this script> <path to gzipped Newsroom .jsonl file>
#
# Each output line has the form "<archive>\t||\t<title>", built from the
# "archive" and "title" fields of an entry.
# NOTE(review): a title that itself contains a newline would span several
# output lines -- confirm whether the data can contain such titles.

# First command-line argument: path to the Newsroom data file to read.
PATH_TO_NEWSROOM_DATA = sys.argv[1]

lines_to_write = []
with jsonl.open(PATH_TO_NEWSROOM_DATA, gzip=True) as file:
	for entry in file:
		link = entry["archive"]
		title = entry["title"]
		id_string = link + "\t||\t" + title
		lines_to_write.append(id_string)

# Explicit UTF-8 so that non-ASCII titles are written identically on every
# platform instead of depending on the locale's default encoding.
with open("../data/newsroom_train.txt", "w", encoding="utf-8") as f:
	# file.write()/writelines() add no line separators themselves; terminate
	# each id explicitly so the output holds exactly one id per line.
	f.writelines(line + "\n" for line in lines_to_write)