Fixed indentation (6d7ad93b) · Commits · born / newsroomTLS

scripts/AllKeywordDateTimeline.py

+8 −9

Original line number	Diff line number	Diff line
		@@ -5,8 +5,7 @@ import codecs
		import sys
		import nltk
		import os
		import matplotlib.pyplot as plt
		from constantseveryEvent import KEYWORD_SEEDS, EVENTS_TO_RANGE
		from constants import KEYWORD_SEEDS, EVENTS_TO_RANGE

		path_to_newsroom_data = sys.argv[1]
		corpustype = path_to_newsroom_data.split(".data")[0]
		@@ -17,10 +16,10 @@ corpustype = corpustype.split("/")[-1]
		#keyword_seed = KEYWORD_SEEDS[event_name]

		ArticlePerTopic = {}
		for x in EVENTS_TO_RANGE:
		ArticlePerTopic[x] = 1
		IDfile = codecs.open(x + "/"+x + "SimpleIDs.txt", "w" , "utf-8")
		IDfile.write("#"+topic+" #simple #" + str(EVENTS_TO_RANGE[topic][0]) +" - " + str(EVENTS_TO_RANGE[topic][1] + "\n")
		for topic in EVENTS_TO_RANGE:
		ArticlePerTopic[topic] = 1
		IDfile = codecs.open(topic + "SimpleIDs.txt", "w" , "utf-8")
		IDfile.write("#"+topic+" #simple #" + str(EVENTS_TO_RANGE[topic][0]) +" - " + str(EVENTS_TO_RANGE[topic][1]) + "\n")
		IDfile.close()
		with jsonl.open(path_to_newsroom_data, gzip = True) as train_file:
		#print(event_name + " Extracting for previous mentioned event could take some hours")
		@@ -54,8 +53,8 @@ with jsonl.open(path_to_newsroom_data, gzip = True) as train_file:
		day = entry["date"][6:8]
		date = datetime.date(int(year),int(month),int(day))
		if date > earliest and date < latest:
		output = codecs.open(event_source_name + "/"+event_source_name + "SimpleIDs.txt", "a" , "utf-8")
		output.write(str(entry[archive]) + " " + str(entry[title]) +"\n")
		output = codecs.open(event_source_name + "SimpleIDs.txt", "a" , "utf-8")
		output.write(str(entry["archive"]) + " " + str(entry["title"]) +"\n")
		ArticlePerTopic[event_source_name] = ArticlePerTopic[event_source_name] + 1
		print(ArticlePerTopic)