Loading scripts/AllKeywordDateTimeline.py +8 −9 Original line number Diff line number Diff line Loading @@ -5,8 +5,7 @@ import codecs import sys import nltk import os import matplotlib.pyplot as plt from constantseveryEvent import KEYWORD_SEEDS, EVENTS_TO_RANGE from constants import KEYWORD_SEEDS, EVENTS_TO_RANGE path_to_newsroom_data = sys.argv[1] corpustype = path_to_newsroom_data.split(".data")[0] Loading @@ -17,10 +16,10 @@ corpustype = corpustype.split("/")[-1] #keyword_seed = KEYWORD_SEEDS[event_name] ArticlePerTopic = {} for x in EVENTS_TO_RANGE: ArticlePerTopic[x] = 1 IDfile = codecs.open(x + "/"+x + "SimpleIDs.txt", "w" , "utf-8") IDfile.write("#"+topic+" #simple #" + str(EVENTS_TO_RANGE[topic][0]) +" - " + str(EVENTS_TO_RANGE[topic][1] + "\n") for topic in EVENTS_TO_RANGE: ArticlePerTopic[topic] = 1 IDfile = codecs.open(topic + "SimpleIDs.txt", "w" , "utf-8") IDfile.write("#"+topic+" #simple #" + str(EVENTS_TO_RANGE[topic][0]) +" - " + str(EVENTS_TO_RANGE[topic][1]) + "\n") IDfile.close() with jsonl.open(path_to_newsroom_data, gzip = True) as train_file: #print(event_name + " Extracting for previous mentioned event could take some hours") Loading Loading @@ -54,8 +53,8 @@ with jsonl.open(path_to_newsroom_data, gzip = True) as train_file: day = entry["date"][6:8] date = datetime.date(int(year),int(month),int(day)) if date > earliest and date < latest: output = codecs.open(event_source_name + "/"+event_source_name + "SimpleIDs.txt", "a" , "utf-8") output.write(str(entry[archive]) + " " + str(entry[title]) +"\n") output = codecs.open(event_source_name + "SimpleIDs.txt", "a" , "utf-8") output.write(str(entry["archive"]) + " " + str(entry["title"]) +"\n") ArticlePerTopic[event_source_name] = ArticlePerTopic[event_source_name] + 1 print(ArticlePerTopic) Loading Loading
scripts/AllKeywordDateTimeline.py +8 −9 Original line number Diff line number Diff line Loading @@ -5,8 +5,7 @@ import codecs import sys import nltk import os import matplotlib.pyplot as plt from constantseveryEvent import KEYWORD_SEEDS, EVENTS_TO_RANGE from constants import KEYWORD_SEEDS, EVENTS_TO_RANGE path_to_newsroom_data = sys.argv[1] corpustype = path_to_newsroom_data.split(".data")[0] Loading @@ -17,10 +16,10 @@ corpustype = corpustype.split("/")[-1] #keyword_seed = KEYWORD_SEEDS[event_name] ArticlePerTopic = {} for x in EVENTS_TO_RANGE: ArticlePerTopic[x] = 1 IDfile = codecs.open(x + "/"+x + "SimpleIDs.txt", "w" , "utf-8") IDfile.write("#"+topic+" #simple #" + str(EVENTS_TO_RANGE[topic][0]) +" - " + str(EVENTS_TO_RANGE[topic][1] + "\n") for topic in EVENTS_TO_RANGE: ArticlePerTopic[topic] = 1 IDfile = codecs.open(topic + "SimpleIDs.txt", "w" , "utf-8") IDfile.write("#"+topic+" #simple #" + str(EVENTS_TO_RANGE[topic][0]) +" - " + str(EVENTS_TO_RANGE[topic][1]) + "\n") IDfile.close() with jsonl.open(path_to_newsroom_data, gzip = True) as train_file: #print(event_name + " Extracting for previous mentioned event could take some hours") Loading Loading @@ -54,8 +53,8 @@ with jsonl.open(path_to_newsroom_data, gzip = True) as train_file: day = entry["date"][6:8] date = datetime.date(int(year),int(month),int(day)) if date > earliest and date < latest: output = codecs.open(event_source_name + "/"+event_source_name + "SimpleIDs.txt", "a" , "utf-8") output.write(str(entry[archive]) + " " + str(entry[title]) +"\n") output = codecs.open(event_source_name + "SimpleIDs.txt", "a" , "utf-8") output.write(str(entry["archive"]) + " " + str(entry["title"]) +"\n") ArticlePerTopic[event_source_name] = ArticlePerTopic[event_source_name] + 1 print(ArticlePerTopic) Loading