Loading srl.py +32 −40 Original line number Diff line number Diff line from allennlp.predictors.predictor import Predictor import pandas as pd import logging, re import time import time, os from argparse import ArgumentParser def load_nameslab(): Loading @@ -22,7 +22,6 @@ def load_flipper(): namlabs_df, namlabs = load_nameslab() def crude_ner(sent): global namlabs nampat = '\\b' + '\\b|\\b'.join([i for i in namlabs]) + '\\b' Loading Loading @@ -53,60 +52,54 @@ def hardcoded_clean(names): parser = ArgumentParser() parser.add_argument("-s", "--start_i", type=int, default=157584, help="which i to start from") parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not") #parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not") args = parser.parse_args() sent = pd.read_csv('data/sent_df.csv') sent = sent[sent.s.apply(crude_ner) == True] #print(max(sent.index)) #vdberg_output_11315.csv.csv.csv.csv 64210 over = [i for i in sent.index if i > args.start_i] #99.000 #130562 start_i = over[0] data = sent.loc[start_i:] srl_pred = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz") allen_ner = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz") preprocess = True #args.preprocess times = [] verb_df = pd.DataFrame(columns=['arg', 'verb', 'same', 'id']) for i, r in data.iterrows(): t0 = time.time() names = ner(r.s) tag_out = 'outputs/srl_ner_tagged_{}.csv'.format(i) proc_out = 'outputs/vdberg_output_{}.csv'.format(i) if not os.path.isfile(tag_out): print('Processing: {}, {}'.format(i, r.s)) # perform ner and srl names = ner(r.s) roles = srl_pred.predict(sentence=r.s)['verbs'] #print('roles', roles) # extract info tagged = [names['words'], names['tags']] + [gr['tags'] for gr in roles] #print('tagged', tagged) verbs = [gr['verb'] for gr in roles] #print('verbs', verbs) tagged = pd.DataFrame(zip(*tagged), columns=['w', 'n'] + verbs) verbs.to_csv('outputs/srl_ner_tagged_{}.csv'.format(i)) if preprocess: tagged.to_csv(tag_out) if not os.path.isfile(proc_out): persons = tagged[tagged.n.str.endswith('PER')] names = persons[persons.w.apply(crude_ner) == True].copy() #names.w = names.w.str.strip('—') #print('names', names) names = hardcoded_clean(names) # convert to desired output format verbs = names.set_index(['w']).iloc[:,1:] if not verbs.empty: verbs = verbs.copy().stack().reset_index(level=1, name='arg').rename(columns={'level_1':'verb'}) #print('verbs', verbs) verbs = verbs.replace({'O':None}).dropna() #print('verbs', verbs) #names['nambias'] = namlabs_df.loc[verbs.index].bias.values verbs['same'] = namlabs_df.loc[verbs.index].name_bias.values == r.bias #print('verbs', verbs) verbs['id'] = [i] * len(verbs) #verbs.arg = verbs.arg.str[2:] verb_df = verb_df.append(verbs) print('Found {} verbs'.format(len(verbs))) verbs.to_csv('outputs/vdberg_output_{}.csv'.format(i)) verbs.to_csv(proc_out) verb_df = verb_df.append(verbs) t1 = time.time() total_n = t1-t0 Loading @@ -115,7 +108,6 @@ for i, r in data.iterrows(): verb_df.to_csv('vdberg_output.csv') #grs = verb_df.groupby('same') #for n, gr in grs: # print(str(n).upper()) Loading Loading
srl.py +32 −40 Original line number Diff line number Diff line from allennlp.predictors.predictor import Predictor import pandas as pd import logging, re import time import time, os from argparse import ArgumentParser def load_nameslab(): Loading @@ -22,7 +22,6 @@ def load_flipper(): namlabs_df, namlabs = load_nameslab() def crude_ner(sent): global namlabs nampat = '\\b' + '\\b|\\b'.join([i for i in namlabs]) + '\\b' Loading Loading @@ -53,60 +52,54 @@ def hardcoded_clean(names): parser = ArgumentParser() parser.add_argument("-s", "--start_i", type=int, default=157584, help="which i to start from") parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not") #parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not") args = parser.parse_args() sent = pd.read_csv('data/sent_df.csv') sent = sent[sent.s.apply(crude_ner) == True] #print(max(sent.index)) #vdberg_output_11315.csv.csv.csv.csv 64210 over = [i for i in sent.index if i > args.start_i] #99.000 #130562 start_i = over[0] data = sent.loc[start_i:] srl_pred = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz") allen_ner = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz") preprocess = True #args.preprocess times = [] verb_df = pd.DataFrame(columns=['arg', 'verb', 'same', 'id']) for i, r in data.iterrows(): t0 = time.time() names = ner(r.s) tag_out = 'outputs/srl_ner_tagged_{}.csv'.format(i) proc_out = 'outputs/vdberg_output_{}.csv'.format(i) if not os.path.isfile(tag_out): print('Processing: {}, {}'.format(i, r.s)) # perform ner and srl names = ner(r.s) roles = srl_pred.predict(sentence=r.s)['verbs'] #print('roles', roles) # extract info tagged = [names['words'], names['tags']] + [gr['tags'] for gr in roles] #print('tagged', tagged) verbs = [gr['verb'] for gr in roles] #print('verbs', verbs) tagged = pd.DataFrame(zip(*tagged), columns=['w', 'n'] + verbs) verbs.to_csv('outputs/srl_ner_tagged_{}.csv'.format(i)) if preprocess: tagged.to_csv(tag_out) if not os.path.isfile(proc_out): persons = tagged[tagged.n.str.endswith('PER')] names = persons[persons.w.apply(crude_ner) == True].copy() #names.w = names.w.str.strip('—') #print('names', names) names = hardcoded_clean(names) # convert to desired output format verbs = names.set_index(['w']).iloc[:,1:] if not verbs.empty: verbs = verbs.copy().stack().reset_index(level=1, name='arg').rename(columns={'level_1':'verb'}) #print('verbs', verbs) verbs = verbs.replace({'O':None}).dropna() #print('verbs', verbs) #names['nambias'] = namlabs_df.loc[verbs.index].bias.values verbs['same'] = namlabs_df.loc[verbs.index].name_bias.values == r.bias #print('verbs', verbs) verbs['id'] = [i] * len(verbs) #verbs.arg = verbs.arg.str[2:] verb_df = verb_df.append(verbs) print('Found {} verbs'.format(len(verbs))) verbs.to_csv('outputs/vdberg_output_{}.csv'.format(i)) verbs.to_csv(proc_out) verb_df = verb_df.append(verbs) t1 = time.time() total_n = t1-t0 Loading @@ -115,7 +108,6 @@ for i, r in data.iterrows(): verb_df.to_csv('vdberg_output.csv') #grs = verb_df.groupby('same') #for n, gr in grs: # print(str(n).upper()) Loading