"""Tag sentences with NER + SRL (AllenNLP) and extract (person, verb, role-tag)
rows for van-den-Berg-style framing analysis.

Reads data/sent_df.csv, keeps sentences that mention a labeled name, runs the
AllenNLP NER and SRL predictors on each, writes per-sentence CSVs under
outputs/, and finally an aggregate vdberg_output.csv.
"""
from allennlp.predictors.predictor import Predictor
import pandas as pd
import logging, re
import time, os
from argparse import ArgumentParser


def load_nameslab():
    """Load the labeled-names table; return (DataFrame, {name: bias} dict)."""
    namfn = 'data/names_labeled.csv'
    df = pd.read_csv(namfn, index_col=0)
    # Expand the single-letter bias codes to the display labels used downstream.
    df = df.replace({'l': 'From the Left', 'r': 'From the Right', 'c': 'From the Center'})
    return df, df.name_bias.to_dict()


def load_flipper():
    """Load the Webis bias-flipper corpus, skipping malformed rows."""
    datafn = 'data/webis_bias_flipper.csv'
    # NOTE(review): error_bad_lines was removed in pandas 2.0;
    # on_bad_lines='skip' is the modern equivalent.
    df = pd.read_csv(datafn, on_bad_lines='skip')
    df = df.fillna('n.a.')
    n = df.shape[0]
    cols = list(df.columns)
    print('Loaded {0}/6472 rows w/ columns: {1}'.format(n, cols))
    return df


namlabs_df, namlabs = load_nameslab()

# Compile the name pattern ONCE (the original rebuilt it on every call) and
# re.escape each name so regex metacharacters (e.g. '.') match literally.
_NAME_RE = re.compile(r'\b(?:' + '|'.join(re.escape(name) for name in namlabs) + r')\b')


def crude_ner(sent):
    """Return True iff `sent` contains any labeled name as a whole word."""
    return bool(_NAME_RE.search(sent))


def ner(t):
    """Run the AllenNLP NER predictor on sentence `t` (dict with words/tags)."""
    return allen_ner.predict(sentence=t)


### load flipper
#flipper = load_flipper()
#flipper['text'] = flipper.original_title + '. ' + flipper.original_body


### load sent_df
def hardcoded_clean(names):
    """Strip known junk substrings from the `w` (word) column, in place.

    regex=False is essential: these are literal strings — under the pandas
    regex default, '2.' would match '2' followed by ANY character.
    """
    for junk in ('\ufeff', '—', '‘s', '2.', '’a'):
        names.w = names.w.str.replace(junk, '', regex=False)
    return names


parser = ArgumentParser()
parser.add_argument("-s", "--start_i", type=int, default=157584, help="which i to start from")
#parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not")
args = parser.parse_args()

sent = pd.read_csv('data/sent_df.csv')
sent = sent[sent.s.apply(crude_ner)]

# Resume after the given index; fail with a clear message (instead of a bare
# IndexError) when nothing remains past --start_i.
over = [i for i in sent.index if i > args.start_i]
if not over:
    raise SystemExit('No sentences with index > {} - nothing to do.'.format(args.start_i))
data = sent.loc[over[0]:]

srl_pred = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
allen_ner = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

times = []
verb_frames = []  # per-sentence results; concatenated once at the end

for i, r in data.iterrows():
    t0 = time.time()
    tag_out = 'outputs/srl_ner_tagged_{}.csv'.format(i)
    proc_out = 'outputs/vdberg_output_{}.csv'.format(i)

    if not os.path.isfile(tag_out):
        print('Processing: {}, {}'.format(i, r.s))
        # Perform NER and SRL; one tag column per detected verb frame.
        names = ner(r.s)
        roles = srl_pred.predict(sentence=r.s)['verbs']
        tag_cols = [names['words'], names['tags']] + [fr['tags'] for fr in roles]
        verb_names = [fr['verb'] for fr in roles]
        tagged = pd.DataFrame(zip(*tag_cols), columns=['w', 'n'] + verb_names)
        tagged.to_csv(tag_out)
    else:
        # BUGFIX: originally `tagged` stayed undefined (NameError on the first
        # iteration) or stale from the previous sentence whenever tag_out
        # already existed; reload it from the cached CSV instead.
        tagged = pd.read_csv(tag_out, index_col=0)

    if not os.path.isfile(proc_out):
        persons = tagged[tagged.n.str.endswith('PER')]
        names = persons[persons.w.apply(crude_ner)].copy()
        names = hardcoded_clean(names)
        # Convert to long (name, verb, role-tag) format.
        verbs = names.set_index(['w']).iloc[:, 1:]
        if not verbs.empty:
            verbs = (verbs.copy()
                          .stack()
                          .reset_index(level=1, name='arg')
                          .rename(columns={'level_1': 'verb'}))
            verbs = verbs.replace({'O': None}).dropna()
            # Does the named person's bias label match the article's bias?
            verbs['same'] = namlabs_df.loc[verbs.index].name_bias.values == r.bias
            verbs['id'] = i
            print('Found {} verbs'.format(len(verbs)))
            verbs.to_csv(proc_out)
            verb_frames.append(verbs)

    times.append(time.time() - t0)
    print('Took {} seconds'.format(sum(times) / len(times)))

# BUGFIX: DataFrame.append in a loop is deprecated (removed in pandas 2.0)
# and quadratic; concatenate the collected frames once instead.
verb_df = (pd.concat(verb_frames)
           if verb_frames
           else pd.DataFrame(columns=['arg', 'verb', 'same', 'id']))
verb_df.to_csv('vdberg_output.csv')

#grs = verb_df.groupby('same')
#for n, gr in grs:
#    print(str(n).upper())
#    for c in gr:
#        print(gr[c].value_counts().head(10))
#        print()
#    print('---')
# NOTE(review): this span was a byte-identical second copy of the whole script
# (diff-viewer residue / accidental paste). It is reconstructed here the same
# way; confirm whether the duplication should simply be deleted.
"""Tag sentences with NER + SRL (AllenNLP) and extract (person, verb, role-tag)
rows for van-den-Berg-style framing analysis.

Reads data/sent_df.csv, keeps sentences that mention a labeled name, runs the
AllenNLP NER and SRL predictors on each, writes per-sentence CSVs under
outputs/, and finally an aggregate vdberg_output.csv.
"""
from allennlp.predictors.predictor import Predictor
import pandas as pd
import logging, re
import time, os
from argparse import ArgumentParser


def load_nameslab():
    """Load the labeled-names table; return (DataFrame, {name: bias} dict)."""
    namfn = 'data/names_labeled.csv'
    df = pd.read_csv(namfn, index_col=0)
    # Expand the single-letter bias codes to the display labels used downstream.
    df = df.replace({'l': 'From the Left', 'r': 'From the Right', 'c': 'From the Center'})
    return df, df.name_bias.to_dict()


def load_flipper():
    """Load the Webis bias-flipper corpus, skipping malformed rows."""
    datafn = 'data/webis_bias_flipper.csv'
    # NOTE(review): error_bad_lines was removed in pandas 2.0;
    # on_bad_lines='skip' is the modern equivalent.
    df = pd.read_csv(datafn, on_bad_lines='skip')
    df = df.fillna('n.a.')
    n = df.shape[0]
    cols = list(df.columns)
    print('Loaded {0}/6472 rows w/ columns: {1}'.format(n, cols))
    return df


namlabs_df, namlabs = load_nameslab()

# Compile the name pattern ONCE (the original rebuilt it on every call) and
# re.escape each name so regex metacharacters (e.g. '.') match literally.
_NAME_RE = re.compile(r'\b(?:' + '|'.join(re.escape(name) for name in namlabs) + r')\b')


def crude_ner(sent):
    """Return True iff `sent` contains any labeled name as a whole word."""
    return bool(_NAME_RE.search(sent))


def ner(t):
    """Run the AllenNLP NER predictor on sentence `t` (dict with words/tags)."""
    return allen_ner.predict(sentence=t)


### load flipper
#flipper = load_flipper()
#flipper['text'] = flipper.original_title + '. ' + flipper.original_body


### load sent_df
def hardcoded_clean(names):
    """Strip known junk substrings from the `w` (word) column, in place.

    regex=False is essential: these are literal strings — under the pandas
    regex default, '2.' would match '2' followed by ANY character.
    """
    for junk in ('\ufeff', '—', '‘s', '2.', '’a'):
        names.w = names.w.str.replace(junk, '', regex=False)
    return names


parser = ArgumentParser()
parser.add_argument("-s", "--start_i", type=int, default=157584, help="which i to start from")
#parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not")
args = parser.parse_args()

sent = pd.read_csv('data/sent_df.csv')
sent = sent[sent.s.apply(crude_ner)]

# Resume after the given index; fail with a clear message (instead of a bare
# IndexError) when nothing remains past --start_i.
over = [i for i in sent.index if i > args.start_i]
if not over:
    raise SystemExit('No sentences with index > {} - nothing to do.'.format(args.start_i))
data = sent.loc[over[0]:]

srl_pred = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
allen_ner = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

times = []
verb_frames = []  # per-sentence results; concatenated once at the end

for i, r in data.iterrows():
    t0 = time.time()
    tag_out = 'outputs/srl_ner_tagged_{}.csv'.format(i)
    proc_out = 'outputs/vdberg_output_{}.csv'.format(i)

    if not os.path.isfile(tag_out):
        print('Processing: {}, {}'.format(i, r.s))
        # Perform NER and SRL; one tag column per detected verb frame.
        names = ner(r.s)
        roles = srl_pred.predict(sentence=r.s)['verbs']
        tag_cols = [names['words'], names['tags']] + [fr['tags'] for fr in roles]
        verb_names = [fr['verb'] for fr in roles]
        tagged = pd.DataFrame(zip(*tag_cols), columns=['w', 'n'] + verb_names)
        tagged.to_csv(tag_out)
    else:
        # BUGFIX: originally `tagged` stayed undefined (NameError on the first
        # iteration) or stale from the previous sentence whenever tag_out
        # already existed; reload it from the cached CSV instead.
        tagged = pd.read_csv(tag_out, index_col=0)

    if not os.path.isfile(proc_out):
        persons = tagged[tagged.n.str.endswith('PER')]
        names = persons[persons.w.apply(crude_ner)].copy()
        names = hardcoded_clean(names)
        # Convert to long (name, verb, role-tag) format.
        verbs = names.set_index(['w']).iloc[:, 1:]
        if not verbs.empty:
            verbs = (verbs.copy()
                          .stack()
                          .reset_index(level=1, name='arg')
                          .rename(columns={'level_1': 'verb'}))
            verbs = verbs.replace({'O': None}).dropna()
            # Does the named person's bias label match the article's bias?
            verbs['same'] = namlabs_df.loc[verbs.index].name_bias.values == r.bias
            verbs['id'] = i
            print('Found {} verbs'.format(len(verbs)))
            verbs.to_csv(proc_out)
            verb_frames.append(verbs)

    times.append(time.time() - t0)
    print('Took {} seconds'.format(sum(times) / len(times)))

# BUGFIX: DataFrame.append in a loop is deprecated (removed in pandas 2.0)
# and quadratic; concatenate the collected frames once instead.
verb_df = (pd.concat(verb_frames)
           if verb_frames
           else pd.DataFrame(columns=['arg', 'verb', 'same', 'id']))
verb_df.to_csv('vdberg_output.csv')

#grs = verb_df.groupby('same')
#for n, gr in grs:
#    print(str(n).upper())
#    for c in gr:
#        print(gr[c].value_counts().head(10))
#        print()
#    print('---')