# Reconstructed from a collapsed diff view of the deleted file srl.py.
"""Label-aware semantic-role extraction.

Filters sentences (data/sent_df.csv) to those mentioning a politically
labeled name, runs AllenNLP NER + SRL over each sentence, and writes
per-sentence token/tag tables plus an aggregate verb/argument table to CSV.
"""
from argparse import ArgumentParser
import logging
import re
import time

import pandas as pd
from allennlp.predictors.predictor import Predictor


def load_nameslab():
    """Load data/names_labeled.csv.

    Returns (DataFrame, {name: bias label}); the one-letter bias codes are
    expanded to their display strings.
    """
    namfn = 'data/names_labeled.csv'
    df = pd.read_csv(namfn, index_col=0)
    df = df.replace({'l': 'From the Left',
                     'r': 'From the Right',
                     'c': 'From the Center'})
    return df, df.name_bias.to_dict()


def load_flipper():
    """Load the Webis bias-flipper corpus (currently unused below)."""
    datafn = 'data/webis_bias_flipper.csv'
    # NOTE(review): error_bad_lines is deprecated (removed in pandas 2.0);
    # migrate to on_bad_lines='skip' when pandas is upgraded.
    df = pd.read_csv(datafn, error_bad_lines=False)
    df = df.fillna('n.a.')
    n = df.shape[0]
    cols = list(df.columns)
    print('Loaded {0}/6472 rows w/ columns: {1}'.format(n, cols))
    return df


namlabs_df, namlabs = load_nameslab()

# Compile the name-matching regex ONCE.  re.escape keeps names containing
# regex metacharacters (e.g. '.') from breaking or widening the pattern;
# the original rebuilt '\bNAME\b|...' from raw names on every call.
_name_re = re.compile('|'.join(r'\b{}\b'.format(re.escape(nm)) for nm in namlabs))


def crude_ner(sent):
    """Return True iff the sentence mentions any labeled name (word-bounded)."""
    return bool(_name_re.search(sent))


def ner(t):
    """Run the module-level AllenNLP NER predictor on one sentence."""
    return allen_ner.predict(sentence=t)


### load flipper
#flipper = load_flipper()
#flipper['text'] = flipper.original_title + '. ' + flipper.original_body

### load sent_df
def hardcoded_clean(names):
    """Strip known tokenization artifacts from the token column (in place)."""
    for junk in ('\ufeff', '—', '‘s', '’a'):
        names.w = names.w.str.replace(junk, '', regex=False)
    # regex=False: under the old regex default, '.' was a wildcard, so '2.'
    # also deleted '2' followed by any character (e.g. '20' -> '').
    names.w = names.w.str.replace('2.', '', regex=False)
    return names


parser = ArgumentParser()
parser.add_argument("-s", "--start_i", type=int, default=157584,
                    help="which i to start from")
parser.add_argument("-pre", "--preprocess", action="store_true", default=False,
                    help="whether to preprocess or not")
args = parser.parse_args()

sent = pd.read_csv('data/sent_df.csv')
# crude_ner returns a bool, so the mask can be used directly (no '== True').
sent = sent[sent.s.apply(crude_ner)]
#print(max(sent.index))

# Resume support: skip every row at or below the requested start index.
over = [i for i in sent.index if i > args.start_i]
start_i = over[0]
data = sent.loc[start_i:]

srl_pred = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
allen_ner = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

# NOTE(review): the CLI flag is deliberately overridden here.
preprocess = True  # args.preprocess

times = []
verb_df = pd.DataFrame(columns=['arg', 'verb', 'same', 'id'])
for i, r in data.iterrows():
    t0 = time.time()
    names = ner(r.s)
    print('Processing: {}, {}'.format(i, r.s))
    roles = srl_pred.predict(sentence=r.s)['verbs']

    # One row per token: word, NER tag, then one SRL tag column per verb.
    tagged = [names['words'], names['tags']] + [gr['tags'] for gr in roles]
    verbs = [gr['verb'] for gr in roles]
    tagged = pd.DataFrame(zip(*tagged), columns=['w', 'n'] + verbs)
    tagged.to_csv('outputs/srl_ner_tagged_{}.csv'.format(i))

    if preprocess:
        # Keep only PERSON tokens that match a labeled name.
        persons = tagged[tagged.n.str.endswith('PER')]
        names = persons[persons.w.apply(crude_ner)].copy()
        names = hardcoded_clean(names)
        verbs = names.set_index(['w']).iloc[:, 1:]
        if not verbs.empty:
            # Long format: one (name, verb, argument-tag) row per cell;
            # 'O' cells carry no role and are dropped.
            verbs = (verbs.copy().stack()
                     .reset_index(level=1, name='arg')
                     .rename(columns={'level_1': 'verb'}))
            verbs = verbs.replace({'O': None}).dropna()
            # True when the mentioned name's bias matches the sentence's bias.
            verbs['same'] = (namlabs_df.loc[verbs.index].name_bias.values
                             == r.bias)
            verbs['id'] = [i] * len(verbs)
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            verb_df = pd.concat([verb_df, verbs])
            print('Found {} verbs'.format(len(verbs)))
            verbs.to_csv('outputs/vdberg_output_{}.csv'.format(i))

    t1 = time.time()
    times.append(t1 - t0)
    # Prints the running MEAN seconds per sentence, not this iteration's time.
    print('Took {} seconds'.format(sum(times) / len(times)))
    # Checkpoint the aggregate table each iteration so a crash can resume.
    verb_df.to_csv('vdberg_output.csv')

#grs = verb_df.groupby('same')
#for n, gr in grs:
#    print(str(n).upper())
#    for c in gr:
#        print(gr[c].value_counts().head(10))
#        print()
#    print('---')
# Duplicate copy of the deleted srl.py, re-set from the collapsed diff view.
"""SRL/NER extraction over sentences mentioning politically labeled names."""
import logging
import re
import time
from argparse import ArgumentParser

import pandas as pd
from allennlp.predictors.predictor import Predictor


def load_nameslab():
    """Read the labeled-names CSV and expand bias codes to display labels."""
    frame = pd.read_csv('data/names_labeled.csv', index_col=0)
    frame = frame.replace({'l': 'From the Left', 'r': 'From the Right',
                           'c': 'From the Center'})
    return frame, frame.name_bias.to_dict()


def load_flipper():
    """Read the Webis bias-flipper corpus, padding missing cells with 'n.a.'."""
    frame = pd.read_csv('data/webis_bias_flipper.csv', error_bad_lines=False)
    frame = frame.fillna('n.a.')
    print('Loaded {0}/6472 rows w/ columns: {1}'.format(
        frame.shape[0], list(frame.columns)))
    return frame


namlabs_df, namlabs = load_nameslab()


def crude_ner(sent):
    """Cheap name spotter: word-boundary regex search over the labeled names."""
    nampat = '\\b' + '\\b|\\b'.join(namlabs) + '\\b'
    return bool(re.search(nampat, sent))


def ner(t):
    """Delegate one sentence to the module-level AllenNLP NER predictor."""
    return allen_ner.predict(sentence=t)


### load flipper
#flipper = load_flipper()
#flipper['text'] = flipper.original_title + '. ' + flipper.original_body

### load sent_df
def hardcoded_clean(names):
    """Remove the known junk substrings from the token column."""
    for pat in ('\ufeff', '—', '‘s', '2.', '’a'):
        names.w = names.w.str.replace(pat, '')
    return names


parser = ArgumentParser()
parser.add_argument("-s", "--start_i", type=int, default=157584,
                    help="which i to start from")
parser.add_argument("-pre", "--preprocess", action="store_true", default=False,
                    help="whether to preprocess or not")
args = parser.parse_args()

sent = pd.read_csv('data/sent_df.csv')
sent = sent[sent.s.apply(crude_ner) == True]

# Resume from just past the requested index.
over = [idx for idx in sent.index if idx > args.start_i]
data = sent.loc[over[0]:]

srl_pred = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
allen_ner = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

preprocess = True  # args.preprocess

times = []
verb_df = pd.DataFrame(columns=['arg', 'verb', 'same', 'id'])
for i, r in data.iterrows():
    started = time.time()
    ner_out = ner(r.s)
    print('Processing: {}, {}'.format(i, r.s))
    frames = srl_pred.predict(sentence=r.s)['verbs']

    # Token table: word, NER tag, then one SRL column per detected verb.
    columns = [ner_out['words'], ner_out['tags']] + [fr['tags'] for fr in frames]
    verb_names = [fr['verb'] for fr in frames]
    tagged = pd.DataFrame(zip(*columns), columns=['w', 'n'] + verb_names)
    tagged.to_csv('outputs/srl_ner_tagged_{}.csv'.format(i))

    if preprocess:
        persons = tagged[tagged.n.str.endswith('PER')]
        named = persons[persons.w.apply(crude_ner) == True].copy()
        named = hardcoded_clean(named)
        verbs = named.set_index(['w']).iloc[:, 1:]
        if not verbs.empty:
            # Reshape to one (name, verb, argument-tag) row per cell.
            verbs = (verbs.copy().stack()
                     .reset_index(level=1, name='arg')
                     .rename(columns={'level_1': 'verb'}))
            verbs = verbs.replace({'O': None}).dropna()
            verbs['same'] = namlabs_df.loc[verbs.index].name_bias.values == r.bias
            verbs['id'] = [i] * len(verbs)
            verb_df = verb_df.append(verbs)
            print('Found {} verbs'.format(len(verbs)))
            verbs.to_csv('outputs/vdberg_output_{}.csv'.format(i))

    times.append(time.time() - started)
    print('Took {} seconds'.format(sum(times) / len(times)))
    verb_df.to_csv('vdberg_output.csv')

#grs = verb_df.groupby('same')
#for n, gr in grs:
#    print(str(n).upper())
#    for c in gr:
#        print(gr[c].value_counts().head(10))
#        print()
#    print('---')