Commit c06dff79 authored by EstherMaria's avatar EstherMaria
Browse files

merge

parents 0fbadc66 3097b63b
Loading
Loading
Loading
Loading

srl.py

deleted100644 → 0
+0 −129
Original line number Diff line number Diff line
from allennlp.predictors.predictor import Predictor
import pandas as pd
import logging, re
import time
from argparse import ArgumentParser

def load_nameslab():
    """Load the labeled-names CSV and expand single-letter bias codes.

    Returns a tuple of (full DataFrame, {name: bias-label} dict built from
    the ``name_bias`` column).
    """
    frame = pd.read_csv('data/names_labeled.csv', index_col=0)
    bias_labels = {'l': 'From the Left', 'r': 'From the Right', 'c': 'From the Center'}
    frame = frame.replace(bias_labels)
    return frame, frame.name_bias.to_dict()


def load_flipper():
    """Load the Webis bias-flipper CSV, skipping malformed rows.

    NaN cells are filled with the literal string 'n.a.'. Prints a row/column
    summary and returns the DataFrame.
    """
    datafn = 'data/webis_bias_flipper.csv'
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
    # on_bad_lines='skip' is the direct replacement (silently drop bad rows).
    df = pd.read_csv(datafn, on_bad_lines='skip')
    df = df.fillna('n.a.')
    n = df.shape[0]
    cols = [i for i in df.columns]
    print('Loaded {0}/6472 rows w/ columns: {1}'.format(n, cols))
    return df

# Module-level lookup used throughout: the full labeled-names frame, plus a
# {name: bias-label} dict whose keys drive the crude_ner() name matcher.
namlabs_df, namlabs = load_nameslab()


def crude_ner(sent):
    """Return True if *sent* mentions any name from the module-level
    ``namlabs`` lookup (word-boundary regex match).
    """
    # re.escape guards against names containing regex metacharacters
    # (periods in initials, apostrophes, parentheses) which would otherwise
    # corrupt the alternation pattern or raise re.error.
    pattern = r'\b' + r'\b|\b'.join(re.escape(name) for name in namlabs) + r'\b'
    return bool(re.search(pattern, sent))


def ner(t):
    """Run the module-level AllenNLP NER predictor on sentence *t* and
    return its raw prediction dict.
    """
    return allen_ner.predict(sentence=t)


### load flipper

#flipper = load_flipper()
#flipper['text'] = flipper.original_title +'. '+ flipper.original_body

### load sent_df

def hardcoded_clean(names):
    """Strip known junk substrings (BOM, possessive fragments, stray
    numbering) from the ``w`` (word) column.

    Mutates *names* in place and also returns it.
    """
    # regex=False: these are literal fragments. Under the old pandas default
    # (regex=True), '2.' was a pattern matching '2' followed by ANY character,
    # silently mangling words — make the literal intent explicit.
    # The '' entry is a no-op kept from the original (likely a character lost
    # in transcription — TODO confirm against the original data).
    for junk in ('\ufeff', '', '‘s', '2.', '’a'):
        names.w = names.w.str.replace(junk, '', regex=False)
    return names

# CLI: the job is long-running, so it supports resuming from a row index.
parser = ArgumentParser()
parser.add_argument("-s", "--start_i", type=int, default=157584, help="which i to start from")
parser.add_argument("-pre", "--preprocess", action="store_true", default=False, help="whether to preprocess or not")
args = parser.parse_args()

# Keep only sentences that mention at least one labeled name.
sent = pd.read_csv('data/sent_df.csv')
sent = sent[sent.s.apply(crude_ner) == True]
#print(max(sent.index))
#vdberg_output_11315.csv.csv.csv.csv 64210
# Resume point: skip every row at or below args.start_i.
over = [i for i in sent.index if i > args.start_i] #99.000 #130562
start_i = over[0]
data = sent.loc[start_i:]
# Pretrained AllenNLP models: semantic role labeling + named entity recognition.
srl_pred = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
allen_ner = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

# NOTE(review): hard-coded True overrides the --preprocess CLI flag above —
# confirm this is intentional before relying on the flag.
preprocess = True #args.preprocess

# Per-sentence SRL + NER pass. For each sentence: tag every token with its
# NER label and one column per SRL verb, dump that frame, then (optionally)
# reduce it to (person, verb, argument-role) rows for labeled names.
times = []
verb_frames = []  # per-sentence result frames; concatenated once at the end
for i, r in data.iterrows():
    t0 = time.time()

    names = ner(r.s)
    print('Processing: {}, {}'.format(i, r.s))

    roles = srl_pred.predict(sentence=r.s)['verbs']
    #print('roles', roles)

    # Columns: token, NER tag, then one BIO-tag column per SRL verb.
    tagged = [names['words'], names['tags']] + [gr['tags'] for gr in roles]
    #print('tagged', tagged)
    verbs = [gr['verb'] for gr in roles]
    #print('verbs', verbs)

    tagged = pd.DataFrame(zip(*tagged), columns=['w', 'n'] + verbs)
    tagged.to_csv('outputs/srl_ner_tagged_{}.csv'.format(i))
    if preprocess:
        # Keep tokens NER-tagged as persons that also match the labeled-names
        # lexicon, then clean known junk from the word strings.
        persons = tagged[tagged.n.str.endswith('PER')]
        names = persons[persons.w.apply(crude_ner) == True].copy()
        #names.w = names.w.str.strip('—')
        names = hardcoded_clean(names)
        # Rows = person tokens, columns = verbs; stack into
        # (name, verb) -> argument-role triples and drop 'O' (no role).
        verbs = names.set_index(['w']).iloc[:, 1:]
        if not verbs.empty:
            verbs = verbs.copy().stack().reset_index(level=1, name='arg').rename(columns={'level_1': 'verb'})
            verbs = verbs.replace({'O': None}).dropna()
            # Does the mentioned person's bias label match the article's bias?
            verbs['same'] = namlabs_df.loc[verbs.index].name_bias.values == r.bias
            verbs['id'] = [i] * len(verbs)
            #verbs.arg = verbs.arg.str[2:]
            verb_frames.append(verbs)
            print('Found {} verbs'.format(len(verbs)))
            verbs.to_csv('outputs/vdberg_output_{}.csv'.format(i))

    t1 = time.time()
    total_n = t1 - t0
    times.append(total_n)
    # NOTE(review): this prints the running AVERAGE per sentence, not the
    # time of this iteration — message wording kept as-is.
    print('Took {} seconds'.format(sum(times) / len(times)))

# DataFrame.append was removed in pandas 2.0 (and was quadratic when called
# inside the loop); build the result once with concat instead. The leading
# empty frame pins the column order and handles the no-results case.
verb_df = pd.concat([pd.DataFrame(columns=['arg', 'verb', 'same', 'id'])] + verb_frames)
verb_df.to_csv('vdberg_output.csv')


#grs = verb_df.groupby('same')
#for n, gr in grs:
#    print(str(n).upper())
#    for c in gr:
#        print(gr[c].value_counts().head(10))
#        print()
#    print('---')