Commit 948dc84e authored by vdberg

Fixed lemmatization

parent 1498ff71
+46 −23
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import os, re
from argparse import ArgumentParser
import spacy


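# load_output: concatenates the output files under `dir` (skipping each file's
# header line) into one DataFrame; the hunk below shows only the changed part
# of its body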
def load_output(dir='outputs', combine=False):
@@ -17,7 +18,7 @@ def load_output(dir='outputs', combine=False):
                with open(os.path.join(p, i), encoding='utf-8') as i:
                    all_lines.extend(i.readlines()[1:])

        print('from {} files {} lines'.format(len(fs), len(all_lines)))

        with open(outfile, 'w', encoding='utf-8') as o_add:
            for line in all_lines:
@@ -41,66 +42,88 @@ def load_nameslab():
    df = df.replace({'l': 'Left', 'r': 'Right', 'c':'Center'})
    return df, df.name_bias.to_dict()

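# load the labelled names once at module level so crude_ner below can use them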
namlabs_df, namlabs = load_nameslab()

def crude_ner(sent):
    # crude NER: does the sentence mention any name from the labelled list?
    # re.escape guards against names containing regex metacharacters
    nampat = '\\b' + '\\b|\\b'.join(re.escape(i) for i in namlabs) + '\\b'
    m = re.search(nampat, sent)
    return bool(m)
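# usage sketch (hypothetical name, assuming it occurs in the labelled list):
#   crude_ner('Pelosi spoke on Monday.')  -> True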

# process command line arguments
parser = ArgumentParser()
parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
parser.add_argument("-a", "--arg", default='', help="focus on an arg")
parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show")
parser.add_argument("-a", "--arg", default='ARG0', help="focus on an arg")
parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show")
parser.add_argument("-mf", "--min_freq", type=int, default=5, help="min verb frequency to eliminate rarest verbs")
args = parser.parse_args()
MIN_FREQ = args.min_freq
N = args.n
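
# usage sketch (the script filename is an assumption):
#   python most_frequent_verbs.py --combine -a ARG0 -n 50 -mf 5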

# load files
sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
sent = sent[sent.s.apply(crude_ner)]  # keep only sentences that mention a labelled name
output = load_output(dir='outputs', combine=args.combine)

print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))

# filter output
output = output[output['name'] != 'Ryan']
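# (the name 'Ryan' is excluded explicitly, presumably because it is too ambiguous)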

# combine output
df = pd.merge(output, sent, on=['sent_id'])
df = pd.merge(df, namlabs_df, on=['name'])
df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
df.verb = df.verb.str.lower()
#df = df.drop(columns='same')
print(df.shape)

# display options
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
#print(df.columns)
#print(df.head(n=5))

# lemmatize verbs with spaCy, caching the result so repeat runs can skip the slow pass
def lemma(x):
    return nlp(x)[0].lemma_

lemmatize = False  # set to True on the first run to (re)build the cache
if lemmatize:
    nlp = spacy.load('en')  # old spaCy shorthand; newer versions use 'en_core_web_sm'
    df.verb = df.verb.apply(lemma)
    df.to_csv('frequent_verbs/lemmatized.csv')
else:
    df = pd.read_csv('frequent_verbs/lemmatized.csv', index_col=0)

# earlier NLTK-based lemmatization, kept for reference
#nltk.download('wordnet')
#lemmatizer = WordNetLemmatizer()
#df.verb = df.verb.apply(lemmatizer.lemmatize)

# keep only the *0 argument rows and tag each verb with the tail of its argument label
df = df[df.arg.str.endswith('0')]
df.verb += ' (' + df.arg.str[2:] + ')'

# compute overall counts
overall_freq = df.verb.value_counts()
df["across"] = overall_freq.loc[df.verb].values
df = df[df.across > MIN_FREQ]
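
# for every (article bias, person bias) pair, rank verbs by how concentrated their
# use is in that group: within-group count divided by corpus-wide count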
binned = pd.DataFrame()
for n, gr in df.groupby(['bias', 'name_bias']):
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1]) 
    counts = pd.DataFrame(gr.verb.value_counts())
    counts.columns = ['within']
    counts["across"] = overall_freq.loc[counts.index].values
    counts['normalized'] = counts.within / counts.across
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
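    # one tab-separated file per pair, e.g. frequent_verbs/Left_coverage_of_Right_person.csv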
    binned[n] = sorted_verbs.index

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs.csv')

exit(0)