Commit 1498ff71 authored by vdberg

documented

parent 521ec400
+22 −32
@@ -6,7 +6,7 @@ from argparse import ArgumentParser


 def load_output(dir='outputs', combine=False):
-    outfile = 'data/srl_output.csv'
+    outfile = 'data/srl_preprocessed_output.csv'
 
     if combine:
         print('combining')
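The body of `load_output`'s combine branch is not shown in this diff. A minimal sketch of what it presumably does, given the `dir` parameter and the cached output path above: concatenate the per-document SRL CSVs and cache the result. The glob pattern, caching behaviour, and `read_csv` arguments are assumptions, not the repository's actual code.

```python
# Hypothetical reconstruction of load_output; glob pattern and caching
# behaviour are assumptions, not the repository's implementation.
import glob
import os

import pandas as pd


def load_output(dir='outputs', combine=False):
    outfile = 'data/srl_preprocessed_output.csv'
    if combine:
        print('combining')
        # concatenate the per-document SRL output files and cache them
        parts = [pd.read_csv(fn) for fn in sorted(glob.glob(os.path.join(dir, '*.csv')))]
        df = pd.concat(parts, ignore_index=True)
        df.to_csv(outfile)
    else:
        # reuse the cached, combined file
        df = pd.read_csv(outfile, index_col=0)
    return df
```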
@@ -34,6 +34,7 @@ def get_dom(v):
     except KeyError:
         return None
 
+
 def load_nameslab():
     namfn = 'data/names_labeled.csv'
     df = pd.read_csv(namfn, index_col=0)
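Only the `except` branch of `get_dom` is visible in this hunk. Since the script later loads `data/NRC-VAD-Lexicon.txt` and applies `get_dom` to lemmas, it presumably looks up a word's Dominance score, returning `None` for out-of-vocabulary entries. A sketch under that assumption (the `Dominance` column name follows the NRC-VAD file format):

```python
import pandas as pd

# NRC-VAD lexicon: tab-separated Word / Valence / Arousal / Dominance scores
vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)


def get_dom(v):
    # dominance score of a lemma, or None if it is not in the lexicon
    try:
        return vad.loc[v, 'Dominance']
    except KeyError:
        return None
```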
@@ -41,53 +42,48 @@ def load_nameslab():
     return df, df.name_bias.to_dict()
 
 
+# process command line arguments
 parser = ArgumentParser()
 parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
 parser.add_argument("-a", "--arg", default='', help="focus on an arg")
 parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show")
 args = parser.parse_args()
-df = load_output(dir='outputs', combine=args.combine)
 
-print('Output size {}:'.format(df.shape))
+# load files
+sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
+output = load_output(dir='outputs', combine=args.combine)
 
-df = df[df['name'] != 'Ryan']
+print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))
 
-df = df[df['arg'].str.startswith(args.arg)]
+# filter output
+output = output[output['name'] != 'Ryan']
+df = output[output['arg'].str.startswith(args.arg)]
 #print('Arg-0 only {}:'.format(df.shape))
 
-
-sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
+# combine output
 df = pd.merge(df, sent, on=['sent_id'])
 namlabs_df, namlabs = load_nameslab()
 df = pd.merge(df, namlabs_df, on=['name'])
+df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
+df.verb = df.verb.str.lower()
 #df = df.drop(columns='same')
 
+# show what data is like
 pd.set_option('display.max_colwidth', 200)
 pd.set_option('display.max_columns', 10)
 print(df.columns)
 print(df.shape)
 #print(df.head(n=5))
 
-# lemmatize
-#nltk.download('wordnet')
-#lemmatizer = WordNetLemmatizer()
-#df['lemma'] = df.verb.apply(lemmatizer.lemmatize)
-df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
-df.verb = df.verb.str.lower()
 
+# compute overall counts
 overall_counts = df.verb.value_counts()
 def test(v):
     return overall_counts[v] > 5
 
 df = df[df.verb.apply(test)]
 
+lemmatizer = WordNetLemmatizer()
+df.verb = df.verb.apply(lemmatizer.lemmatize)
 
 
 def cust(x):
     return ' (' + str(x) + ')'
 
-grs = pd.DataFrame()
+binned = pd.DataFrame()
 for n, gr in df.groupby(['bias', 'name_bias']):
     n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])
     counts = pd.DataFrame(gr.verb.value_counts())#, columns=["within"])
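The `normalized` column used in the next hunk is computed in lines elided between these two hunks. Judging from `overall_counts` above and the commented `columns=["within"]` hint, it is presumably the within-group verb count relative to the corpus-wide count, so verbs over-represented in one (bias, name_bias) cell rank highest. A self-contained sketch with toy data; the verbs and counts are illustrative, not project data:

```python
import pandas as pd

# toy stand-in for one (bias, name_bias) group and the corpus-wide counts
overall_counts = pd.Series({'say': 40, 'attack': 8, 'praise': 6})
gr_verbs = pd.Series(['say', 'say', 'attack', 'praise', 'attack', 'attack'])

counts = pd.DataFrame({'within': gr_verbs.value_counts()})
# share of a verb's corpus-wide occurrences that fall inside this group
counts['normalized'] = counts['within'] / overall_counts[counts.index]
sorted_verbs = counts.sort_values(by='normalized', ascending=False).head(100)
print(sorted_verbs)
```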
@@ -99,21 +95,16 @@ for n, gr in df.groupby(['bias', 'name_bias']):
     #counts.normalized = counts.normalized.apply(round)
     #print(counts)
     sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(args.n)
     #print(sorted_verbs)
     print(sorted_verbs.head(2))
     #exit(0)
     sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
-    grs[n] = sorted_verbs.index
+    binned[n] = sorted_verbs.index
 
-print(grs)
-grs.to_csv('frequent_verbs/most_freq_verbs.csv')
+print(binned.head())
+binned.to_csv('frequent_verbs/most_freq_verbs.csv')
 
+exit(0)
 
-nltk.download('wordnet')
-lemmatizer = WordNetLemmatizer()
-df['lemma'] = df.verb.apply(lemmatizer.lemmatize)
 df.arg = df.arg.str[2:]
 
 vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)
 df['dom'] = df.lemma.apply(get_dom)
 
@@ -121,11 +112,10 @@ grs = df.drop_duplicates().groupby('same')
 # symptoms of agency
 
 arg_agr = pd.DataFrame(columns=[''])
 dom_agr = pd.DataFrame(columns=[''])
 for n, gr in grs:
     print(n, gr.dom.mean())
     arg_agr[n] = gr['arg'].value_counts(normalize=True)
-    #dom_agr[n] = gr.nlargest(n=10, columns=['dom'])
+    dom_agr[n] = gr.nlargest(n=10, columns=['dom'])
 
 print(arg_agr.head(n=10))
 #print(dom_agr.head(n=10))
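This last hunk averages NRC-VAD dominance per `same` group and tallies argument roles as rough symptoms of agency. A self-contained sketch of that aggregation; the rows, roles, and dominance scores below are toy values, not project data:

```python
import pandas as pd

# toy frame: 'dom' is an NRC-VAD dominance score per verb lemma
df = pd.DataFrame({
    'same': [True, True, False, False],
    'arg': ['ARG0', 'ARG0', 'ARG1', 'ARG0'],
    'lemma': ['attack', 'lead', 'wait', 'suffer'],
    'dom': [0.86, 0.81, 0.35, 0.21],
})

arg_agr = pd.DataFrame()
for n, gr in df.drop_duplicates().groupby('same'):
    # mean dominance per group: higher values suggest more agentive framing
    print(n, gr.dom.mean())
    arg_agr[n] = gr['arg'].value_counts(normalize=True)

print(arg_agr.head(n=10))
```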