Commit 0bbda317 authored by EstherMaria's avatar EstherMaria
Browse files

improve normalizing

parent 4fb2c8a3
Loading
Loading
Loading
Loading
+16 −10
Original line number Diff line number Diff line
@@ -85,21 +85,27 @@ df.verb = df.verb.apply(lemmatizer.lemmatize)


def cust(x):
    """Format a value as a parenthesized suffix, e.g. 1.234 -> ' (1.23)'.

    The value is rounded to two decimal places before stringification;
    used below to append count annotations to index labels.
    """
    # NOTE(review): the original span carried a second, unreachable
    # `return ' (' + str(x) + ')'` line (an old/new diff-line pair rendered
    # together); only the first return ever executed, so it is kept.
    return ' (' + str(round(x, 2)) + ')'

# NOTE(review): this span is a rendered diff — removed (old) and added (new)
# lines both appear with no +/- markers, so several statements below
# duplicate or override each other. Do not treat it as the final source.
# Collects, per (bias, name_bias) group, the most frequent verbs.
grs = pd.DataFrame()
# Group by the (bias, name_bias) pair; n is the key tuple, gr the sub-frame.
for n, gr in df.groupby(['bias', 'name_bias']):
    # Output basename built from the group key, e.g. "<bias>_coverage_of_<name_bias>_person".
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1]) 
    # Within-group verb frequencies.
    counts = gr.verb.value_counts()
    # Old-version line: normalize within-group counts by the overall counts
    # (overall_counts is defined elsewhere in the file — not visible here).
    counts = counts / overall_counts.loc[counts.index]
    # New-version line: keep the raw overall counts alongside instead.
    counts2 = overall_counts.loc[counts.index]
    # Old/new pair: append " (value)" (via cust) to each index label —
    # old annotated with the normalized ratio, new with the overall count.
    counts.index += counts.apply(cust)
    counts.index += counts2.apply(cust)
    # Old-version output: top args.n normalized verbs to "<n_edit>.csv".
    sorted_verbs = counts.sort_values(ascending=False).head(args.n)
    sorted_verbs.to_csv("{}.csv".format(n_edit))
    # New-version logic: rebuild counts as a DataFrame with within-group
    # counts ("verb" column), overall counts ("across"), and their ratio
    # ("normalized"); note this clobbers the `counts` computed above.
    counts = pd.DataFrame(gr.verb.value_counts())#, columns=["within"])
    #print(counts.columns)
    counts["across"] = overall_counts.loc[counts.index]
    #print(counts)
    counts["normalized"] = counts.verb / counts.across
    #print(counts)
    #counts.normalized = counts.normalized.apply(round)
    #print(counts)
    # Top args.n verbs by normalized frequency.
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(args.n)
    #print(sorted_verbs)
    #exit(0)
    # New-version output path: tab-separated, under frequent_verbs/
    # (directory presumably created elsewhere — TODO confirm).
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
    # Record the ranked verb labels for this group as a column of grs.
    grs[n] = sorted_verbs.index

# Show the per-group ranked verbs, then persist them.
print(grs)
# NOTE(review): old/new diff-line pair — the same frame is written to two
# paths; the later (frequent_verbs/) line is presumably the committed one.
grs.to_csv('most_freq_verbs.csv')
grs.to_csv('frequent_verbs/most_freq_verbs.csv')

# Terminate the script immediately after writing the CSVs.
exit(0)