Commit 737ca7b3 authored by vdberg's avatar vdberg
Browse files

added range option

parent 0b6318c5
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -56,7 +56,8 @@ parser = ArgumentParser()
# Command-line options. Every option string may be registered only once:
# argparse raises an ArgumentError ("conflicting option string") when the same
# flag is added twice.
parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
parser.add_argument("-a", "--arg", default=None, help="analyze args or not")
parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show")
# Lower and upper bounds of the verb-frequency range. "-mf" is kept as an alias
# of "-min" so existing invocations keep working; the destination is still
# args.min_freq because argparse derives it from the first "--" option string.
parser.add_argument("-min", "-mf", "--min_freq", type=int, default=100, help="min verb frequency to eliminate rarest verbs")
parser.add_argument("-max", "--max_freq", type=int, default=2000, help="max verb frequency to eliminate most frequent verbs")
parser.add_argument("-l", "--lemmatize", action="store_true", default=False, help="whether to (re)do lemmatization")
args = parser.parse_args()
MIN_FREQ = args.min_freq
@@ -77,9 +78,6 @@ output = output[output['name'] != 'Ryan']
#print('Arg-0 only {}:'.format(df.shape))

# Join the outputs with their sentences, then with the name labels.
print(output.sent_id.value_counts().head())
# Cast the join key to float on both frames before merging on it.
for _frame in (output, sent):
    _frame['sent_id'] = _frame['sent_id'].astype(float)
df = output.merge(sent, on=['sent_id'])
namlabs_df, namlabs = load_nameslab()
df = df.merge(namlabs_df, on=['name'])
@@ -117,6 +115,8 @@ if ARG:

# Compute how often each verb occurs over the whole frame.
overall_freq = df.verb.value_counts()
# value_counts() sorts in descending order, so these show the three most
# frequent and the three least frequent verbs.
print(overall_freq.head(3))
print(overall_freq.tail(3))
# Attach the overall count of each row's verb (positional, one value per row).
df["across"] = overall_freq.loc[df.verb].values
# Keep only verbs inside the requested frequency range: strictly more than
# --min_freq occurrences and at most --max_freq occurrences. The upper bound
# was parsed from the command line but never applied before.
# NOTE(review): the upper bound is treated as inclusive — confirm intent.
df = df[(df.across > MIN_FREQ) & (df.across <= args.max_freq)]
binned = pd.DataFrame()
@@ -130,6 +130,8 @@ for n, gr in df.groupby(['bias', 'name_bias']):
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    sorted_verbs.to_csv("frequent_verbs/{}{}.csv".format(n_edit, '_arg' if ARG else ''), sep="\t")
    binned[n] = sorted_verbs.index
    binned[n + 'across'] = sorted_verbs.across
    binned[n + 'across'] = sorted_verbs.across

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs{}.csv'.format('_arg' if ARG else ''))