Commit 2f288f16 authored by vdberg's avatar vdberg
Browse files

argument analysis added as a command-line option

parent 948dc84e
Loading
Loading
Loading
Loading
+7 −5
Original line number Diff line number Diff line
@@ -53,12 +53,13 @@ def crude_ner(sent):
# process command line arguments
parser = ArgumentParser()
parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
parser.add_argument("-a", "--arg", default='ARG0', help="focus on an arg")
parser.add_argument("-a", "--arg", default=None, help="analyze args or not")
parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show")
parser.add_argument("-mf", "--min_freq", type=int, default=5, help="min verb frequency to eliminate rarest verbs")
args = parser.parse_args()
MIN_FREQ = args.min_freq
N = args.n
ARG = args.arg

# load files
sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
@@ -104,7 +105,8 @@ else:
#df.verb = df.verb.apply(lemmatizer.lemmatize)

# arg
df = df[df.arg.str.endswith('0')]
if ARG:
    #df = df[df.arg.str.endswith(ARG)]
    df.verb += ' (' + df.arg.str[2:] + ')'

# compute overall counts
@@ -120,11 +122,11 @@ for n, gr in df.groupby(['bias', 'name_bias']):
    counts['normalized'] = counts.within / counts.across #gr.verb.value_counts(normalize=True)
    #counts.normalized = counts.normalized.apply(round)
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
    sorted_verbs.to_csv("frequent_verbs/{}{}.csv".format(n_edit, '_arg' if ARG else ''), sep="\t")
    binned[n] = sorted_verbs.index

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs.csv')
binned.to_csv('frequent_verbs/most_freq_verbs{}.csv'.format('_arg' if ARG else ''))

exit(0)