Commit 0bbda317 authored by EstherMaria's avatar EstherMaria
Browse files

improve normalizing

parent 4fb2c8a3
Loading
Loading
Loading
Loading
+16 −10
Original line number Diff line number Diff line
@@ -85,21 +85,27 @@ df.verb = df.verb.apply(lemmatizer.lemmatize)


def cust(x):
    """Format a value as a parenthesized suffix, e.g. 1.234 -> ' (1.23)'.

    The value is rounded to two decimal places before stringification;
    used below to append count annotations to index labels.
    """
    # NOTE(review): the original span carried a second, unreachable
    # `return ' (' + str(x) + ')'` line (an old/new diff-line pair rendered
    # together); only the first return ever executed, so it is kept.
    return ' (' + str(round(x, 2)) + ')'

# NOTE(review): this span is a rendered diff — removed (old) and added (new)
# lines both appear with no +/- markers, so several statements below
# duplicate or override each other. Do not treat it as the final source.
# Collects, per (bias, name_bias) group, the most frequent verbs.
grs = pd.DataFrame()
# Group by the (bias, name_bias) pair; n is the key tuple, gr the sub-frame.
for n, gr in df.groupby(['bias', 'name_bias']):
    # Output basename built from the group key, e.g. "<bias>_coverage_of_<name_bias>_person".
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1]) 
    # Within-group verb frequencies.
    counts = gr.verb.value_counts()
    # Old-version line: normalize within-group counts by the overall counts
    # (overall_counts is defined elsewhere in the file — not visible here).
    counts = counts / overall_counts.loc[counts.index]
    # New-version line: keep the raw overall counts alongside instead.
    counts2 = overall_counts.loc[counts.index]
    # Old/new pair: append " (value)" (via cust) to each index label —
    # old annotated with the normalized ratio, new with the overall count.
    counts.index += counts.apply(cust)
    counts.index += counts2.apply(cust)
    # Old-version output: top args.n normalized verbs to "<n_edit>.csv".
    sorted_verbs = counts.sort_values(ascending=False).head(args.n)
    sorted_verbs.to_csv("{}.csv".format(n_edit))
    # New-version logic: rebuild counts as a DataFrame with within-group
    # counts ("verb" column), overall counts ("across"), and their ratio
    # ("normalized"); note this clobbers the `counts` computed above.
    counts = pd.DataFrame(gr.verb.value_counts())#, columns=["within"])
    #print(counts.columns)
    counts["across"] = overall_counts.loc[counts.index]
    #print(counts)
    counts["normalized"] = counts.verb / counts.across
    #print(counts)
    #counts.normalized = counts.normalized.apply(round)
    #print(counts)
    # Top args.n verbs by normalized frequency.
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(args.n)
    #print(sorted_verbs)
    #exit(0)
    # New-version output path: tab-separated, under frequent_verbs/
    # (directory presumably created elsewhere — TODO confirm).
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
    # Record the ranked verb labels for this group as a column of grs.
    grs[n] = sorted_verbs.index

# Show the per-group ranked verbs, then persist them.
print(grs)
# NOTE(review): old/new diff-line pair — the same frame is written to two
# paths; the later (frequent_verbs/) line is presumably the committed one.
grs.to_csv('most_freq_verbs.csv')
grs.to_csv('frequent_verbs/most_freq_verbs.csv')

# Terminate the script immediately after writing the CSVs.
exit(0)