Commit 1498ff71 authored by vdberg

documented

parent 521ec400
+22 −32
@@ -6,7 +6,7 @@ from argparse import ArgumentParser


 def load_output(dir='outputs', combine=False):
-    outfile = 'data/srl_output.csv'
+    outfile = 'data/srl_preprocessed_output.csv'
 
     if combine:
         print('combining')
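The body of `load_output`'s combine branch is not shown in this diff. A minimal sketch of what it presumably does, given the `dir` parameter and the cached output path above: concatenate the per-document SRL CSVs and cache the result. The glob pattern, caching behaviour, and `read_csv` arguments are assumptions, not the repository's actual code.

```python
# Hypothetical reconstruction of load_output; glob pattern and caching
# behaviour are assumptions, not the repository's implementation.
import glob
import os

import pandas as pd


def load_output(dir='outputs', combine=False):
    outfile = 'data/srl_preprocessed_output.csv'
    if combine:
        print('combining')
        # concatenate the per-document SRL output files and cache them
        parts = [pd.read_csv(fn) for fn in sorted(glob.glob(os.path.join(dir, '*.csv')))]
        df = pd.concat(parts, ignore_index=True)
        df.to_csv(outfile)
    else:
        # reuse the cached, combined file
        df = pd.read_csv(outfile, index_col=0)
    return df
```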
@@ -34,6 +34,7 @@ def get_dom(v):
     except KeyError:
         return None
 
+
 def load_nameslab():
     namfn = 'data/names_labeled.csv'
     df = pd.read_csv(namfn, index_col=0)
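Only the `except` branch of `get_dom` is visible in this hunk. Since the script later loads `data/NRC-VAD-Lexicon.txt` and applies `get_dom` to lemmas, it presumably looks up a word's Dominance score, returning `None` for out-of-vocabulary entries. A sketch under that assumption (the `Dominance` column name follows the NRC-VAD file format):

```python
import pandas as pd

# NRC-VAD lexicon: tab-separated Word / Valence / Arousal / Dominance scores
vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)


def get_dom(v):
    # dominance score of a lemma, or None if it is not in the lexicon
    try:
        return vad.loc[v, 'Dominance']
    except KeyError:
        return None
```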
@@ -41,53 +42,48 @@ def load_nameslab():
     return df, df.name_bias.to_dict()
 
 
+# process command line arguments
 parser = ArgumentParser()
 parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
 parser.add_argument("-a", "--arg", default='', help="focus on an arg")
 parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show")
 args = parser.parse_args()
-df = load_output(dir='outputs', combine=args.combine)
 
-print('Output size {}:'.format(df.shape))
+# load files
+sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
+output = load_output(dir='outputs', combine=args.combine)
 
-df = df[df['name'] != 'Ryan']
+print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))
 
-df = df[df['arg'].str.startswith(args.arg)]
+# filter output
+output = output[output['name'] != 'Ryan']
+df = output[output['arg'].str.startswith(args.arg)]
 #print('Arg-0 only {}:'.format(df.shape))
 
-
-sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
+# combine output
 df = pd.merge(df, sent, on=['sent_id'])
 namlabs_df, namlabs = load_nameslab()
 df = pd.merge(df, namlabs_df, on=['name'])
+df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
+df.verb = df.verb.str.lower()
 #df = df.drop(columns='same')
 
+# show what data is like
 pd.set_option('display.max_colwidth', 200)
 pd.set_option('display.max_columns', 10)
 print(df.columns)
 print(df.shape)
 #print(df.head(n=5))
 
-# lemmatize
-#nltk.download('wordnet')
-#lemmatizer = WordNetLemmatizer()
-#df['lemma'] = df.verb.apply(lemmatizer.lemmatize)
-df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
-df.verb = df.verb.str.lower()
 
+# compute overall counts
 overall_counts = df.verb.value_counts()
 def test(v):
     return overall_counts[v] > 5
 
 df = df[df.verb.apply(test)]
 
+lemmatizer = WordNetLemmatizer()
+df.verb = df.verb.apply(lemmatizer.lemmatize)
 
 
 def cust(x):
     return ' (' + str(x) + ')'
 
-grs = pd.DataFrame()
+binned = pd.DataFrame()
 for n, gr in df.groupby(['bias', 'name_bias']):
     n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])
     counts = pd.DataFrame(gr.verb.value_counts())#, columns=["within"])
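The `normalized` column used in the next hunk is computed in lines elided between these two hunks. Judging from `overall_counts` above and the commented `columns=["within"]` hint, it is presumably the within-group verb count relative to the corpus-wide count, so verbs over-represented in one (bias, name_bias) cell rank highest. A self-contained sketch with toy data; the verbs and counts are illustrative, not project data:

```python
import pandas as pd

# toy stand-in for one (bias, name_bias) group and the corpus-wide counts
overall_counts = pd.Series({'say': 40, 'attack': 8, 'praise': 6})
gr_verbs = pd.Series(['say', 'say', 'attack', 'praise', 'attack', 'attack'])

counts = pd.DataFrame({'within': gr_verbs.value_counts()})
# share of a verb's corpus-wide occurrences that fall inside this group
counts['normalized'] = counts['within'] / overall_counts[counts.index]
sorted_verbs = counts.sort_values(by='normalized', ascending=False).head(100)
print(sorted_verbs)
```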
@@ -99,21 +95,16 @@ for n, gr in df.groupby(['bias', 'name_bias']):
     #counts.normalized = counts.normalized.apply(round)
     #print(counts)
     sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(args.n)
     #print(sorted_verbs)
     print(sorted_verbs.head(2))
     #exit(0)
     sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
-    grs[n] = sorted_verbs.index
+    binned[n] = sorted_verbs.index
 
-print(grs)
-grs.to_csv('frequent_verbs/most_freq_verbs.csv')
+print(binned.head())
+binned.to_csv('frequent_verbs/most_freq_verbs.csv')
 
+exit(0)
 
-nltk.download('wordnet')
-lemmatizer = WordNetLemmatizer()
-df['lemma'] = df.verb.apply(lemmatizer.lemmatize)
 df.arg = df.arg.str[2:]
 
 vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)
 df['dom'] = df.lemma.apply(get_dom)
 
@@ -121,11 +112,10 @@ grs = df.drop_duplicates().groupby('same')
 # symptoms of agency
 
 arg_agr = pd.DataFrame(columns=[''])
 dom_agr = pd.DataFrame(columns=[''])
 for n, gr in grs:
     print(n, gr.dom.mean())
     arg_agr[n] = gr['arg'].value_counts(normalize=True)
-    #dom_agr[n] = gr.nlargest(n=10, columns=['dom'])
+    dom_agr[n] = gr.nlargest(n=10, columns=['dom'])
 
 print(arg_agr.head(n=10))
 #print(dom_agr.head(n=10))
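This last hunk averages NRC-VAD dominance per `same` group and tallies argument roles as rough symptoms of agency. A self-contained sketch of that aggregation; the rows, roles, and dominance scores below are toy values, not project data:

```python
import pandas as pd

# toy frame: 'dom' is an NRC-VAD dominance score per verb lemma
df = pd.DataFrame({
    'same': [True, True, False, False],
    'arg': ['ARG0', 'ARG0', 'ARG1', 'ARG0'],
    'lemma': ['attack', 'lead', 'wait', 'suffer'],
    'dom': [0.86, 0.81, 0.35, 0.21],
})

arg_agr = pd.DataFrame()
for n, gr in df.drop_duplicates().groupby('same'):
    # mean dominance per group: higher values suggest more agentive framing
    print(n, gr.dom.mean())
    arg_agr[n] = gr['arg'].value_counts(normalize=True)

print(arg_agr.head(n=10))
```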