Commit 948dc84e authored by vdberg

Fixed lemmatization

parent 1498ff71
+46 −23
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import os, re
from argparse import ArgumentParser
import spacy


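# load_output: concatenates the output files under `dir` (skipping each file's
# header line) into one DataFrame; the hunk below shows only the changed part
# of its body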
def load_output(dir='outputs', combine=False):
@@ -17,7 +18,7 @@ def load_output(dir='outputs', combine=False):
                with open(os.path.join(p, i), encoding='utf-8') as i:
                    all_lines.extend(i.readlines()[1:])

        print('from {} files {} lines'.format(len(fs), len(all_lines)))

        with open(outfile, 'w', encoding='utf-8') as o_add:
            for line in all_lines:
@@ -41,66 +42,88 @@ def load_nameslab():
    df = df.replace({'l': 'Left', 'r': 'Right', 'c':'Center'})
    return df, df.name_bias.to_dict()

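# load the labelled names once at module level so crude_ner below can use them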
namlabs_df, namlabs = load_nameslab()

def crude_ner(sent):
    # crude NER: does the sentence mention any name from the labelled list?
    # re.escape guards against names containing regex metacharacters
    nampat = '\\b' + '\\b|\\b'.join(re.escape(i) for i in namlabs) + '\\b'
    m = re.search(nampat, sent)
    return bool(m)
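# usage sketch (hypothetical name, assuming it occurs in the labelled list):
#   crude_ner('Pelosi spoke on Monday.')  -> True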

# process command line arguments
parser = ArgumentParser()
parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
parser.add_argument("-a", "--arg", default='', help="focus on an arg")
parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show")
parser.add_argument("-a", "--arg", default='ARG0', help="focus on an arg")
parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show")
parser.add_argument("-mf", "--min_freq", type=int, default=5, help="min verb frequency to eliminate rarest verbs")
args = parser.parse_args()
MIN_FREQ = args.min_freq
N = args.n
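
# usage sketch (the script filename is an assumption):
#   python most_frequent_verbs.py --combine -a ARG0 -n 50 -mf 5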

# load files
sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
sent = sent[sent.s.apply(crude_ner)]  # keep only sentences that mention a labelled name
output = load_output(dir='outputs', combine=args.combine)

print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))

# filter output
output = output[output['name'] != 'Ryan']
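# (the name 'Ryan' is excluded explicitly, presumably because it is too ambiguous)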

# combine output
df = pd.merge(output, sent, on=['sent_id'])
df = pd.merge(df, namlabs_df, on=['name'])
df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
df.verb = df.verb.str.lower()
#df = df.drop(columns='same')
print(df.shape)

# display options
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
#print(df.columns)
#print(df.head(n=5))

# lemmatize verbs with spaCy, caching the result so repeat runs can skip the slow pass
def lemma(x):
    return nlp(x)[0].lemma_

lemmatize = False  # set to True on the first run to (re)build the cache
if lemmatize:
    nlp = spacy.load('en')  # old spaCy shorthand; newer versions use 'en_core_web_sm'
    df.verb = df.verb.apply(lemma)
    df.to_csv('frequent_verbs/lemmatized.csv')
else:
    df = pd.read_csv('frequent_verbs/lemmatized.csv', index_col=0)

# earlier NLTK-based lemmatization, kept for reference
#nltk.download('wordnet')
#lemmatizer = WordNetLemmatizer()
#df.verb = df.verb.apply(lemmatizer.lemmatize)

# keep only the *0 argument rows and tag each verb with the tail of its argument label
df = df[df.arg.str.endswith('0')]
df.verb += ' (' + df.arg.str[2:] + ')'

# compute overall counts
overall_freq = df.verb.value_counts()
df["across"] = overall_freq.loc[df.verb].values
df = df[df.across > MIN_FREQ]
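
# for every (article bias, person bias) pair, rank verbs by how concentrated their
# use is in that group: within-group count divided by corpus-wide count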
binned = pd.DataFrame()
for n, gr in df.groupby(['bias', 'name_bias']):
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1]) 
    counts = pd.DataFrame(gr.verb.value_counts())
    counts.columns = ['within']
    counts["across"] = overall_freq.loc[counts.index].values
    counts['normalized'] = counts.within / counts.across
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
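    # one tab-separated file per pair, e.g. frequent_verbs/Left_coverage_of_Right_person.csv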
    binned[n] = sorted_verbs.index

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs.csv')

exit(0)