documentation (da33038d) · Commits · vdberg / Webis_Bias_Flipper_2018

Original line number	Diff line number	Diff line
		@@ -6,8 +6,8 @@ from argparse import ArgumentParser
		import spacy


		def load_output(dir='outputs', combine=False):
		outfile = 'data/srl_preprocessed_output.csv'
		def load_raw_output(dir='outputs', combine=False):
		outfile = 'outputs/srl_ner_tagged_preprocessed.csv'

		if combine:
		print('combining')
		@@ -51,7 +51,9 @@ def crude_ner(sent):
		m = re.search(nampat, sent)
		return bool(m)


		# process command line arguments

		parser = ArgumentParser()
		parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
		parser.add_argument("-a", "--arg", default=None, help="analyze args or not")
		@@ -60,84 +62,97 @@ parser.add_argument("-min", "--min_freq", type=int, default=100, help="min verb
		# Upper frequency bound: only verbs whose overall count is below this value survive the `across < MAX_FREQ` filter.
		parser.add_argument("-max", "--max_freq", type=int, default=2000, help="max verb frequency to eliminate most frequent verbs")
		# Flag that switches on the (re)lemmatization branch of the script.
		parser.add_argument("-l", "--lemmatize", action="store_true", default=False, help="whether to (re)do lemmatization")
		# Parse the command line once; the values are copied into module-level constants right after.
		args = parser.parse_args()

		MIN_FREQ = args.min_freq
		MAX_FREQ = args.max_freq
		N = args.n
		ARG = args.arg
		LEMMATIZE = args.lemmatize
		print(args)

		# load files
		sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])

		# create srl tagged df files

		# load inputs

		sent = pd.read_csv('data/sentence_splitted.csv', names=['sent_id','bias','doc_id','s'])
		sent = sent[sent.s.apply(crude_ner) == True]
		output = load_output(dir='outputs', combine=args.combine)

		print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))
		namlabs_df, namlabs = load_nameslab()

		# filter output
		output = load_raw_output(dir='outputs', combine=args.combine)
		output = output[output['name'] != 'Ryan']
		#print('Arg-0 only {}:'.format(df.shape))
		print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))

		# merge & clean-up

		# combine output
		df = pd.merge(output, sent, on=['sent_id'])
		namlabs_df, namlabs = load_nameslab()
		df = pd.merge(df, namlabs_df, on=['name'])

		df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
		df.verb = df.verb.str.lower()
		#df = df.drop(columns='same')
		print(df.shape)

		# display options
		pd.set_option('display.max_colwidth', 200)
		pd.set_option('display.max_columns', 10)
		pd.set_option('display.width', 1000)
		#print(df.columns)
		#print(df.head(n=5))

		# lemmatize

		def lemma(x):
		    """Return the lemma of the first token spaCy produces for ``x``.

		    Relies on the module-level ``nlp`` pipeline, which this script only
		    creates when lemmatization is requested (``LEMMATIZE``).
		    """
		    first_token = nlp(x)[0]
		    return first_token.lemma_

		if LEMMATIZE:
		nlp = spacy.load('en')
		df.verb = df.verb.apply(lemma) #['lemma']
		df.to_csv('frequent_verbs/lemmatized.csv')
		df.to_csv('data/lemmatized.csv')
		else:
		df = pd.read_csv('frequent_verbs/lemmatized.csv', index_col=0)
		df = pd.read_csv('data/lemmatized.csv', index_col=0)

		#nltk.download('wordnet')
		#lemmatizer = WordNetLemmatizer()
		#df.verb = df.verb.apply(lemmatizer.lemmatize)
		# focus on arg if wanted

		# arg
		if ARG:
		#df = df[df.arg.str.endswith(ARG)]
		df.verb += ' (' + df.arg.str[2:] + ')'

		# compute overall counts

		# compute overall counts and filter out rare words and stop words

		# Count how often each verb occurs over the whole frame.
		overall_freq = df.verb.value_counts()
		# Show the first three and last three entries of the frequency table.
		print(overall_freq.head(3), overall_freq.tail(3), sep='\n')
		# Attach each row's overall verb count as the "across" column.
		df["across"] = overall_freq.loc[df.verb].values
		# Keep only rows whose overall count lies strictly between the two bounds.
		df = df[(df.across > MIN_FREQ) & (df.across < MAX_FREQ)]

		# group and count frequent verbs per bin

		binned = pd.DataFrame()
		for n, gr in df.groupby(['bias', 'name_bias']):
		n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])

		# make counts
		counts = pd.DataFrame(gr.verb.value_counts())
		counts.columns = ['within']
		counts["across"] = overall_freq.loc[counts.index].values

		# normalize & sort
		counts['normalized'] = counts.within / counts.across #gr.verb.value_counts(normalize=True)
		#counts.normalized = counts.normalized.apply(round)
		sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
		sorted_verbs.to_csv("frequent_verbs/{}{}.csv".format(n_edit, '_arg' if ARG else ''), sep="\t")
		#counts.normalized = counts.normalized.apply(round)

		# store
		sorted_verbs.to_csv("verb_lists/{}{}.csv".format(n_edit, '_arg' if ARG else ''), sep="\t")
		binned[n] = sorted_verbs.index

		# display

		pd.set_option('display.max_colwidth', 200)
		pd.set_option('display.max_columns', 10)
		pd.set_option('display.width', 1000)
		#print(df.columns)
		#print(df.head(n=5))

		print(binned.head(N))
		binned.to_csv('frequent_verbs/most_freq_verbs{}.csv'.format('_arg' if ARG else ''))

		exit(0)

		# for adding dominance

		vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)
		df['dom'] = df.lemma.apply(get_dom)

		@@ -152,3 +167,5 @@ for n, gr in grs:

		print(arg_agr.head(n=10))
		#print(dom_agr.head(n=10))

		# for adding super senses
		No newline at end of file

verb_lists/Center_coverage_of_Left_person.csv

0 → 100644

+51 −0

Original line number	Diff line number	Diff line
		within across normalized
		email 20 20 1.0
		signal 5 6 0.8333333333333334
		highlight 6 8 0.75
		gut 4 6 0.6666666666666666
		reveal 21 32 0.65625
		veto 5 10 0.5
		lie 4 8 0.5
		enter 3 6 0.5
		struggle 3 6 0.5
		pay 6 12 0.5
		inform 4 8 0.5
		delay 7 14 0.5
		know 27 55 0.4909090909090909
		kill 4 9 0.4444444444444444
		relate 4 9 0.4444444444444444
		defeat 4 9 0.4444444444444444
		roll 6 14 0.42857142857142855
		violate 3 7 0.42857142857142855
		think 7 17 0.4117647058823529
		hear 4 10 0.4
		negotiate 4 10 0.4
		investigate 8 20 0.4
		intend 3 8 0.375
		complain 4 11 0.36363636363636365
		cast 4 11 0.36363636363636365
		rig 4 11 0.36363636363636365
		appoint 13 36 0.3611111111111111
		describe 6 18 0.3333333333333333
		watch 7 21 0.3333333333333333
		create 5 15 0.3333333333333333
		launch 5 15 0.3333333333333333
		enact 2 6 0.3333333333333333
		erase 2 6 0.3333333333333333
		quote 2 6 0.3333333333333333
		rescind 5 15 0.3333333333333333
		direct 3 9 0.3333333333333333
		rename 2 6 0.3333333333333333
		sit 3 9 0.3333333333333333
		pursue 3 9 0.3333333333333333
		reverse 10 30 0.3333333333333333
		address 4 12 0.3333333333333333
		concern 2 6 0.3333333333333333
		examine 2 6 0.3333333333333333
		look 11 34 0.3235294117647059
		question 5 16 0.3125
		appear 12 39 0.3076923076923077
		face 4 13 0.3076923076923077
		believe 8 26 0.3076923076923077
		begin 7 23 0.30434782608695654
		learn 3 10 0.3