Commit f96bf1e9 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Implement code for inspection of BPEmb

parent 358380d1
Loading
Loading
Loading
Loading

standard_eval_bpemb.py

0 → 100644
+43 −0
Original line number Diff line number Diff line
# Running standard evaluation on imported byte-pair embeddings to get baseline for reimplementation

# Importing byte-pair embeddings for German from https://github.com/bheinzerling/bpemb
import numpy as np
from itertools import chain
from bpemb import BPEmb
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Load 300-dimensional German byte-pair embeddings with a 200k subword vocabulary.
bpemb_de = BPEmb(lang="de", dim=300, vs=200000)
print(type(bpemb_de))

# Check presence of WEAT words in vocab:
# A/B are Swiss-majority vs. immigrant-associated first names,
# X/Y are pleasant vs. unpleasant attribute words.
A = ["Peter", "Daniel", "Hans", "Thomas", "Andreas", "Martin", "Markus", "Michael",
"Maria", "Anna", "Ursula", "Ruth", "Monika", "Elisabeth", "Verena", "Sandra"]
B = ["Ladina", "Fatima", "Fatma", "Alma", "Soraya", "Svetlana", "Elif", "Vesna",
"Mehmet", "Mustafa", "Aleksandar", "Mohamed", "Ibrahim", "Dragan", "Hasan", "Mohammad"]
X = ["Spaß", "Liebe", "Frieden", "wunderbar", "Freude", "Lachen", "Glück"]
Y = ["Qual", "furchtbar", "schrecklich", "übel", "böse", "Krieg", "scheußlich", "Versagen"]

WEAT_words = list(chain(A, B, X, Y))

# Report every WEAT word that is not a single token in the BPE vocabulary,
# together with the subword pieces it gets split into.
for word in WEAT_words:
    if len(bpemb_de.encode_ids(word)) != 1:
        print("Wort: ", word, ", subwords: ", bpemb_de.encode(word))


def get_word_embedding(word, model):
    """Return one embedding vector for *word* built from its subword embeddings.

    The model may represent *word* as a single subword or several; the
    per-subword vectors are summed along axis 0. (Sum vs. mean makes only
    minimal differences in similarities and none in correlation scores.)
    """
    subwords = model.embed(word)
    return np.asarray(subwords).sum(axis=0)


# Create SemEval subtask-1 (German monolingual) output: one cosine similarity
# per input word pair, one score per line.
# guideline: 62.6 (harmonic mean of Pearson & Spearman) -- my result: 56.270 (Pearson: 55.717, Spearman: 56.834)
with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/test/subtask1-monolingual/output/de.test.bpemb_og.output-mean.txt", mode="w") as output:
    with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/test/subtask1-monolingual/data/de.test.data.txt", mode="r") as infile:
        # Iterate lazily instead of readlines(); `infile` avoids shadowing the builtin `input`.
        for line in infile:
            # rstrip the newline first -- otherwise word2 would carry a
            # trailing "\n" into the subword encoder.
            word1, word2 = line.rstrip("\n").split("\t")
            embd1 = get_word_embedding(word1, bpemb_de)
            embd2 = get_word_embedding(word2, bpemb_de)
            # sklearn expects 2-D arrays, hence the reshape; [0][0] unpacks the 1x1 result matrix.
            sim = cosine(embd1.reshape(1, -1), embd2.reshape(1, -1))[0][0]
            output.write(str(sim)+"\n")