"""Baseline evaluation of pretrained German byte-pair embeddings.

Runs the SemEval-2017 Task 2 (subtask 1, monolingual German) word-similarity
evaluation on BPEmb embeddings (https://github.com/bheinzerling/bpemb) to get
a baseline score for a reimplementation, and checks which WEAT bias-test words
are split into multiple subwords by the BPE vocabulary.
"""

import numpy as np
from itertools import chain
from bpemb import BPEmb
from sklearn.metrics.pairwise import cosine_similarity as cosine

# WEAT word lists: A/B are target name sets (typically Swiss-German vs.
# migrant-associated first names), X/Y are pleasant/unpleasant attribute words.
A = ["Peter", "Daniel", "Hans", "Thomas", "Andreas", "Martin", "Markus",
     "Michael", "Maria", "Anna", "Ursula", "Ruth", "Monika", "Elisabeth",
     "Verena", "Sandra"]
B = ["Ladina", "Fatima", "Fatma", "Alma", "Soraya", "Svetlana", "Elif",
     "Vesna", "Mehmet", "Mustafa", "Aleksandar", "Mohamed", "Ibrahim",
     "Dragan", "Hasan", "Mohammad"]
X = ["Spaß", "Liebe", "Frieden", "wunderbar", "Freude", "Lachen", "Glück"]
Y = ["Qual", "furchtbar", "schrecklich", "übel", "böse", "Krieg",
     "scheußlich", "Versagen"]

# SemEval data/output locations (subtask 1, monolingual German test set).
SEMEVAL_DIR = ("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/"
               "SemEval17-Task2/test/subtask1-monolingual")
INPUT_PATH = SEMEVAL_DIR + "/data/de.test.data.txt"
OUTPUT_PATH = SEMEVAL_DIR + "/output/de.test.bpemb_og.output-mean.txt"


def get_word_embedding(word, model):
    """Calculate a word embedding by summing the subword embeddings of *word*.

    *model* is a BPEmb instance; ``model.embed`` returns an (n_subwords, dim)
    array, which may contain just one row for in-vocabulary words.
    Summing vs. averaging made only minimal differences in similarities and
    no difference in the correlation scores.
    """
    subword_embeddings = model.embed(word)  # may also be just one
    return np.sum(subword_embeddings, axis=0)


def check_weat_vocab_coverage(model, words):
    """Print every word that the BPE vocabulary splits into >1 subword."""
    for w in words:
        if len(model.encode_ids(w)) != 1:
            print("Wort: ", w, ", subwords: ", model.encode(w))


def write_semeval_output(model, input_path, output_path):
    """Score each SemEval word pair by cosine similarity and write one score
    per line to *output_path*.

    Guideline result: 62.6 (harmonic mean of Pearson & Spearman).
    This baseline: 56.270 (Pearson: 55.717, Spearman: 56.834).
    """
    # Explicit UTF-8: the data contains German umlauts, so relying on the
    # platform default encoding would be fragile.
    with open(output_path, mode="w", encoding="utf-8") as out_file, \
         open(input_path, mode="r", encoding="utf-8") as in_file:
        for line in in_file:
            # rstrip the newline BEFORE splitting — otherwise the second
            # word of every pair is embedded with a trailing "\n" attached.
            word1, word2 = line.rstrip("\n").split("\t")
            embd1 = get_word_embedding(word1, model)
            embd2 = get_word_embedding(word2, model)
            # reshape to 2-D as required by sklearn's cosine_similarity
            sim = cosine(embd1.reshape(1, -1), embd2.reshape(1, -1))[0][0]
            out_file.write(str(sim) + "\n")


def main():
    """Load the German BPEmb model and run both evaluations."""
    bpemb_de = BPEmb(lang="de", dim=300, vs=200000)
    print(type(bpemb_de))

    weat_words = list(chain(*[A, B, X, Y]))
    check_weat_vocab_coverage(bpemb_de, weat_words)

    write_semeval_output(bpemb_de, INPUT_PATH, OUTPUT_PATH)


if __name__ == "__main__":
    main()