Commit f96bf1e9 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Implement code for inspection of BPEmb

parent 358380d1
Loading
Loading
Loading
Loading

standard_eval_bpemb.py

0 → 100644
+43 −0
Original line number Diff line number Diff line
# Running standard evaluation on imported byte-pair embeddings to get baseline for reimplementation

# Importing byte-pair embeddings for German from https://github.com/bheinzerling/bpemb
import numpy as np
from itertools import chain
from bpemb import BPEmb
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Load 300-dimensional German byte-pair embeddings with a 200k subword vocabulary.
bpemb_de = BPEmb(lang="de", dim=300, vs=200000)
print(type(bpemb_de))

# Check presence of WEAT words in vocab:
# A/B are Swiss-majority vs. immigrant-associated first names,
# X/Y are pleasant vs. unpleasant attribute words.
A = ["Peter", "Daniel", "Hans", "Thomas", "Andreas", "Martin", "Markus", "Michael",
"Maria", "Anna", "Ursula", "Ruth", "Monika", "Elisabeth", "Verena", "Sandra"]
B = ["Ladina", "Fatima", "Fatma", "Alma", "Soraya", "Svetlana", "Elif", "Vesna",
"Mehmet", "Mustafa", "Aleksandar", "Mohamed", "Ibrahim", "Dragan", "Hasan", "Mohammad"]
X = ["Spaß", "Liebe", "Frieden", "wunderbar", "Freude", "Lachen", "Glück"]
Y = ["Qual", "furchtbar", "schrecklich", "übel", "böse", "Krieg", "scheußlich", "Versagen"]

WEAT_words = list(chain(A, B, X, Y))

# Report every WEAT word that is not a single token in the BPE vocabulary,
# together with the subword pieces it gets split into.
for word in WEAT_words:
    if len(bpemb_de.encode_ids(word)) != 1:
        print("Wort: ", word, ", subwords: ", bpemb_de.encode(word))


def get_word_embedding(word, model):
    """Return one embedding vector for *word* built from its subword embeddings.

    The model may represent *word* as a single subword or several; the
    per-subword vectors are summed along axis 0. (Sum vs. mean makes only
    minimal differences in similarities and none in correlation scores.)
    """
    subwords = model.embed(word)
    return np.asarray(subwords).sum(axis=0)


# Create SemEval subtask-1 (German monolingual) output: one cosine similarity
# per input word pair, one score per line.
# guideline: 62.6 (harmonic mean of Pearson & Spearman) -- my result: 56.270 (Pearson: 55.717, Spearman: 56.834)
with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/test/subtask1-monolingual/output/de.test.bpemb_og.output-mean.txt", mode="w") as output:
    with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/test/subtask1-monolingual/data/de.test.data.txt", mode="r") as infile:
        # Iterate lazily instead of readlines(); `infile` avoids shadowing the builtin `input`.
        for line in infile:
            # rstrip the newline first -- otherwise word2 would carry a
            # trailing "\n" into the subword encoder.
            word1, word2 = line.rstrip("\n").split("\t")
            embd1 = get_word_embedding(word1, bpemb_de)
            embd2 = get_word_embedding(word2, bpemb_de)
            # sklearn expects 2-D arrays, hence the reshape; [0][0] unpacks the 1x1 result matrix.
            sim = cosine(embd1.reshape(1, -1), embd2.reshape(1, -1))[0][0]
            output.write(str(sim)+"\n")