Commit fc9579b8 authored by schaper
parents 63850d7d 813b5502
.gitignore

+0 −1
models/*
miniconda3/*
.DS_Store
__pycache__/
sentence_transformers/*
README.md

+2 −2
# Semantic Textual Similarity

## ❓ Purpose
- The overarching goal of this project is to increase the understanding of masked language models (MLM) and bring forth new ideas about their way of functioning. We will specifically look at SBERT models, which are fine-tuned only on NLI. For more details, please see our outline `project_outline_sts.pdf`.
+ The overarching goal of this project is to increase the understanding of masked language models (MLM) and bring forth new ideas about their way of functioning. We will specifically look at SBERT models, which are fine-tuned only on NLI. For more details, please see our [project outline](https://gitlab.cl.uni-heidelberg.de/reichelt/semantic-textual-similarity/-/blob/master/project_outline_sts.pdf).

## 🔬 Results

- For our results, please see our project report `project_report_sts.pdf`.
+ For our results, please see our [project report](https://gitlab.cl.uni-heidelberg.de/reichelt/semantic-textual-similarity/-/blob/master/project_report_sts.pdf).

## 👥 Authors
Daniel Podrażka \

SBERT_Model.py

0 → 100644
+74 −0
from sentence_transformers import SentenceTransformer, util

from scipy.stats import spearmanr, pearsonr
from sklearn import metrics
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from random import randrange

class SBERT_Model:
    def __init__(self, name, filepath, dataset, probing=False):
        """
        param: dataset must be a pandas DataFrame with 'sentence1', 'sentence2' and 'label' columns
        """
        self.name = name
        self.filepath = filepath
        self.dataset = dataset
        self.model = SentenceTransformer(filepath)
        self.sentences1 = dataset['sentence1'].tolist()
        self.sentences2 = dataset['sentence2'].tolist()
    
        self.labels = dataset['label'].tolist()
        self.embeddings1 = self.get_embeddings(self.sentences1)
        self.embeddings2 = self.get_embeddings(self.sentences2)
        self.cosine_scores = self.get_cosine_scores()
        self.preds = self.get_preds()
    
    def visualize_embeddings(self):
        """Project all sentence embeddings to 2D with PCA and save a labelled scatter plot."""
        sentences = self.sentences1 + self.sentences2
        embs = self.get_embeddings(sentences)
        pca = PCA(n_components=2)
        X = pca.fit_transform(embs.cpu().numpy())  # PCA expects a numpy array, not a tensor
        plt.figure(figsize=(20, 10))
        plt.scatter(X[:, 0], X[:, 1])
        for i, (x, y) in enumerate(X):
            label = f"{i}\n({x:.2f},{y:.2f})"  # sentence index and its 2D coordinates
            plt.annotate(label,                       # this is the text
                         (x, y),                      # this is the point to label
                         textcoords="offset points",  # how to position the text
                         xytext=(0, 10),              # distance from text to points (x,y)
                         ha='center')                 # horizontal alignment: left, right or center
        plt.savefig(f'{randrange(0, 10000)}plot.png')  # random filename avoids overwriting earlier plots


    def get_embeddings(self, sentences):
        return self.model.encode(sentences, convert_to_tensor=True)

    def get_preds(self):
        """Extract the predicted similarity score for each sentence pair.

        cosine_scores is a full matrix of similarities between all sentences of the
        two lists, but we only need its diagonal, i.e. the score of each aligned pair.
        """
        preds = []
        for i in range(len(self.cosine_scores)):
            preds.append(self.cosine_scores[i][i])
        return preds

    def get_cosine_scores(self):
        return util.pytorch_cos_sim(self.embeddings1, self.embeddings2)

    def get_pearson(self):
        return pearsonr(self.labels, self.preds)[0]

    def get_spearman(self):
        return spearmanr(self.labels, self.preds)[0]

    def get_MSE(self):
        return metrics.mean_squared_error(self.labels, self.preds)
    
    def print_statistics(self):
        print(f"{self.name}: MSE:{self.get_MSE()}; Pearson:{self.get_pearson()}; Spearman:{self.get_spearman()}")

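For readers skimming the diff, a minimal usage sketch of the class above. The tiny DataFrame, the model name `stsb-bert-base` and the hub path are illustrative placeholders, not part of this commit, and the labels are assumed to already be scaled to [0, 1]:

```python
# Hypothetical example (placeholder data and model name): evaluate an SBERT
# checkpoint on a few sentence pairs and plot their embeddings.
import pandas as pd

df = pd.DataFrame({
    'sentence1': ['A man is playing a trumpet',
                  'A dog runs in the park',
                  'Two children are reading a book'],
    'sentence2': ['Someone plays an instrument',
                  'A cat sleeps on a couch',
                  'Kids are looking at a book'],
    'label': [0.8, 0.1, 0.9],  # gold similarity scores scaled to [0, 1]
})

sbert = SBERT_Model('stsb-bert-base', 'sentence-transformers/stsb-bert-base', df)
sbert.print_statistics()      # MSE, Pearson and Spearman against the gold labels
sbert.visualize_embeddings()  # saves a 2D PCA scatter plot of all six sentences
```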
STSB_Dataset.py

0 → 100644
+44 −0
from lit_nlp.api import dataset as lit_dataset
from lit_nlp.api import types as lit_types
import pandas as pd

class STSB_Dataset(lit_dataset.Dataset):
    
    """A LIT-specific wrapper for our STS-B dataset."""
    
    def __init__(self, path):

        with open(path, 'r', encoding='utf-8') as d:
            dataset = d.read().splitlines()

        self.data = []
        self.labels = []


        for line in dataset:
            # STS-B TSV layout: the 5th column holds the gold score (0-5),
            # the 6th and 7th columns hold the two sentences
            datapoint = line.split('\t')
            score = float(datapoint[4])
            score = score / 5  # normalise the gold score to [0, 1]

            st1 = datapoint[5]
            st2 = datapoint[6]
            self.data.append([st1, st2])
            self.labels.append(score)

            
        # Store as a list of dicts, conforming to self.spec()
        self._examples = [{
          'sentence1': dp[0],
          'sentence2': dp[1],
          'label': label,
        } for dp, label in zip(self.data, self.labels)]

        self.as_dataframe = pd.DataFrame.from_dict(self._examples)

    def spec(self):
        return {
          'sentence1': lit_types.TextSegment(),
          'sentence2': lit_types.TextSegment(),
          'label': lit_types.RegressionScore(),
        }

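A short, hypothetical sketch of how this wrapper might be combined with the SBERT_Model class from the file above; the TSV path and model identifier are placeholders, and the file is assumed to follow the STS benchmark layout that `__init__` parses (gold score in the fifth column, sentences in the sixth and seventh):

```python
# Hypothetical sketch: load an STS-B split with the LIT wrapper above and hand
# its DataFrame view to SBERT_Model (file path and model name are placeholders).
sts_dev = STSB_Dataset('stsbenchmark/sts-dev.csv')

print(len(sts_dev.as_dataframe))     # number of sentence pairs
print(sts_dev.as_dataframe.iloc[0])  # sentence1, sentence2 and the normalised label

# model = SBERT_Model('stsb-bert-base', 'sentence-transformers/stsb-bert-base',
#                     sts_dev.as_dataframe)
# model.print_statistics()
```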
probing.py

0 → 100644
+67 −0
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
logging.basicConfig(level=logging.INFO)

USE_GPU = 1
# Device configuration
device = torch.device('cuda' if (torch.cuda.is_available() and USE_GPU) else 'cpu')

# Load pre-trained model tokenizer (vocabulary)
pretrained_model = 'bert-base-cased'
# pretrained_model = 'models/stsb-bert-large/0_BERT/'
tokenizer = BertTokenizer.from_pretrained(pretrained_model)
text = "[CLS] A man is playing a trumpet [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
mask1 = 4
#mask2 = 14
#mask3 = 15
tokenized_text[mask1] = '[MASK]'
#tokenized_text[mask2] = '[MASK]'
#tokenized_text[mask3] = '[MASK]'
# tokenized_text is now ['[CLS]', 'A', 'man', 'is', '[MASK]', 'a', 'trumpet', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])


# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(pretrained_model)
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device)

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# get predicted tokens

#prediction for mask1
predicted_index = torch.argmax(predictions[0, mask1]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)  # most likely token for the masked position


#prediction for mask2 (only relevant when a second position is masked in a longer input)
#predicted_index = torch.argmax(predictions[0, mask2]).item()
#predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
#print(predicted_token)


#prediction for mask3 (only relevant when a third position is masked)
#predicted_index = torch.argmax(predictions[0, mask3]).item()
#predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
#print(predicted_token)
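
As a possible extension of the script above (not part of this commit), the same probe can be run over every content position of the sentence by masking one token at a time. The sketch reuses the `tokenizer`, `model` and `device` already defined and assumes the single-sentence input, so all segment ids stay 0:

```python
# Sketch: mask each non-special token in turn and print the model's top prediction.
original_tokens = tokenizer.tokenize("[CLS] A man is playing a trumpet [SEP]")

for pos in range(1, len(original_tokens) - 1):   # skip [CLS] and [SEP]
    masked = list(original_tokens)
    masked[pos] = '[MASK]'
    ids = torch.tensor([tokenizer.convert_tokens_to_ids(masked)]).to(device)
    segments = torch.tensor([[0] * len(masked)]).to(device)
    with torch.no_grad():
        scores = model(ids, token_type_ids=segments)[0]
    top_id = torch.argmax(scores[0, pos]).item()
    top_token = tokenizer.convert_ids_to_tokens([top_id])[0]
    print(f"{original_tokens[pos]:>10} -> {top_token}")
```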