Commit 24704545 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Added code documentation

parent 7852d498
Loading
Loading
Loading
Loading
+12 −4
Original line number Diff line number Diff line
"""Creation of attention heat maps

This script loads a pretrained BERT model, extracts its used attention for the embedding of a sentence specified
via command line argument and then creates a heat map for the amount of attention each token in the sentence receives.
"""
import torch
from transformers import BertConfig, BertTokenizer, BertModel
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# load model and tokenizer
config = BertConfig.from_pretrained('../models/stsb-bert-large/0_BERT')
config.output_attentions = True
model = BertModel.from_pretrained('../models/stsb-bert-large/0_BERT', config=config).to('cpu')
tokenizer = BertTokenizer.from_pretrained('../models/stsb-bert-large/0_BERT')

text = '[CLS] US drone strike kills 10 in Pakistan'
# tokenize text given via command line argument
text = sys.argv[1]
tok = tokenizer.tokenize(text)
pos = 3
pos = 1  # change this to change which token you would like to see the attention for

# extract attention and change format for plotting
ids = torch.tensor(tokenizer.convert_tokens_to_ids(tok)).unsqueeze(0).to('cpu')
with torch.no_grad():
    output = model(ids)
@@ -23,8 +32,7 @@ seqlen = len(attentions)

# attention directed at the selected token position
# NOTE(review): `attentions` is built in lines hidden from this diff view --
# presumably reshaped from the model output above; confirm against the full file
attentions_pos = attentions[pos]

print(f'Attention weights for token {tok[pos]}')

# plot attention: average over dim 0 (presumably the attention heads -- TODO
# confirm) and render as a heat map with the tokens as x-axis labels
avg_attention = attentions_pos.mean(dim=0)
sns.heatmap(avg_attention, vmin=0, vmax=1, xticklabels=tok)

+26 −7
Original line number Diff line number Diff line
"""Test of effects of sentence length

This script measures the length of sentences in a data set, gives statistics about the length (average, standard
deviation, distribution) and most importantly creates three sub-datasets according to the length of a data point's
sentences. This is done to measure the model's performance on sentences of different lengths.
"""
from src.SBERT_Model import SBERT_Model
from STSB_Dataset import STSB_Dataset
import math
import pandas


def add_length_column(dataset) -> pandas.DataFrame:
    """Create a new column containing the length of each row's sentences.

    A 'total length' column is appended to the given dataframe. It consists of
    the combined word count of 'sentence1' and 'sentence2' of that row, where
    words are separated by single spaces.
    """
    sentence_1_lengths = dataset['sentence1'].apply(lambda sentence: len(sentence.split(" ")))
    sentence_2_lengths = dataset['sentence2'].apply(lambda sentence: len(sentence.split(" ")))
    # combined length of both sentences; the tail of this function was hidden
    # in the truncated diff view and is reconstructed from the docstring --
    # TODO(review): confirm against the full file
    dataset['total length'] = sentence_1_lengths + sentence_2_lengths
    return dataset
@@ -12,6 +24,11 @@ def add_length_column(dataset):


def get_length_distribution(dataset) -> dict:
    """Count how often each combined sentence length appears.

    For the given dataframe, first combines both 'sentence' columns, then
    measures the word count of the combined text for each row. Returns the
    counts of every found length as a dict mapping length -> occurrences.
    """
    dataset['sentences combined'] = dataset['sentence1'] + " " + dataset['sentence2']
    sentences = dataset['sentences combined'].to_numpy()
    lengths = dict()
    # tally the word count of every combined sentence; this loop was hidden in
    # the truncated diff view and is reconstructed from the docstring --
    # TODO(review): confirm against the full file
    for sentence in sentences:
        length = len(sentence.split(" "))
        lengths[length] = lengths.get(length, 0) + 1
    return lengths
@@ -24,27 +41,29 @@ def get_length_distribution(dataset) -> dict:


if __name__ == "__main__":
    complete_stsb_dataset = STSB_Dataset('../data/stsbenchmark/sts-test.csv').as_dataframe
    complete_stsb_dataset = STSB_Dataset('../data/stsbenchmark/sts-test.csv').as_dataframe  # adjust used dataset here
    dataset_with_lengths = add_length_column(complete_stsb_dataset)

    average_length = dataset_with_lengths['total length'].mean()
    standard_deviation = dataset_with_lengths['total length'].std()
    print(f"Average length: {average_length}, standard deviation: {standard_deviation}")
    average_length = math.ceil(average_length)
    print(f"Average length: {average_length}, standard deviation: {standard_deviation}")  # first print precise values
    average_length = math.ceil(average_length)  # then use next highest int to get usable values
    standard_deviation = math.ceil(standard_deviation)

    for length, occurrence in sorted(get_length_distribution(dataset_with_lengths).items()):
        print(f"Length: {length} | Occurrence: {occurrence}")

    short_sentences = dataset_with_lengths.loc[dataset_with_lengths['total length'] <= (average_length - standard_deviation)]
    # create data sets corresponding to sentence length
    short_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] <= (average_length - standard_deviation)]
    medium_length_sentences = dataset_with_lengths.loc[
        (dataset_with_lengths['total length'] > (average_length - standard_deviation)) &
        (dataset_with_lengths['total length'] < (average_length + standard_deviation))]
    long_sentences = dataset_with_lengths.loc[dataset_with_lengths['total length'] >= (average_length + standard_deviation)]
    long_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] >= (average_length + standard_deviation)]

    datasets = [short_sentences, medium_length_sentences, long_sentences]

    for d in datasets:
        # print("Entries in data set: {}".format(len(d)))
        ft_model = SBERT_Model("Fine-tuned Model", '../models/stsb-bert-large/', d)
        ft_model.print_statistics()