# --- src/attention_visualization.py (reconstructed post-diff version) ---
"""Creation of attention heat maps

This script loads a pretrained BERT model, extracts its used attention for the
embedding of a sentence specified via command line argument and then creates a
heat map for the amount of attention each token in the sentence receives.
"""
import torch
from transformers import BertConfig, BertTokenizer, BertModel
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# load model and tokenizer; output_attentions makes the model return per-layer
# attention tensors alongside the embeddings
config = BertConfig.from_pretrained('../models/stsb-bert-large/0_BERT')
config.output_attentions = True
model = BertModel.from_pretrained('../models/stsb-bert-large/0_BERT', config=config).to('cpu')
tokenizer = BertTokenizer.from_pretrained('../models/stsb-bert-large/0_BERT')

# tokenize text given via command line argument
text = sys.argv[1]
tok = tokenizer.tokenize(text)
pos = 1  # change this to change which token you would like to see the attention for

# extract attention and change format for plotting
ids = torch.tensor(tokenizer.convert_tokens_to_ids(tok)).unsqueeze(0).to('cpu')
with torch.no_grad():
    output = model(ids)

# NOTE(review): a diff chunk (@@ -23,8 +32,7 @@) is elided in the source view
# here — the original lines deriving `attentions` from `output` are not
# visible and have NOT been reconstructed; restore them from the repository.

seqlen = len(attentions)  # NOTE(review): unused in the visible code — confirm before removing
attentions_pos = attentions[pos]
print(f'Attention weights for token {tok[pos]}')

# plot attention averaged over attention heads (mean over dim 0)
avg_attention = attentions_pos.mean(dim=0)
sns.heatmap(avg_attention, vmin=0, vmax=1, xticklabels=tok)


# --- src/sentence_length.py (reconstructed post-diff version) ---
"""Test of effects of sentence length

This script measures the length of sentences in a data set, gives statistics
about the length (average, standard deviation, distribution) and most
importantly creates three sub-datasets according to the length of a data
point's sentences. This is done to measure the model's performance on
sentences of different lengths.
"""
from src.SBERT_Model import SBERT_Model
from STSB_Dataset import STSB_Dataset
import math
import pandas


def add_length_column(dataset) -> pandas.DataFrame:
    """Creates new column containing length of sentences

    A new column is appended to the given dataframe. It consists of the length
    of both sentences of this row combined.
    """
    # word counts per sentence, obtained by splitting on single spaces
    sentence_1_lengths = dataset['sentence1'].apply(lambda sentence: len(sentence.split(" ")))
    sentence_2_lengths = dataset['sentence2'].apply(lambda sentence: len(sentence.split(" ")))
    dataset['total length'] = sentence_1_lengths
    # NOTE(review): a diff chunk (@@ -12,6 +24,11 @@) is elided in the source
    # view here — presumably `sentence_2_lengths` is added in and the dataframe
    # returned; restore the missing lines from the repository.


def get_length_distribution(dataset) -> dict:
    """Counts how often each sentence length appears

    For the given dataframe, first combines both 'sentence' columns, then
    counts length of this column for each row. Returns counts of every found
    length as a dict.
    """
    dataset['sentences combined'] = dataset['sentence1'] + " " + dataset['sentence2']
    sentences = dataset['sentences combined'].to_numpy()
    lengths = dict()
    # NOTE(review): a diff chunk (@@ -24,27 +41,29 @@) is elided in the source
    # view here — the loop filling `lengths` is not visible; restore it from
    # the repository.


if __name__ == "__main__":
    complete_stsb_dataset = STSB_Dataset('../data/stsbenchmark/sts-test.csv').as_dataframe  # adjust used dataset here
    dataset_with_lengths = add_length_column(complete_stsb_dataset)
    average_length = dataset_with_lengths['total length'].mean()
    standard_deviation = dataset_with_lengths['total length'].std()
    print(f"Average length: {average_length}, standard deviation: {standard_deviation}")  # first print precise values
    average_length = math.ceil(average_length)  # then use next highest int to get usable values
    standard_deviation = math.ceil(standard_deviation)
    for length, occurrence in sorted(get_length_distribution(dataset_with_lengths).items()):
        print(f"Length: {length} | Occurrence: {occurrence}")
    # create data sets corresponding to sentence length: short/long are at
    # least one (ceiled) standard deviation below/above the (ceiled) mean
    short_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] <= (average_length - standard_deviation)]
    medium_length_sentences = dataset_with_lengths.loc[
        (dataset_with_lengths['total length'] > (average_length - standard_deviation)) &
        (dataset_with_lengths['total length'] < (average_length + standard_deviation))]
    long_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] >= (average_length + standard_deviation)]
    datasets = [short_sentences, medium_length_sentences, long_sentences]
    for d in datasets:
        # print("Entries in data set: {}".format(len(d)))
        ft_model = SBERT_Model("Fine-tuned Model", '../models/stsb-bert-large/', d)
        ft_model.print_statistics()
# --- src/attention_visualization.py (reconstructed post-diff version) ---
"""Creation of attention heat maps

This script loads a pretrained BERT model, extracts its used attention for the
embedding of a sentence specified via command line argument and then creates a
heat map for the amount of attention each token in the sentence receives.
"""
import torch
from transformers import BertConfig, BertTokenizer, BertModel
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# load model and tokenizer; output_attentions makes the model return per-layer
# attention tensors alongside the embeddings
config = BertConfig.from_pretrained('../models/stsb-bert-large/0_BERT')
config.output_attentions = True
model = BertModel.from_pretrained('../models/stsb-bert-large/0_BERT', config=config).to('cpu')
tokenizer = BertTokenizer.from_pretrained('../models/stsb-bert-large/0_BERT')

# tokenize text given via command line argument
text = sys.argv[1]
tok = tokenizer.tokenize(text)
pos = 1  # change this to change which token you would like to see the attention for

# extract attention and change format for plotting
ids = torch.tensor(tokenizer.convert_tokens_to_ids(tok)).unsqueeze(0).to('cpu')
with torch.no_grad():
    output = model(ids)

# NOTE(review): a diff chunk (@@ -23,8 +32,7 @@) is elided in the source view
# here — the original lines deriving `attentions` from `output` are not
# visible and have NOT been reconstructed; restore them from the repository.

seqlen = len(attentions)  # NOTE(review): unused in the visible code — confirm before removing
attentions_pos = attentions[pos]
print(f'Attention weights for token {tok[pos]}')

# plot attention averaged over attention heads (mean over dim 0)
avg_attention = attentions_pos.mean(dim=0)
sns.heatmap(avg_attention, vmin=0, vmax=1, xticklabels=tok)


# --- src/sentence_length.py (reconstructed post-diff version) ---
"""Test of effects of sentence length

This script measures the length of sentences in a data set, gives statistics
about the length (average, standard deviation, distribution) and most
importantly creates three sub-datasets according to the length of a data
point's sentences. This is done to measure the model's performance on
sentences of different lengths.
"""
from src.SBERT_Model import SBERT_Model
from STSB_Dataset import STSB_Dataset
import math
import pandas


def add_length_column(dataset) -> pandas.DataFrame:
    """Creates new column containing length of sentences

    A new column is appended to the given dataframe. It consists of the length
    of both sentences of this row combined.
    """
    # word counts per sentence, obtained by splitting on single spaces
    sentence_1_lengths = dataset['sentence1'].apply(lambda sentence: len(sentence.split(" ")))
    sentence_2_lengths = dataset['sentence2'].apply(lambda sentence: len(sentence.split(" ")))
    dataset['total length'] = sentence_1_lengths
    # NOTE(review): a diff chunk (@@ -12,6 +24,11 @@) is elided in the source
    # view here — presumably `sentence_2_lengths` is added in and the dataframe
    # returned; restore the missing lines from the repository.


def get_length_distribution(dataset) -> dict:
    """Counts how often each sentence length appears

    For the given dataframe, first combines both 'sentence' columns, then
    counts length of this column for each row. Returns counts of every found
    length as a dict.
    """
    dataset['sentences combined'] = dataset['sentence1'] + " " + dataset['sentence2']
    sentences = dataset['sentences combined'].to_numpy()
    lengths = dict()
    # NOTE(review): a diff chunk (@@ -24,27 +41,29 @@) is elided in the source
    # view here — the loop filling `lengths` is not visible; restore it from
    # the repository.


if __name__ == "__main__":
    complete_stsb_dataset = STSB_Dataset('../data/stsbenchmark/sts-test.csv').as_dataframe  # adjust used dataset here
    dataset_with_lengths = add_length_column(complete_stsb_dataset)
    average_length = dataset_with_lengths['total length'].mean()
    standard_deviation = dataset_with_lengths['total length'].std()
    print(f"Average length: {average_length}, standard deviation: {standard_deviation}")  # first print precise values
    average_length = math.ceil(average_length)  # then use next highest int to get usable values
    standard_deviation = math.ceil(standard_deviation)
    for length, occurrence in sorted(get_length_distribution(dataset_with_lengths).items()):
        print(f"Length: {length} | Occurrence: {occurrence}")
    # create data sets corresponding to sentence length: short/long are at
    # least one (ceiled) standard deviation below/above the (ceiled) mean
    short_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] <= (average_length - standard_deviation)]
    medium_length_sentences = dataset_with_lengths.loc[
        (dataset_with_lengths['total length'] > (average_length - standard_deviation)) &
        (dataset_with_lengths['total length'] < (average_length + standard_deviation))]
    long_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] >= (average_length + standard_deviation)]
    datasets = [short_sentences, medium_length_sentences, long_sentences]
    for d in datasets:
        # print("Entries in data set: {}".format(len(d)))
        ft_model = SBERT_Model("Fine-tuned Model", '../models/stsb-bert-large/', d)
        ft_model.print_statistics()