# --- src/attention_visualization.py (reconstructed post-diff version) ---
"""Creation of attention heat maps

This script loads a pretrained BERT model, extracts its used attention for the
embedding of a sentence specified via command line argument and then creates a
heat map for the amount of attention each token in the sentence receives.
"""
import torch
from transformers import BertConfig, BertTokenizer, BertModel
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# load model and tokenizer; output_attentions makes the model return per-layer
# attention tensors alongside the embeddings
config = BertConfig.from_pretrained('../models/stsb-bert-large/0_BERT')
config.output_attentions = True
model = BertModel.from_pretrained('../models/stsb-bert-large/0_BERT', config=config).to('cpu')
tokenizer = BertTokenizer.from_pretrained('../models/stsb-bert-large/0_BERT')

# tokenize text given via command line argument
text = sys.argv[1]
tok = tokenizer.tokenize(text)
pos = 1  # change this to change which token you would like to see the attention for

# extract attention and change format for plotting
ids = torch.tensor(tokenizer.convert_tokens_to_ids(tok)).unsqueeze(0).to('cpu')
with torch.no_grad():
    output = model(ids)

# NOTE(review): a diff chunk (@@ -23,8 +32,7 @@) is elided in the source view
# here — the original lines deriving `attentions` from `output` are not
# visible and have NOT been reconstructed; restore them from the repository.

seqlen = len(attentions)  # NOTE(review): unused in the visible code — confirm before removing
attentions_pos = attentions[pos]
print(f'Attention weights for token {tok[pos]}')

# plot attention averaged over attention heads (mean over dim 0)
avg_attention = attentions_pos.mean(dim=0)
sns.heatmap(avg_attention, vmin=0, vmax=1, xticklabels=tok)


# --- src/sentence_length.py (reconstructed post-diff version) ---
"""Test of effects of sentence length

This script measures the length of sentences in a data set, gives statistics
about the length (average, standard deviation, distribution) and most
importantly creates three sub-datasets according to the length of a data
point's sentences. This is done to measure the model's performance on
sentences of different lengths.
"""
from src.SBERT_Model import SBERT_Model
from STSB_Dataset import STSB_Dataset
import math
import pandas


def add_length_column(dataset) -> pandas.DataFrame:
    """Creates new column containing length of sentences

    A new column is appended to the given dataframe. It consists of the length
    of both sentences of this row combined.
    """
    # word counts per sentence, obtained by splitting on single spaces
    sentence_1_lengths = dataset['sentence1'].apply(lambda sentence: len(sentence.split(" ")))
    sentence_2_lengths = dataset['sentence2'].apply(lambda sentence: len(sentence.split(" ")))
    dataset['total length'] = sentence_1_lengths
    # NOTE(review): a diff chunk (@@ -12,6 +24,11 @@) is elided in the source
    # view here — presumably `sentence_2_lengths` is added in and the dataframe
    # returned; restore the missing lines from the repository.


def get_length_distribution(dataset) -> dict:
    """Counts how often each sentence length appears

    For the given dataframe, first combines both 'sentence' columns, then
    counts length of this column for each row. Returns counts of every found
    length as a dict.
    """
    dataset['sentences combined'] = dataset['sentence1'] + " " + dataset['sentence2']
    sentences = dataset['sentences combined'].to_numpy()
    lengths = dict()
    # NOTE(review): a diff chunk (@@ -24,27 +41,29 @@) is elided in the source
    # view here — the loop filling `lengths` is not visible; restore it from
    # the repository.


if __name__ == "__main__":
    complete_stsb_dataset = STSB_Dataset('../data/stsbenchmark/sts-test.csv').as_dataframe  # adjust used dataset here
    dataset_with_lengths = add_length_column(complete_stsb_dataset)
    average_length = dataset_with_lengths['total length'].mean()
    standard_deviation = dataset_with_lengths['total length'].std()
    print(f"Average length: {average_length}, standard deviation: {standard_deviation}")  # first print precise values
    average_length = math.ceil(average_length)  # then use next highest int to get usable values
    standard_deviation = math.ceil(standard_deviation)
    for length, occurrence in sorted(get_length_distribution(dataset_with_lengths).items()):
        print(f"Length: {length} | Occurrence: {occurrence}")
    # create data sets corresponding to sentence length: short/long are at
    # least one (ceiled) standard deviation below/above the (ceiled) mean
    short_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] <= (average_length - standard_deviation)]
    medium_length_sentences = dataset_with_lengths.loc[
        (dataset_with_lengths['total length'] > (average_length - standard_deviation)) &
        (dataset_with_lengths['total length'] < (average_length + standard_deviation))]
    long_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] >= (average_length + standard_deviation)]
    datasets = [short_sentences, medium_length_sentences, long_sentences]
    for d in datasets:
        # print("Entries in data set: {}".format(len(d)))
        ft_model = SBERT_Model("Fine-tuned Model", '../models/stsb-bert-large/', d)
        ft_model.print_statistics()
# --- src/attention_visualization.py (reconstructed post-diff version) ---
"""Creation of attention heat maps

This script loads a pretrained BERT model, extracts its used attention for the
embedding of a sentence specified via command line argument and then creates a
heat map for the amount of attention each token in the sentence receives.
"""
import torch
from transformers import BertConfig, BertTokenizer, BertModel
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# load model and tokenizer; output_attentions makes the model return per-layer
# attention tensors alongside the embeddings
config = BertConfig.from_pretrained('../models/stsb-bert-large/0_BERT')
config.output_attentions = True
model = BertModel.from_pretrained('../models/stsb-bert-large/0_BERT', config=config).to('cpu')
tokenizer = BertTokenizer.from_pretrained('../models/stsb-bert-large/0_BERT')

# tokenize text given via command line argument
text = sys.argv[1]
tok = tokenizer.tokenize(text)
pos = 1  # change this to change which token you would like to see the attention for

# extract attention and change format for plotting
ids = torch.tensor(tokenizer.convert_tokens_to_ids(tok)).unsqueeze(0).to('cpu')
with torch.no_grad():
    output = model(ids)

# NOTE(review): a diff chunk (@@ -23,8 +32,7 @@) is elided in the source view
# here — the original lines deriving `attentions` from `output` are not
# visible and have NOT been reconstructed; restore them from the repository.

seqlen = len(attentions)  # NOTE(review): unused in the visible code — confirm before removing
attentions_pos = attentions[pos]
print(f'Attention weights for token {tok[pos]}')

# plot attention averaged over attention heads (mean over dim 0)
avg_attention = attentions_pos.mean(dim=0)
sns.heatmap(avg_attention, vmin=0, vmax=1, xticklabels=tok)


# --- src/sentence_length.py (reconstructed post-diff version) ---
"""Test of effects of sentence length

This script measures the length of sentences in a data set, gives statistics
about the length (average, standard deviation, distribution) and most
importantly creates three sub-datasets according to the length of a data
point's sentences. This is done to measure the model's performance on
sentences of different lengths.
"""
from src.SBERT_Model import SBERT_Model
from STSB_Dataset import STSB_Dataset
import math
import pandas


def add_length_column(dataset) -> pandas.DataFrame:
    """Creates new column containing length of sentences

    A new column is appended to the given dataframe. It consists of the length
    of both sentences of this row combined.
    """
    # word counts per sentence, obtained by splitting on single spaces
    sentence_1_lengths = dataset['sentence1'].apply(lambda sentence: len(sentence.split(" ")))
    sentence_2_lengths = dataset['sentence2'].apply(lambda sentence: len(sentence.split(" ")))
    dataset['total length'] = sentence_1_lengths
    # NOTE(review): a diff chunk (@@ -12,6 +24,11 @@) is elided in the source
    # view here — presumably `sentence_2_lengths` is added in and the dataframe
    # returned; restore the missing lines from the repository.


def get_length_distribution(dataset) -> dict:
    """Counts how often each sentence length appears

    For the given dataframe, first combines both 'sentence' columns, then
    counts length of this column for each row. Returns counts of every found
    length as a dict.
    """
    dataset['sentences combined'] = dataset['sentence1'] + " " + dataset['sentence2']
    sentences = dataset['sentences combined'].to_numpy()
    lengths = dict()
    # NOTE(review): a diff chunk (@@ -24,27 +41,29 @@) is elided in the source
    # view here — the loop filling `lengths` is not visible; restore it from
    # the repository.


if __name__ == "__main__":
    complete_stsb_dataset = STSB_Dataset('../data/stsbenchmark/sts-test.csv').as_dataframe  # adjust used dataset here
    dataset_with_lengths = add_length_column(complete_stsb_dataset)
    average_length = dataset_with_lengths['total length'].mean()
    standard_deviation = dataset_with_lengths['total length'].std()
    print(f"Average length: {average_length}, standard deviation: {standard_deviation}")  # first print precise values
    average_length = math.ceil(average_length)  # then use next highest int to get usable values
    standard_deviation = math.ceil(standard_deviation)
    for length, occurrence in sorted(get_length_distribution(dataset_with_lengths).items()):
        print(f"Length: {length} | Occurrence: {occurrence}")
    # create data sets corresponding to sentence length: short/long are at
    # least one (ceiled) standard deviation below/above the (ceiled) mean
    short_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] <= (average_length - standard_deviation)]
    medium_length_sentences = dataset_with_lengths.loc[
        (dataset_with_lengths['total length'] > (average_length - standard_deviation)) &
        (dataset_with_lengths['total length'] < (average_length + standard_deviation))]
    long_sentences = dataset_with_lengths.loc[
        dataset_with_lengths['total length'] >= (average_length + standard_deviation)]
    datasets = [short_sentences, medium_length_sentences, long_sentences]
    for d in datasets:
        # print("Entries in data set: {}".format(len(d)))
        ft_model = SBERT_Model("Fine-tuned Model", '../models/stsb-bert-large/', d)
        ft_model.print_statistics()