Commit 274ecc70 authored by schaper
Browse files

Initial commit

parent f4e0d08d
Loading
Loading
Loading
Loading

paws_training.py

0 → 100644
+53 −0
Original line number Diff line number Diff line
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses, InputExample
from datetime import datetime
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import torch
import math

# Training configuration.
# model_path is the local checkpoint that is fine-tuned further. model_name is
# derived from it so that the output directory is labelled with the model that
# was actually trained (it used to carry the unrelated, hard-coded name
# 'bert-base-nli-mean-tokens').
model_path = 'models/stsb-bert-large'
model_name = model_path.rsplit('/', 1)[-1]
train_batch_size = 128
num_epochs = 1
# The timestamp suffix gives every run its own output directory.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f'output/training_paws_continue_training-{model_name}-{run_timestamp}'

# Load the pre-trained sentence transformer model that training continues from
model = SentenceTransformer(model_path)

# Convert the PAWS splits into lists of InputExample objects
def _to_input_examples(dataset):
    """Build one InputExample per row of *dataset*.

    Each row must provide the keys 'sentence1', 'sentence2' and 'label'.
    The label is stored as a plain Python float: InputExample expects a
    number, and the batching code builds the label tensor itself, so no
    per-example torch tensor is needed.
    """
    return [
        InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['label']))
        for row in dataset
    ]


# Iterating the datasets directly yields one dict per row, which avoids
# materialising each split as a pandas DataFrame and walking it with iterrows().
train_dataset = load_dataset('paws', 'labeled_final', split='train')
train_samples = _to_input_examples(train_dataset)

dev_dataset = load_dataset('paws', 'labeled_final', split='validation')
dev_samples = _to_input_examples(dev_dataset)


# Training objective: shuffled batches of sentence pairs, scored with a
# cosine-similarity loss on the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model=model)
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)

# Development set: measure correlation between cosine score and gold labels
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='paws-dev')

# Configure the training: the first 10% of all training steps are warm-up steps
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

# Train the model. The dev evaluator is handed to fit() with
# evaluation_steps=1000, and the output is written to model_save_path.
fit_options = dict(
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_options)