Commit bc0008f5 authored by Aileen Reichelt

Merge remote-tracking branch 'origin/master'

parents a4161080 daa6a697

masking.py (new file, 0 → 100644, +67 −0)
import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM
import logging
logging.basicConfig(level=logging.INFO)

USE_GPU = 1
# Device configuration
device = torch.device('cuda' if (torch.cuda.is_available() and USE_GPU) else 'cpu')

# Load pre-trained model tokenizer (vocabulary)
pretrained_model = 'bert-base-cased'
# pretrained_model = 'models/stsb-bert-large/0_BERT/'
tokenizer = BertTokenizer.from_pretrained(pretrained_model)
text = "[CLS] A man is playing a trumpet [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
mask1 = 4  # position of 'playing' in the tokenized sentence
#mask2 = 14  # extra positions for longer, multi-mask inputs
#mask3 = 15
tokenized_text[mask1] = '[MASK]'
#tokenized_text[mask2] = '[MASK]'
#tokenized_text[mask3] = '[MASK]'
# assert tokenized_text == ['[CLS]', 'A', 'man', 'is', '[MASK]', 'a', 'trumpet', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Single-sentence input, so every position belongs to segment A (see the BERT paper)
segments_ids = [0] * len(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])


# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(pretrained_model)
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device)

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
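    # predictions: [batch_size, sequence_length, vocab_size] logits over the vocabulary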

# get predicted tokens

#prediction for mask1
predicted_index = torch.argmax(predictions[0, mask1]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)  # most likely filler for the masked token


#prediction for mask2
#predicted_index = torch.argmax(predictions[0, mask2]).item()
#predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
#print(predicted_token)


#prediction for mask3
#predicted_index = torch.argmax(predictions[0, mask3]).item()
#predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
#print(predicted_token)
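
# For a quick sanity check it can help to look beyond the single argmax.
# A minimal sketch under the same setup as above (torch.topk is standard
# PyTorch; variable names follow the script): print the five most likely
# fillers for the masked position.
top_values, top_indices = torch.topk(predictions[0, mask1], k=5)
top_tokens = tokenizer.convert_ids_to_tokens(top_indices.tolist())
for token, score in zip(top_tokens, top_values.tolist()):
    print(f"{token}\t{score:.3f}")  # raw logits, highest first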

(second file, +122 −67; the 67 removed lines are identical to masking.py above)

from STSB_Dataset import STSB_Dataset
import pandas as pd
import spacy


df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe

nlp = spacy.load("en_core_web_sm")

sentences1 = df['sentence1'].tolist()
sentences2 = df['sentence2'].tolist()

sentences1_nlp = [nlp(sent) for sent in sentences1]
sentences2_nlp = [nlp(sent) for sent in sentences2]

sent1_roots = []
sent2_roots = []

sent1_root_children = []
sent2_root_children = []

for sent1 in sentences1_nlp:
    # exactly one ROOT per parsed sentence; keep its text, token and coarse POS
    root1 = [(token.text, token, token.pos_) for token in sent1 if token.dep_ == "ROOT"]
    sent1_roots.append((root1[0][0], root1[0][2]))
    root_children1 = [(child.text, child.pos_) for child in root1[0][1].children]
    sent1_root_children.append(root_children1)

for sent2 in sentences2_nlp:
    root2 = [(token.text, token, token.pos_) for token in sent2 if token.dep_ == "ROOT"]
    sent2_roots.append((root2[0][0], root2[0][2]))
    root_children2 = [(child.text, child.pos_) for child in root2[0][1].children]
    sent2_root_children.append(root_children2)

assert len(sent1_roots) == len(sent2_roots)
assert len(sent1_root_children) == len(sent2_root_children)
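
# For orientation, a small illustration of the structures built above
# (demo_* names are illustrative only; exact tags depend on the
# en_core_web_sm model version):
demo_doc = nlp("A man is playing a trumpet")
demo_root = [token for token in demo_doc if token.dep_ == "ROOT"][0]
print(demo_root.text, demo_root.pos_)                  # e.g. playing VERB
print([(c.text, c.pos_) for c in demo_root.children])  # e.g. [('man', 'NOUN'), ('is', 'AUX'), ('trumpet', 'NOUN')]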


def mask_token(pos_tag):
    """Mask the root if it carries pos_tag, else the first root child that does;
    drop rows where neither sentence offers a match."""
    del_rows = []     # rows to be deleted

    for i in range(len(sent1_roots)):
        if sent1_roots[i][1] == pos_tag:
            df.iloc[i, 0] = df.iloc[i, 0].replace(sent1_roots[i][0], "[MASK]")
        else:
            for text, pos in sent1_root_children[i]:
                if pos == pos_tag:
                    df.iloc[i, 0] = df.iloc[i, 0].replace(text, "[MASK]")
                    break
            else:  # no child matched (also covers roots with no children at all)
                del_rows.append(i)

    for i in range(len(sent2_roots)):
        if sent2_roots[i][1] == pos_tag:
            df.iloc[i, 1] = df.iloc[i, 1].replace(sent2_roots[i][0], "[MASK]")
        else:
            for text, pos in sent2_root_children[i]:
                if pos == pos_tag:
                    df.iloc[i, 1] = df.iloc[i, 1].replace(text, "[MASK]")
                    break
            else:
                if i not in del_rows:
                    del_rows.append(i)

    df.drop(del_rows, inplace=True)
    return df
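
# A self-contained sketch of the same root-masking idea, for illustration only
# (mask_root_demo is a hypothetical helper, not part of the pipeline above):
# it handles a single sentence instead of the module-level df and parse lists.
def mask_root_demo(sentence, pos_tag):
    doc = nlp(sentence)
    root = [t for t in doc if t.dep_ == "ROOT"][0]
    if root.pos_ == pos_tag:
        return sentence.replace(root.text, "[MASK]")
    for child in root.children:
        if child.pos_ == pos_tag:
            return sentence.replace(child.text, "[MASK]")
    return None  # caller would drop this row

print(mask_root_demo("A man is playing a trumpet", "VERB"))
# expected: A man is [MASK] a trumpet (assuming the parser roots the sentence at 'playing')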


def mask_first(pos_tag):
    """Mask the first token tagged pos_tag in each sentence; drop rows without one."""
    del_rows = []

    for i, sent1 in enumerate(sentences1_nlp):
        pos_tags_sent = [token.pos_ for token in sent1]
        pos_tag_tokens = [token.text for token in sent1]

        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 0] = df.iloc[i, 0].replace(pos_tag_tokens[e], "[MASK]")
        elif i not in del_rows:
            del_rows.append(i)

    for i, sent2 in enumerate(sentences2_nlp):
        pos_tags_sent = [token.pos_ for token in sent2]
        pos_tag_tokens = [token.text for token in sent2]

        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 1] = df.iloc[i, 1].replace(pos_tag_tokens[e], "[MASK]")
        elif i not in del_rows:
            del_rows.append(i)

    df.drop(del_rows, inplace=True)
    return df
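
# Quick check of the first-match strategy on a toy sentence (illustrative;
# assumes en_core_web_sm tags 'Two' as NUM):
demo_sent = nlp("Two dogs play in the snow")
demo_tags = [token.pos_ for token in demo_sent]
if "NUM" in demo_tags:
    first = demo_sent[demo_tags.index("NUM")]
    print("Two dogs play in the snow".replace(first.text, "[MASK]"))
# expected: [MASK] dogs play in the snow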


#print(sent1_roots)
#print(sent2_roots)
#print(sent1_root_children)
#print(sent2_root_children)

# mask_token("VERB")
print(mask_first("NUM"))