masking.py 0 → 100644 +67 −0

import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging

logging.basicConfig(level=logging.INFO)

USE_GPU = 1

# Device configuration
device = torch.device('cuda' if (torch.cuda.is_available() and USE_GPU) else 'cpu')

# Load pre-trained model tokenizer (vocabulary)
pretrained_model = 'bert-base-cased'
# pretrained_model = 'models/stsb-bert-large/0_BERT/'
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

text = "[CLS] A man is playing a trumpet [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
mask1 = 4
# mask2 = 14
# mask3 = 15
tokenized_text[mask1] = '[MASK]'
# tokenized_text[mask2] = '[MASK]'
# tokenized_text[mask3] = '[MASK]'
# assert tokenized_text == ['[CLS]', 'Who', 'was', 'Jim', 'Hen', '##son', '?', '[SEP]', 'Jim', 'Hen', '##son', 'was', 'a', '[MASK]', '[MASK]', '[MASK]', '[SEP]']

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated with the 1st and 2nd sentences (see the BERT paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(pretrained_model)
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device)

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# Prediction for mask1
predicted_index = torch.argmax(predictions[0, mask1]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)  # returns "baseball"

# Prediction for mask2
# predicted_index = torch.argmax(predictions[0, mask2]).item()
# predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# print(predicted_token)  # returns "actor"

# Prediction for mask3
# predicted_index = torch.argmax(predictions[0, mask3]).item()
# predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# print(predicted_token)  # returns "."
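One fragility in the demo above: `mask1 = 4` and the eight-element `segments_ids` are hard-coded to this exact sentence. As a minimal sketch (the helper name `predict_masks` is illustrative, not part of this change), both can be derived from the tokenized input, using only the pytorch_transformers calls already shown:

import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM

def predict_masks(text, tokenizer, model, device):
    # `text` already contains [CLS]/[SEP]/[MASK]; the tokenizer keeps them intact
    tokens = tokenizer.tokenize(text)
    mask_positions = [i for i, t in enumerate(tokens) if t == '[MASK]']
    indexed = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([indexed]).to(device)
    # Single-sentence input: segment ids are all zeros, sized to the input
    segments_tensors = torch.tensor([[0] * len(indexed)]).to(device)
    with torch.no_grad():
        predictions = model(tokens_tensor, token_type_ids=segments_tensors)[0]
    # Return the top-1 token at every masked position
    return [tokenizer.convert_ids_to_tokens([torch.argmax(predictions[0, p]).item()])[0]
            for p in mask_positions]

# e.g. predict_masks("[CLS] A man is [MASK] a trumpet [SEP]", tokenizer, model, device)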
probing.py +122 −67

(The 67 removed lines are the masked-LM demo shown above; this change moves that code verbatim into the new masking.py.)
from STSB_Dataset import STSB_Dataset
import pandas as pd
import spacy
import random

df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe
nlp = spacy.load("en_core_web_sm")

sentences1 = df['sentence1'].tolist()
sentences2 = df['sentence2'].tolist()

sentences1_nlp = [nlp(sent) for sent in sentences1]
sentences2_nlp = [nlp(sent) for sent in sentences2]

sent1_roots = []
sent2_roots = []
sent1_root_children = []
sent2_root_children = []

# Collect each sentence's dependency root and the root's children as
# (text, POS tag) pairs.
for sent1 in sentences1_nlp:
    root1 = [(token.text, token, token.pos_) for token in sent1 if token.dep_ == "ROOT"]
    sent1_roots.append((root1[0][0], root1[0][2]))
    root_children1 = [(child.text, child.pos_) for child in root1[0][1].children]
    sent1_root_children.append(root_children1)

for sent2 in sentences2_nlp:
    root2 = [(token.text, token, token.pos_) for token in sent2 if token.dep_ == "ROOT"]
    sent2_roots.append((root2[0][0], root2[0][2]))
    root_children2 = [(child.text, child.pos_) for child in root2[0][1].children]
    sent2_root_children.append(root_children2)

assert len(sent1_roots) == len(sent2_roots)
assert len(sent1_root_children) == len(sent2_root_children)


# NOTE: both helpers below mutate the module-level df in place and drop rows
# by index label (assuming the default RangeIndex, so position i matches the
# label i), so only call one of them per run.
def mask_token(pos_tag):
    """Mask the root token if it carries `pos_tag`, otherwise the first root
    child with that tag; drop sentence pairs where neither matches."""
    del_rows = []  # rows to be deleted
    for i in range(len(sent1_roots)):
        if sent1_roots[i][1] == pos_tag:
            df.iloc[i, 0] = df.iloc[i, 0].replace(sent1_roots[i][0], "[MASK]")
        else:
            for child in sent1_root_children[i]:
                if child[1] == pos_tag:
                    df.iloc[i, 0] = df.iloc[i, 0].replace(child[0], "[MASK]")
                    break
            else:  # no child matched (also covers an empty child list)
                del_rows.append(i)
    for i in range(len(sent2_roots)):
        if sent2_roots[i][1] == pos_tag:  # was sent1_roots[i][1]: checked the wrong sentence
            df.iloc[i, 1] = df.iloc[i, 1].replace(sent2_roots[i][0], "[MASK]")
        else:
            for child in sent2_root_children[i]:
                if child[1] == pos_tag:
                    df.iloc[i, 1] = df.iloc[i, 1].replace(child[0], "[MASK]")
                    break
            else:
                if i not in del_rows:
                    del_rows.append(i)
    df.drop(del_rows, inplace=True)
    return df


def mask_first(pos_tag):
    """Mask the first token carrying `pos_tag` in each sentence; drop pairs
    where either sentence has no such token."""
    del_rows = []
    for i, sent1 in enumerate(sentences1_nlp):
        pos_tags_sent = [token.pos_ for token in sent1]
        pos_tag_tokens = [token.text for token in sent1]
        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 0] = df.iloc[i, 0].replace(pos_tag_tokens[e], "[MASK]")
        elif i not in del_rows:
            del_rows.append(i)
    for i, sent2 in enumerate(sentences2_nlp):
        pos_tags_sent = [token.pos_ for token in sent2]
        pos_tag_tokens = [token.text for token in sent2]
        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 1] = df.iloc[i, 1].replace(pos_tag_tokens[e], "[MASK]")
        elif i not in del_rows:
            del_rows.append(i)
    df.drop(del_rows, inplace=True)
    return df


# print(sent1_roots)
# print(sent2_roots)
# print(sent1_root_children)
# print(sent2_root_children)

# mask_token("VERB")
print(mask_first("NUM"))
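The masked sentence pairs produced here are presumably meant to be scored with the BertForMaskedLM setup from masking.py. A minimal sketch of that next step, assuming [CLS]/[SEP] wrapping and one sentence at a time (`fill_masks` is an illustrative helper, not part of this change):

import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForMaskedLM.from_pretrained('bert-base-cased')
model.eval()

def fill_masks(sentence, tokenizer, model):
    # Wrap the masked sentence the same way masking.py does
    tokens = tokenizer.tokenize("[CLS] " + sentence + " [SEP]")
    indexed = tokenizer.convert_tokens_to_ids(tokens)
    with torch.no_grad():
        # token_type_ids defaults to all zeros for a single sentence
        predictions = model(torch.tensor([indexed]))[0]
    # Top-1 prediction at every masked position
    return [tokenizer.convert_ids_to_tokens([torch.argmax(predictions[0, i]).item()])[0]
            for i, t in enumerate(tokens) if t == '[MASK]']

masked_df = mask_first("NUM")
for sent in masked_df['sentence1'].head():
    print(sent, "->", fill_masks(sent, tokenizer, model))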