Code/train.py  +20 −82

@@ -22,16 +22,8 @@
 torch.cuda.empty_cache()
 #with torch.autocast("cuda"):
-def train(model, name, imdb, seed, gradient_accumulation_steps, mixup, threshold, lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size):
-    """Write Train loop for model with certain train dataset"""
-    #set_seed(seed)
-    #if model_name[0] == "b":
-    #    model=BertForWordClassification.from_pretrained(model_name).to("cuda")
-    #elif model_name[0] == "r":
-    #    model=RobertaForWordClassification.from_pretrained(model_name),to("cuda")
-    print("batch size: ", batch_size)
-    print("test batch size: ", test_batch_size)
-    print("mix up: ", mixup)
+def train(model, name, imdb, seed, mixup, lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size):
+    """Train loop for models. Iterates over epochs and batches and gives inputs to model.
+    After training, call evaluation.py for evaluation of finetuned model."""
     model.train().to("cuda")
     train_sampler = RandomSampler(train_dataset)
     train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

@@ -41,14 +33,7 @@ def train(model, name, imdb, seed, gradient_accumulation_steps, mixup, threshold,
     lr_scheduler=get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=num_training_steps)
     model.zero_grad()
-    #progress_bar=tqdm(range(num_training_steps))
-    print("imdb: ", imdb)
-    print("tmix: ", tmix)
-    print("mixlayer:", mixlayer)
     for epoch in range(num_epochs):
-        #for param_tensor in model.state_dict():
-        #    print(param_tensor, "\t", model.state_dict()[param_tensor])
-        print("Epoche: ", epoch)
-        index=0
         for batch in train_dataloader:
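(Editorial aside, not part of the diff.) The loop skeleton above pairs an optimizer with transformers' get_scheduler using a linear schedule and 10 warmup steps. Below is a minimal, self-contained sketch of that wiring; the AdamW optimizer, the learning rate, and the num_training_steps formula are assumptions for illustration, since only the get_scheduler call itself is visible in the hunk.

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import get_scheduler

model = torch.nn.Linear(8, 2)                       # stand-in for the finetuned classifier
train_dataset = TensorDataset(torch.randn(64, 8))   # stand-in for the tokenized dataset
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)     # assumed formula
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # assumed optimizer/lr
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer,
                             num_warmup_steps=10, num_training_steps=num_training_steps)

for epoch in range(num_epochs):
    for (features,) in train_dataloader:
        loss = model(features).pow(2).mean()   # dummy loss, only to drive the update order
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()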
#print("logits: ", logits) print("logits shape: ", list(logits.shape)) # print("Newlabels: ", new_labels_batch) print("labels shape: ", list(new_labels_batch.shape)) logits=model.classifier(span_output.detach()) logits = logits.view(-1, 2).to("cuda") print("logits: ", logits) target = new_labels_batch.view(-1).to("cuda") print("Newlabels: ", new_labels_batch) loss_2 = cross_entropy(logits, target, lambda_value) #loss_2 = SoftCrossEntropyLoss(logits.view(-1, 2).to("cuda"), new_labels_batch.view(-1).to("cuda")) #loss_2 = torch.nn.functional.cross_entropy(preds, target.long()) print("MixUp Loss: ", loss_2) #update entire model loss_2.backward() optimizer.step() Loading @@ -183,20 +141,13 @@ def train(model, name, imdb, seed,gradient_accumulation_steps,mixup, threshold, optimizer.zero_grad() model.zero_grad() #print(outputs[2].size()) #print(outputs[0].size()) #progress_bar.update(1) #print("one epoch done") torch.save(model, "./saved_models/bert_baseline.pt") #print(model_name) #evaluate trained model evaluation_test = evaluation.evaluate_model(model, name, test_dataset, learning_rate, test_batch_size, imdb) evaluation_train = evaluation.evaluate_model(model, name, train_dataset, learning_rate, test_batch_size, imdb) print("DEV: ", evaluation_test) print("TEST: ", evaluation_test) print("TRAIN: ", evaluation_train) return evaluation_test, evaluation_train Loading Loading @@ -243,49 +194,36 @@ def cross_entropy(logits, target, l): def mixup_function(batch_of_matrices, batch_of_labels, l, t): def mixup_function(batch_of_matrices, batch_of_labels, l): """Function to perform mixup on a batch of matrices and labels with a given lambda """ runs = math.floor(batch_of_matrices.size()[0]/2) counter=0 results=[] result_labels=[] for i in range(runs): print("doing interpolation with lambda: {0} and threshold: {1}...".format(l, t)) #get matrices and labels out of batch matrix1=batch_of_matrices[counter] label1=batch_of_labels[counter] matrix2=batch_of_matrices[counter+1] label2=batch_of_labels[counter+1] new_matrix, new_label=interpolate(matrix1, label1, matrix2, label2, l, t) #do interpolation new_matrix=matrix1*l + (1-l)*matrix2 new_label=l*label1 + (1-l)*label2 if new_matrix != None: results.append(new_matrix) result_labels.append(new_label) counter+=2 results=torch.stack(results) result_labels= torch.stack(result_labels) #torch.LongTensor(result_labels) #print("mixup done") return results, result_labels def interpolate(matrix1, label1, matrix2, label2, l, threshold): new_matrix=(matrix1*l)+(matrix2 * (1-l)) new_label=(label1*l)+(label2*(1-l)) #if new_label > 0.5+threshold: # new_label=1 #elif new_label < 0.5-threshold: # new_label=0 #else: # print("in undefinded zone") # return None, None return new_matrix, new_label#torch.tensor([new_label]) def train_salami(model, seed, train_set, test_set, batch_size, test_batch_size, learning_rate, epochs): """Train loop of the salami group""" results=[] #for num_run, seed in enumerate(random.sample(range(1, 100), num_runs)): #if model_name[0]=="b": # model=BertForWordClassification.from_pretrained(model_name) #else: # model=RobertaForWordClassification.from_pretrained(model_name) #set_seed(seed) training_args = TrainingArguments( output_dir="./results", # output directory num_train_epochs=epochs, # total # of training epochs Loading main.py +5 −30 Original line number Diff line number Diff line Loading @@ -11,10 +11,6 @@ from typing import List def run(raw_args): #print("parsing") #args=_parse_args(raw_args) #print("parsed arguments") #load test and train 
main.py  +5 −30

@@ -11,10 +11,6 @@ from typing import List
 def run(raw_args):
-    #print("parsing")
-    #args=_parse_args(raw_args)
-    #print("parsed arguments")
     #load test and train dataset as well as tokenizers and models...
     #Datasets

@@ -58,9 +54,6 @@ def run(raw_args):
         print("train dataset preprocessing ")
         print(args.tcontext)
         train_dataset=preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext)
-        #print("test dataset preprocesssing ")
-        #print(args.vcontext)
         test_dataset=preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False)
     elif args.tokenizer=="li":

@@ -70,15 +63,16 @@ def run(raw_args):
     else:
         print("non eligible tokenizer selected")
-    #train...
+    #train&evaluate...
     print("training..")
     if args.train_loop=="swp":
-        evaluation_test, evaluation_train = train.train(model, args.architecture, args.imdb, args.random_seed, args.gradient_accumulation_steps, args.mix_up, args.threshold, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size)
+        evaluation_test, evaluation_train = train.train(model, args.architecture, args.imdb, args.random_seed, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size)
     elif args.train_loop=="salami":
         evaluation_test = train.train_salami(model, args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
     else:
         print("no eligible train loop selected")
+    #(evaluate... is done internally) but could maybe be implemented here to make average over multiple random seeds
     #save
     if isinstance(args.save_directory, str):
         with open(args.save_directory, "x") as f:
             f.write(str(args))

@@ -139,24 +133,19 @@ if __name__ == "__main__":
         "-tc", "--tcontext",
         action="store_true",
-        #default=False,
-        #type=bool,
         help="whether or not to preprocess train set with context")
     parser.add_argument(
         "-vc", "--vcontext",
-        #default=False,
-        #type=bool,
         action="store_true",
         help="whether or not to preprocess the test set with context")
     parser.add_argument(
         "--masking",
-        #default=False,
-        #type=bool,
         action="store_true",
         help="whether or not to mask the target word")
     parser.add_argument(
         "-max", "--max_length",

@@ -196,26 +185,12 @@ if __name__ == "__main__":
         type=int,
         default=32)
-    parser.add_argument(
-        "-gras", "--gradient_accumulation_steps",
-        help="gradient accumulation steps for training",
-        type=int,
-        default=1)
     parser.add_argument(
         "-mixup", "--mix_up",
         help="whether or not to apply mixup during training",
         action="store_true")
-    parser.add_argument(
-        "-threshold", "--threshold",
-        help="specifies the value for mixup threshold",
-        type=float,
-        default=0.05)
     parser.add_argument(
         "-lambda", "--lambda_value",
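(Editorial aside, not part of the diff.) Since -gras/--gradient_accumulation_steps and -threshold/--threshold are dropped from the parser and from the swp train call, command lines and scripts that passed them need updating. A minimal sketch of the mixup-related CLI surface after this change follows; only flags visible in the diff are reproduced, and the help text and default for --lambda_value are assumptions, as its definition is cut off in the hunk.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-mixup", "--mix_up",
    help="whether or not to apply mixup during training",
    action="store_true")
parser.add_argument(
    "-lambda", "--lambda_value",
    help="interpolation weight used by mixup_function",   # help text assumed
    type=float,
    default=0.4)                                          # default assumed
# removed by this change: -gras/--gradient_accumulation_steps and -threshold/--threshold

args = parser.parse_args(["-mixup", "-lambda", "0.6"])
print(args.mix_up, args.lambda_value)   # True 0.6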