Commit 1bb38a02 authored by kulcsar's avatar kulcsar
Browse files

doc changes in main and train

parent 3017363c
Loading
Loading
Loading
Loading
+20 −82
Original line number Diff line number Diff line
@@ -22,16 +22,8 @@ torch.cuda.empty_cache()

#with torch.autocast("cuda"):

def train(model, name, imdb, seed,gradient_accumulation_steps,mixup, threshold, lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size):
	"""Write Train loop for model with certain train dataset"""
	#set_seed(seed)
	#if model_name[0] == "b":
	#	model=BertForWordClassification.from_pretrained(model_name).to("cuda")
	#elif model_name[0] == "r":
	#	model=RobertaForWordClassification.from_pretrained(model_name),to("cuda")
	print("batch size: ", batch_size)
	print("test batch size: ", test_batch_size)
	print("mix up: ", mixup)
def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size):
	"""Train loop for models. Iterates over epochs and batches and gives inputs to model. After training, call evaluation.py for evaluation of finetuned model."""
	model.train().to("cuda")
	train_sampler = RandomSampler(train_dataset)
	train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
@@ -41,14 +33,7 @@ def train(model, name, imdb, seed,gradient_accumulation_steps,mixup, threshold,
	lr_scheduler=get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=num_training_steps)

	model.zero_grad()
	#progress_bar=tqdm(range(num_training_steps))
	print("imdb: ", imdb)
	print("tmix: ", tmix)
	print("mixlayer:", mixlayer)
	for epoch in range(num_epochs):
		#for param_tensor in model.state_dict():
		#	print(param_tensor, "\t", model.state_dict()[param_tensor])
		print("Epoche: ", epoch)
		index=0
		
		for batch in train_dataloader:
@@ -126,56 +111,29 @@ def train(model, name, imdb, seed,gradient_accumulation_steps,mixup, threshold,
				start_positions=batch[2]
				end_positions=batch[3]
			outputs=model(**inputs)
			#print("outputs: ", outputs)
			#print("outputs 0: ", outputs[0])
			loss=outputs[0]
			#print("length of outputs; ", len(outputs))
			#for i in range(len(outputs)):
			print("Loss: ", loss)
			if mixup == True: #and epoch>=mixup_epoch-1:
				#loss.backward(retain_graph=True)
			loss.backward()
				#print("epoch: {0}, retained".format(epoch))
			else:
				loss.backward()
			#if (index+1)%gradient_accumulation_steps==0:
			optimizer.step()
			lr_scheduler.step()
			optimizer.zero_grad()
			model.zero_grad()
			#	#	print("outputs {0}: {1}".format(i, outputs[i].size()))
			if epoch==mixepoch: #also make choosing epoch for tmix available

			if epoch==mixepoch:
				print("mixepoch")
				if mixup == True:
					print("length of outputs: ", len(outputs))
					print("outputs: ", outputs)
					#calculate new last hidden states and predictions(logits)
					new_matrix_batch, new_labels_batch = mixup_function(outputs[2], labels, lambda_value, threshold)
               		#for matrix in new_matrix_batchi
					print("new matrix batch size: ", new_matrix_batch.size())

					new_matrix_batch.to("cuda")
					new_labels_batch.to("cuda")

					span_output=torch.randn(new_matrix_batch.shape[0], new_matrix_batch.shape[-1]).to("cuda")
					for i in range(new_matrix_batch.shape[0]):
						span_output[i]=new_matrix_batch[i][start_positions[i]:end_positions[i]].mean(dim=0)
					#print("span output size: ", span_output.size())
					#print("span output: ", span_output)
					logits=model.classifier(span_output.detach()) #target_value?
						
					#print("logits: ", logits)
					print("logits shape: ", list(logits.shape))
					# print("Newlabels: ", new_labels_batch)
					print("labels shape: ", list(new_labels_batch.shape))

					logits=model.classifier(span_output.detach())
					logits = logits.view(-1, 2).to("cuda")
					print("logits: ", logits)
					target = new_labels_batch.view(-1).to("cuda")
					print("Newlabels: ", new_labels_batch)
					loss_2 = cross_entropy(logits, target, lambda_value)
					#loss_2 = SoftCrossEntropyLoss(logits.view(-1, 2).to("cuda"), new_labels_batch.view(-1).to("cuda"))
					#loss_2 = torch.nn.functional.cross_entropy(preds, target.long())
					print("MixUp Loss: ", loss_2)
					
					#update entire model
					loss_2.backward()
					optimizer.step()
@@ -183,20 +141,13 @@ def train(model, name, imdb, seed,gradient_accumulation_steps,mixup, threshold,
					optimizer.zero_grad()
					model.zero_grad()
	

				#print(outputs[2].size())

			#print(outputs[0].size())
			#progress_bar.update(1)
		#print("one epoch done")
	
	torch.save(model, "./saved_models/bert_baseline.pt")

	#print(model_name)
	#evaluate trained model
	evaluation_test = evaluation.evaluate_model(model, name,  test_dataset, learning_rate, test_batch_size, imdb)
	evaluation_train = evaluation.evaluate_model(model, name, train_dataset, learning_rate, test_batch_size, imdb)
 
	print("DEV: ", evaluation_test)
	print("TEST: ", evaluation_test)
	print("TRAIN: ", evaluation_train)

	return evaluation_test, evaluation_train
@@ -243,49 +194,36 @@ def cross_entropy(logits, target, l):



def mixup_function(batch_of_matrices, batch_of_labels, l, t):
def mixup_function(batch_of_matrices, batch_of_labels, l):
	"""Function to perform mixup on a batch of matrices and labels with a given lambda
	"""
	runs = math.floor(batch_of_matrices.size()[0]/2)
	counter=0
	results=[]
	result_labels=[]
	for i in range(runs):
		print("doing interpolation with lambda: {0} and threshold: {1}...".format(l, t))
		#get matrices and labels out of batch
		matrix1=batch_of_matrices[counter]
		label1=batch_of_labels[counter]
		matrix2=batch_of_matrices[counter+1]
		label2=batch_of_labels[counter+1]
		new_matrix, new_label=interpolate(matrix1, label1, matrix2, label2, l, t)

		#do interpolation
		new_matrix=matrix1*l + (1-l)*matrix2
		new_label=l*label1 + (1-l)*label2
		
		if new_matrix != None:
			results.append(new_matrix)
			result_labels.append(new_label)
		counter+=2
	results=torch.stack(results)
	result_labels= torch.stack(result_labels) #torch.LongTensor(result_labels)
	#print("mixup done")
	return results, result_labels

def interpolate(matrix1, label1, matrix2, label2, l, threshold):
	"""Mix two examples by convex combination with weight l.

	Returns a tuple (mixed_matrix, mixed_label) where each output is
	l * first + (1 - l) * second. The threshold parameter is accepted
	for interface compatibility but is not used here (the old
	label-thresholding logic is disabled).
	"""
	complement = 1 - l
	mixed_matrix = l * matrix1 + complement * matrix2
	mixed_label = l * label1 + complement * label2
	return mixed_matrix, mixed_label
	
def train_salami(model, seed, train_set, test_set, batch_size, test_batch_size, learning_rate, epochs):
	"""Train loop of the salami group"""
	results=[]
	#for num_run, seed in enumerate(random.sample(range(1, 100), num_runs)):
		#if model_name[0]=="b":
		#	model=BertForWordClassification.from_pretrained(model_name)
		#else:
		#	model=RobertaForWordClassification.from_pretrained(model_name)

		#set_seed(seed)
	training_args = TrainingArguments(
		output_dir="./results",  # output directory
		num_train_epochs=epochs,  # total # of training epochs
+5 −30
Original line number Diff line number Diff line
@@ -11,10 +11,6 @@ from typing import List


def run(raw_args):
	#print("parsing")
	#args=_parse_args(raw_args)
	#print("parsed arguments")

	#load test and train dataset as well as tokenizers and models...

	#Datasets
@@ -58,9 +54,6 @@ def run(raw_args):
		print("train dataset preprocessing ")        
		print(args.tcontext)
		train_dataset=preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext)
		
		#print("test dataset preprocesssing ")
		#print(args.vcontext)
		test_dataset=preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False) 
	
	elif args.tokenizer=="li":
@@ -70,15 +63,16 @@ def run(raw_args):
	else:
		print("non eligible tokenizer selected")

	#train...
	#train&evaluate...
	print("training..")
	if args.train_loop=="swp":
		evaluation_test, evaluation_train = train.train(model, args.architecture, args.imdb, args.random_seed, args.gradient_accumulation_steps, args.mix_up, args.threshold, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size)
		evaluation_test, evaluation_train = train.train(model, args.architecture, args.imdb, args.random_seed, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size)
	elif args.train_loop=="salami":
		evaluation_test = train.train_salami(model,args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
	else:
		print("no eligible train loop selected")
	#(evaluate... is done internally) but could maybe be implemented here to make average over multiple random seeds
	
	#save
	if isinstance(args.save_directory, str): 
		with open(args.save_directory, "x") as f:
			f.write(str(args))
@@ -139,24 +133,19 @@ if __name__ == "__main__":
		"-tc",
		"--tcontext",
		action="store_true",
		#default=False,
		#type=bool,
		help="whether or not to preprocess train set with context")

	parser.add_argument(
		"-vc",
		"--vcontext",
		#default=False,
		#type=bool,
        action="store_true",
		help="whether or not to preprocess the test set with context")

	parser.add_argument(
		"--masking",
		#default=False,
		#type=bool,
		action="store_true",
		help="whether or not to mask the target word")
	
	parser.add_argument(
		"-max",
		"--max_length",
@@ -196,26 +185,12 @@ if __name__ == "__main__":
		type=int,
		default=32)

	parser.add_argument(
		"-gras",
		"--gradient_accumulation_steps",
		help="gradient accumulation steps for training",
		type=int,
		default=1)

	parser.add_argument(
		"-mixup",
		"--mix_up",
		help="whether or not to apply mixup during training",
		action="store_true")

	parser.add_argument(
		"-threshold",
		"--threshold",
		help="specifies the value for mixup threshold",
		type=float,
		default=0.05)

	parser.add_argument(
		"-lambda",
		"--lambda_value",