Commit f3d18979 authored by rueh's avatar rueh
Browse files

Initial commit

parents
Loading
Loading
Loading
Loading

analysis.py

0 → 100644
+306 −0
Original line number Diff line number Diff line
import random
import pandas as pd
import numpy as np
import math
from scipy.stats import pearsonr

def get_scores(year, systems, type):
    """Read scores of one metric (or human) for every system.

    Parameters:
    year (int):  WMT year (22 or 23)
    systems (list of str): names of systems
    type (str): name of evaluation metric (automatic or human)

    Returns:
    scores (dict): list of type's scores per system

   """
    df = read_scores_data(year)
    # Score columns follow the "<system>-<type>" naming convention.
    return {system: df[system + "-" + type].tolist() for system in systems}

def get_allhyps(year, systems):
    """Read MT output translations (hyps) for all systems.

    Parameters:
    year (int):  WMT year (22 or 23)
    systems (list of str): names of systems

    Returns:
    hyps (dict): list of MT outputs per system

   """
    df = read_output_data(year)
    hyps = {}
    for system in systems:
        cleaned = []
        # Empty hypotheses come back from the csv as NaN floats; map them
        # to empty strings so downstream metrics receive valid text.
        for value in df[system].tolist():
            if isinstance(value, float) and math.isnan(value):
                cleaned.append("")
            else:
                cleaned.append(value)
        hyps[system] = cleaned
    return hyps

def get_allrefs(year):
    """Read all reference translations for one WMT year.

    Parameters:
    year (int):  WMT year (22 or 23)

    Returns:
    (list of str): reference translations

   """
    # References live in the "ref" column of the outputs file.
    return read_output_data(year)["ref"].tolist()

def read_output_data(year):
    """Read csv file with MT outputs and references.

    Parameters:
    year (int):  WMT year (22 or 23)

    Returns:
    (DataFrame): df with MT outputs and references

    Raises:
    ValueError: if year is not 22 or 23

   """
    if year == 22:
        path = "wmt22_outputs_meanscores.csv"
    elif year == 23:
        path = "wmt23_outputs_meanscores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    return pd.read_csv(path)

def read_scores_data(year):
    """Read csv file with automatic and human scores.

    Parameters:
    year (int):  WMT year (22 or 23)

    Returns:
    (DataFrame): df with automatic and human scores

    Raises:
    ValueError: if year is not 22 or 23

   """
    if year == 22:
        path = "wmt22_all_scores.csv"
    elif year == 23:
        path = "wmt23_all_scores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    return pd.read_csv(path)

def compute_corr(metric, human, automatic):
    """Compute Pearson correlation between one metric and human scores.

    Parameters:
    metric (str): name of the metric
    human (dict): list of human scores per system
    automatic (dict): list of metric scores per system

    Returns:
    r (float): Pearson correlation coefficient
    p (float): p-value
   """
    if metric == "corpusbleu":
        # corpusBLEU is a corpus-level score, so correlate the mean human
        # score of each system against that system's single corpus score
        # (stored at position 0 of its score list).
        means = [sum(scores) / len(scores) for scores in human.values()]
        corpus_scores = [automatic[system][0] for system in human]
        r, p = pearsonr(np.array(means), np.array(corpus_scores))
    else:
        # Segment-level metrics: concatenate the per-segment scores of
        # every system and correlate the two flat lists.
        flat_human = [score for system in human for score in human[system]]
        flat_auto = [score for system in human for score in automatic[system]]
        r, p = pearsonr(np.array(flat_human), np.array(flat_auto))
    return (r, p)

def max_min_diffs(metric, year, systems):
    """Find examples with the largest and smallest score differences
    between one metric and human judgements and append them to text files.

    Parameters:
    metric (str): name of the metric
    year (int):  WMT year (22 or 23)
    systems (list of str): names of systems

    Returns:
    None
   """
    # Rescale the metric to 0-100 first so differences are comparable
    # with the human scores.
    scores = standardize_scores(read_scores_data(year), systems, metric)
    outputs = read_output_data(year)

    max_diffs, min_diffs = find_max_min(scores, metric, systems)

    for diffs, path in ((max_diffs, "max_diff_instances.txt"),
                        (min_diffs, "min_diff_instances.txt")):
        write_diff_outputs(diffs, outputs, path, year, metric)

    return


def standardize_scores(df, systems, metric):
    """Rescale scores of one metric to the 0-100 range (if necessary).

    Parameters:
    df (DataFrame): df with all scores
    systems (list of str): names of systems
    metric (str): name of the metric

    Returns:
    df (DataFrame): df with rescaled scores (for one metric)
   """
    # Scale factors for metrics that are not already on a 0-100 scale:
    # bleurt/bertscore produce values around [0, 1], llama scores 0-6.
    if metric in ("bleurt", "bertscore"):
        factor = 100
    elif metric == "llama":
        factor = 100 / 6
    else:
        return df

    for system in systems:
        column = system + "-" + metric
        df[column] = df[column] * factor
    return df

def find_max_min(df_scores, metric, systems):
    """Compute absolute score differences between one metric and human and
    select the highest/lowest difference instances (15 or more, incl. ties).

    Parameters:
    df_scores (DataFrame): df with all scores
    metric (str): name of the metric
    systems (list of str): names of systems

    Returns:
    max_diffs (DataFrame): df with high difference examples (id, system, scores, score difference)
    min_diffs (DataFrame): df with low difference examples (id, system, scores, score difference)
   """
    frames = []

    # Stack the rows of every system into one long frame so differences
    # can be compared across systems.
    for system in systems:
        metric_col = df_scores[system + "-" + metric]
        human_col = df_scores[system + "-human"]
        frames.append(pd.DataFrame({
            "docid": df_scores["docid"],
            "segid": df_scores["segid"],
            "system": [system] * len(df_scores),
            metric: metric_col,
            "human": human_col,
            "diff": (metric_col - human_col).abs(),
        }))

    combined = pd.concat(frames, ignore_index=True)

    # keep="all" may yield more than 15 rows when values are tied.
    max_diffs = combined.nlargest(15, "diff", keep="all")
    min_diffs = combined.nsmallest(15, "diff", keep="all")
    return max_diffs, min_diffs

def write_diff_outputs(diffs, df_outputs, path, year, metric):
    """Append low/high difference examples to a txt file (with id, system,
    scores, difference, reference and candidate translation text).

    Parameters:
    diffs (DataFrame): df with low/high difference examples (id, system, scores, score difference)
    path (str): file path of txt output file
    year (int):  WMT year (22 or 23)
    df_outputs (DataFrame): df with MT outputs and references
    metric (str): name of the metric

    Returns:
    None
   """
    with open(path, 'a') as file:
        file.writelines([
            "############################################\n",
            "YEAR " + str(year) + " " + metric + "\n",
            "############################################\n",
        ])

        # Report at most 20 instances; positions are sampled without
        # replacement when there are more.
        if len(diffs) > 20:
            selected = set(random.sample(range(len(diffs)), 20))
        else:
            selected = set(range(0, len(diffs)))

        for position, (_, row) in enumerate(diffs.iterrows()):
            if position not in selected:
                continue

            docid = row["docid"]
            segid = row["segid"]
            system = row["system"]

            # Look up reference and candidate translation for this segment.
            mask = (df_outputs["docid"] == docid) & (df_outputs["segid"] == segid)
            if mask.any():
                ref = df_outputs.loc[mask, "ref"].values[0]
                hyp = df_outputs.loc[mask, system].values[0]
            else:
                ref = "No reference found"
                hyp = "No hypothesis found"

            file.writelines([
                docid + " " + str(segid) + ", system: " + system
                + ", human score: " + str(row["human"])
                + ", " + metric + " score: " + str(row[metric])
                + ", difference: " + str(row["diff"]) + "\n",
                "Ref: " + str(ref) + "\n",
                "Hyp: " + str(hyp) + "\n",
                "----------------------------------------------------------------\n",
            ])



if __name__ == "__main__":

    # corpusbleu is appended to the metric list: it is evaluated in the
    # same loop but correlated on the system level inside compute_corr.
    metrics = ["sentencebleu", "bleurt", "bertscore", "llama"]
    metrics.append("corpusbleu")

    years = [22, 23]

    # Fix: the original passed index=metrics.append("corpusbleu"), but
    # list.append returns None, so the frame was built with an empty index
    # and rows were only created implicitly via .loc. Append first, then
    # pass the completed list as the index.
    correlations = pd.DataFrame(columns=years, index=metrics)

    for year in years:
        # Participating systems differ between the two WMT editions.
        if year == 22:
            systems = ["dfki-mlt", "msmunich", "slattic", "upc", "baseline", "dfki-slt", "njupt"]
        elif year == 23:
            systems = ["baseline", "casia", "knowcomp", "ttic"]

        human_scores = get_scores(year, systems, "human")

        for metric in metrics:
            metric_scores = get_scores(year, systems, metric)

            correlations.loc[metric, year] = compute_corr(metric, human_scores, metric_scores)
            max_min_diffs(metric, year, systems)

    correlations.to_csv("correlations_metrics_human.csv")


    #try to reproduce original paper results with their z-values

    #human = [-0.24124, -0.28997, -0.31883, -0.12476, -0.33228, -0.16612, -0.32959]
    #bleurt = [0.102, 0.109, 0.830, 0.150, 0.127, 0.740, 0.111]
    #human1 = [2.07467, 2.00780, 0.52048, 0.43730, 0.33916, 0.20652, 0.04112]

    #print(pearsonr(human, bleurt))
    #print(pearsonr(human1, bleurt))
    
 No newline at end of file

automatic_scores.py

0 → 100644
+339 −0
Original line number Diff line number Diff line
import re
from string import Template
import ollama
import pandas as pd
import math
from data import sysdict22, sysdict23
from sacrebleu.metrics import BLEU
import evaluate
import os



#print(df22)
#print(df23)
#print(df22.mean(numeric_only=True))
#print(df23.mean(numeric_only=True))



#print(df22["ref"].tolist())


def get_data():
    """Read references and MT system outputs (hyps) from csv file.

    Parameters:

    Returns:
    ref22 (list of str): 2022 reference translations
    ref23 (list of str): 2023 reference translations
    hyp22 (dict): 2022 MT outputs per system
    hyp23 (dict): 2023 MT outputs per system

   """

    df22 = pd.read_csv("wmt22_outputs_meanscores.csv")
    df23 = pd.read_csv("wmt23_outputs_meanscores.csv")

    ref22 = df22["ref"].tolist()
    ref23 = df23["ref"].tolist()

    hyp22 = get_hyp(df22, sysdict22.keys())
    hyp23 = get_hyp(df23, sysdict23.keys())

    # Initialize csv files for all human and automatic scores with the
    # sentence id columns (first two columns of the outputs file).
    # Fix: each file is checked independently; previously the 2023 file
    # was only created when the 2022 file was also missing.
    for path, df in (("wmt22_all_scores.csv", df22),
                     ("wmt23_all_scores.csv", df23)):
        if not os.path.exists(path):
            df.iloc[:, :2].to_csv(path, index=None)

    return ref22, ref23, hyp22, hyp23


def get_hyp(df, systems):
    """Map empty hyps (NaN) to the empty string.

    Parameters:
    df (DataFrame): df with all hyps and refs
    systems (list of str): names of systems

    Returns:
    hyp (dict): cleaned hyps per system

   """
    hyp = {}
    for system in systems:
        values = []
        # pandas reads missing csv cells as float NaN.
        for value in df[system].tolist():
            if isinstance(value, float) and math.isnan(value):
                values.append("")
            else:
                values.append(value)
        hyp[system] = values
    return hyp

def read_df(year):
    """Read already saved scores from csv file.

    Parameters:
    year (int): WMT year (22 or 23)

    Returns:
    DataFrame: df with previously computed scores per system

    Raises:
    ValueError: if year is not 22 or 23

   """

    if year == 22:
        path = "wmt22_all_scores.csv"
    elif year == 23:
        path = "wmt23_all_scores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    return pd.read_csv(path)

def save_df(year, df):
    """Save score df as csv file.

    Parameters:
    year (int): WMT year (22 or 23)
    df (DataFrame): scores to persist

    Returns:
    None

    Raises:
    ValueError: if year is not 22 or 23

   """

    if year == 22:
        path = "wmt22_all_scores.csv"
    elif year == 23:
        path = "wmt23_all_scores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    df.to_csv(path, index=None)

def bleu_scores(year, refs, hyps):
    """Compute corpus and sentenceBLEU scores and add to score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)
    corpus_metric = BLEU()
    # effective_order=True is the sacrebleu setting for sentence-level BLEU.
    sentence_metric = BLEU(effective_order=True)

    for system, outputs in hyps.items():
        if system == "ref":
            continue

        corpus_result = corpus_metric.corpus_score(hypotheses=outputs, references=[refs])
        # The corpus-level score is a single number; store it in row 0 of
        # an otherwise empty column.
        df[system + "-corpusbleu"] = None
        df.loc[0, system + "-corpusbleu"] = corpus_result.score

        sentence_values = []
        for hypothesis, reference in zip(outputs, refs):
            result = sentence_metric.sentence_score(hypothesis=hypothesis, references=[reference])
            sentence_values.append(result.score)

        df[system + "-sentencebleu"] = sentence_values

        # Persist after each system so partial progress survives interruption.
        save_df(year, df)

def bleurt_scores(year, refs, hyps):
    """Compute BLEURT scores and add to score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)
    # BLEURT-20 checkpoint loaded through the HuggingFace evaluate wrapper.
    scorer = evaluate.load("bleurt", "BLEURT-20", module_type="metric")

    for system, outputs in hyps.items():
        if system == "ref":
            continue
        result = scorer.compute(predictions=outputs, references=refs)
        df[system + "-bleurt"] = result["scores"]
        print(system + " done")

    save_df(year, df)

def bertscores(year, refs, hyps):
    """Compute BERTscore and add to score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)
    scorer = evaluate.load("bertscore")
    for system, outputs in hyps.items():
        if system == "ref":
            continue
        # lang="de": references and hypotheses are German text; the F1
        # component of BERTScore is stored per segment.
        result = scorer.compute(predictions=outputs, references=refs, lang="de")
        df[system + "-bertscore"] = result["f1"]
    save_df(year, df)




def generate_prompt(template, reference, candidate_translation):
    """Fill the prompt template with reference and hypothesis text.

    Parameters:
    template (string.Template): prompt template with placeholders
    reference (str): single reference translation text
    candidate_translation (str): single candidate translation text

    Returns:
    (str): prompt text with reference and hypothesis filled in

   """
    return template.substitute(
        reference=reference,
        candidate_translation=candidate_translation,
    )


def extract_score(response_text):
    """Extract the numeric score from a Llama generated response.

    Parameters:
    response_text (str): generated response text

    Returns:
    (int or None): score between 0 and 6 if found, else None

   """
    # The model is instructed to answer in the format "Score: X" where X
    # is a single digit between 0 and 6.
    found = re.search(r'Score:\s*([0-6])', response_text)
    return int(found.group(1)) if found else None
    
def llama(year, refs, hyps):
    """Generate scores with Llama3 and add to score csv file

    Each hypothesis is scored against its reference on a 0-6 quality
    scale by a locally served Llama3 model (via ollama); extracted scores
    are stored in the "<system>-llama" column of the score csv.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)

    # Direct-assessment style rubric (0-6); the model is told to answer
    # strictly in the form "Score: X" so extract_score can parse it.
    prompt_template = Template(
        "Below you see a German reference sentence and its corresponding candidate translation in German that has been translated from Swiss-German Sign Language."
        "Score the candidate sentence translation with regard to the reference."
        "Assess the translation quality on a discrete scale using the quality levels described as follows:\n"
        "0: Nonsense/No meaning preserved: Nearly all information is lost between the translation and reference. Grammar is irrelevant.\n"
        "1: In between scores 0 and 2\n"
        "2: Some Meaning Preserved: The translation preserves some of the meaning of the references but misses significant parts. The narrative is hard to follow due to fundamental errors. Grammar may be poor.\n"
        "3: In between scores 2 and 4\n"
        "4: Most Meaning Preserved and Few Grammar Mistakes: The translation retains most of the meaning of the reference. It may have some grammar mistakes or minor inconsistencies.\n"
        "5: In between scores 4 and 6\n"
        "6: Perfect Meaning and Grammar: The meaning of the translation is completely consistent with the reference. The grammar is also correct.\n"
        "Please output only a single score in the format 'Score: X' (where 'X' is the number between 0 and 6).\n"
        "Reference: '${reference}'\n"
        "Candidate Translation: '${candidate_translation}'\n"
    )

    for sys in hyps.keys():
        if not sys == "ref":
        #if sys=="msmunich":
            print(sys)
            i=0
            llama_scores = []

            for hyp, ref in zip(hyps[sys], refs):
                i+=1

                prompt = generate_prompt(prompt_template, ref, hyp)
                # temperature 0 and a fixed seed for reproducible generations
                response = ollama.generate(model='llama3', options={'temperature': 0, 'seed': 42}, prompt=prompt)

                print(i, "Full response:", response["response"])
                #print(ref)
                #print(hyp)
                # extract_score may return None when no score is found
                score = extract_score(response["response"])
                llama_scores.append(score)
                print(i, "Extracted score:", score)

            # Checkpoint after every system: scoring is slow, so partial
            # results are persisted as soon as they are complete.
            df[sys+"-llama"] = llama_scores
            save_df(year, df)

def add_human_scores(year, hyps):
    """Add human evaluation scores to the automatic score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)

    if year == 22:
        outputs_path = "wmt22_outputs_meanscores.csv"
    elif year == 23:
        outputs_path = "wmt23_outputs_meanscores.csv"

    df_outputs = pd.read_csv(outputs_path)

    # Human judgements live in "<system>-score" columns of the outputs file.
    for system in hyps:
        if system != "ref":
            df[system + "-human"] = df_outputs[system + "-score"]

    save_df(year, df)

if __name__ == "__main__":

  # Load references and hypotheses once, then run every metric for both
  # WMT years. Each scoring function persists its results to the score
  # csv files as it completes, so the steps run independently.
  refs22, refs23, hyps22, hyps23 = get_data()

  bleu_scores(22, refs22, hyps22)
  bleu_scores(23, refs23, hyps23)

  bleurt_scores(23, refs23, hyps23)
  bleurt_scores(22, refs22, hyps22)

  bertscores(22, refs22, hyps22)
  bertscores(23, refs23, hyps23)

  llama(22, refs22, hyps22)
  llama(23, refs23, hyps23)

  add_human_scores(22, hyps22)
  add_human_scores(23, hyps23)


  
  
+6 −0
Original line number Diff line number Diff line
,22,23
sentencebleu,"(0.6972372255859451, 0.0)","(0.6045151292382057, 5.284142602012019e-198)"
bleurt,"(0.6450140638927215, 0.0)","(0.2629819627133834, 9.680483369594377e-33)"
bertscore,"(0.3978383262147786, 6.312169999256138e-130)","(0.12834061837900057, 9.654011877530597e-09)"
llama,"(0.4557267366926947, 7.816766094103858e-175)","(0.22368324745366885, 6.430425293212174e-24)"
corpusbleu,"(0.500830566569911, 0.2522549253401538)","(0.9752122826139026, 0.02478771738609753)"

data.py

0 → 100644
+0 −0

File added.

Preview size limit exceeded, changes collapsed.

max_diff_instances.txt

0 → 100644
+0 −0

File added.

Preview size limit exceeded, changes collapsed.