Commit f3d18979 authored by rueh's avatar rueh
Browse files

Initial commit

parents
Loading
Loading
Loading
Loading

analysis.py

0 → 100644
+306 −0
Original line number Diff line number Diff line
import random
import pandas as pd
import numpy as np
import math
from scipy.stats import pearsonr

def get_scores(year, systems, type):
    """Read scores of one metric (or human) for every system.

    Parameters:
    year (int):  WMT year (22 or 23)
    systems (list of str): names of systems
    type (str): name of evaluation metric (automatic or human)

    Returns:
    scores (dict): list of type's scores per system

   """
    df = read_scores_data(year)
    # Score columns follow the "<system>-<type>" naming convention.
    return {system: df[system + "-" + type].tolist() for system in systems}

def get_allhyps(year, systems):
    """Read MT output translations (hyps) for all systems.

    Parameters:
    year (int):  WMT year (22 or 23)
    systems (list of str): names of systems

    Returns:
    hyps (dict): list of MT outputs per system

   """
    df = read_output_data(year)
    hyps = {}
    for system in systems:
        cleaned = []
        # Empty hypotheses come back from the csv as NaN floats; map them
        # to empty strings so downstream metrics receive valid text.
        for value in df[system].tolist():
            if isinstance(value, float) and math.isnan(value):
                cleaned.append("")
            else:
                cleaned.append(value)
        hyps[system] = cleaned
    return hyps

def get_allrefs(year):
    """Read all reference translations for one WMT year.

    Parameters:
    year (int):  WMT year (22 or 23)

    Returns:
    (list of str): reference translations

   """
    # References live in the "ref" column of the outputs file.
    return read_output_data(year)["ref"].tolist()

def read_output_data(year):
    """Read csv file with MT outputs and references.

    Parameters:
    year (int):  WMT year (22 or 23)

    Returns:
    (DataFrame): df with MT outputs and references

    Raises:
    ValueError: if year is not 22 or 23

   """
    if year == 22:
        path = "wmt22_outputs_meanscores.csv"
    elif year == 23:
        path = "wmt23_outputs_meanscores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    return pd.read_csv(path)

def read_scores_data(year):
    """Read csv file with automatic and human scores.

    Parameters:
    year (int):  WMT year (22 or 23)

    Returns:
    (DataFrame): df with automatic and human scores

    Raises:
    ValueError: if year is not 22 or 23

   """
    if year == 22:
        path = "wmt22_all_scores.csv"
    elif year == 23:
        path = "wmt23_all_scores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    return pd.read_csv(path)

def compute_corr(metric, human, automatic):
    """Compute Pearson correlation between one metric and human scores.

    Parameters:
    metric (str): name of the metric
    human (dict): list of human scores per system
    automatic (dict): list of metric scores per system

    Returns:
    r (float): Pearson correlation coefficient
    p (float): p-value
   """
    if metric == "corpusbleu":
        # corpusBLEU is a corpus-level score, so correlate the mean human
        # score of each system against that system's single corpus score
        # (stored at position 0 of its score list).
        means = [sum(scores) / len(scores) for scores in human.values()]
        corpus_scores = [automatic[system][0] for system in human]
        r, p = pearsonr(np.array(means), np.array(corpus_scores))
    else:
        # Segment-level metrics: concatenate the per-segment scores of
        # every system and correlate the two flat lists.
        flat_human = [score for system in human for score in human[system]]
        flat_auto = [score for system in human for score in automatic[system]]
        r, p = pearsonr(np.array(flat_human), np.array(flat_auto))
    return (r, p)

def max_min_diffs(metric, year, systems):
    """Find examples with the largest and smallest score differences
    between one metric and human judgements and append them to text files.

    Parameters:
    metric (str): name of the metric
    year (int):  WMT year (22 or 23)
    systems (list of str): names of systems

    Returns:
    None
   """
    # Rescale the metric to 0-100 first so differences are comparable
    # with the human scores.
    scores = standardize_scores(read_scores_data(year), systems, metric)
    outputs = read_output_data(year)

    max_diffs, min_diffs = find_max_min(scores, metric, systems)

    for diffs, path in ((max_diffs, "max_diff_instances.txt"),
                        (min_diffs, "min_diff_instances.txt")):
        write_diff_outputs(diffs, outputs, path, year, metric)

    return


def standardize_scores(df, systems, metric):
    """Rescale scores of one metric to the 0-100 range (if necessary).

    Parameters:
    df (DataFrame): df with all scores
    systems (list of str): names of systems
    metric (str): name of the metric

    Returns:
    df (DataFrame): df with rescaled scores (for one metric)
   """
    # Scale factors for metrics that are not already on a 0-100 scale:
    # bleurt/bertscore produce values around [0, 1], llama scores 0-6.
    if metric in ("bleurt", "bertscore"):
        factor = 100
    elif metric == "llama":
        factor = 100 / 6
    else:
        return df

    for system in systems:
        column = system + "-" + metric
        df[column] = df[column] * factor
    return df

def find_max_min(df_scores, metric, systems):
    """Compute absolute score differences between one metric and human and
    select the highest/lowest difference instances (15 or more, incl. ties).

    Parameters:
    df_scores (DataFrame): df with all scores
    metric (str): name of the metric
    systems (list of str): names of systems

    Returns:
    max_diffs (DataFrame): df with high difference examples (id, system, scores, score difference)
    min_diffs (DataFrame): df with low difference examples (id, system, scores, score difference)
   """
    frames = []

    # Stack the rows of every system into one long frame so differences
    # can be compared across systems.
    for system in systems:
        metric_col = df_scores[system + "-" + metric]
        human_col = df_scores[system + "-human"]
        frames.append(pd.DataFrame({
            "docid": df_scores["docid"],
            "segid": df_scores["segid"],
            "system": [system] * len(df_scores),
            metric: metric_col,
            "human": human_col,
            "diff": (metric_col - human_col).abs(),
        }))

    combined = pd.concat(frames, ignore_index=True)

    # keep="all" may yield more than 15 rows when values are tied.
    max_diffs = combined.nlargest(15, "diff", keep="all")
    min_diffs = combined.nsmallest(15, "diff", keep="all")
    return max_diffs, min_diffs

def write_diff_outputs(diffs, df_outputs, path, year, metric):
    """Append low/high difference examples to a txt file (with id, system,
    scores, difference, reference and candidate translation text).

    Parameters:
    diffs (DataFrame): df with low/high difference examples (id, system, scores, score difference)
    path (str): file path of txt output file
    year (int):  WMT year (22 or 23)
    df_outputs (DataFrame): df with MT outputs and references
    metric (str): name of the metric

    Returns:
    None
   """
    with open(path, 'a') as file:
        file.writelines([
            "############################################\n",
            "YEAR " + str(year) + " " + metric + "\n",
            "############################################\n",
        ])

        # Report at most 20 instances; positions are sampled without
        # replacement when there are more.
        if len(diffs) > 20:
            selected = set(random.sample(range(len(diffs)), 20))
        else:
            selected = set(range(0, len(diffs)))

        for position, (_, row) in enumerate(diffs.iterrows()):
            if position not in selected:
                continue

            docid = row["docid"]
            segid = row["segid"]
            system = row["system"]

            # Look up reference and candidate translation for this segment.
            mask = (df_outputs["docid"] == docid) & (df_outputs["segid"] == segid)
            if mask.any():
                ref = df_outputs.loc[mask, "ref"].values[0]
                hyp = df_outputs.loc[mask, system].values[0]
            else:
                ref = "No reference found"
                hyp = "No hypothesis found"

            file.writelines([
                docid + " " + str(segid) + ", system: " + system
                + ", human score: " + str(row["human"])
                + ", " + metric + " score: " + str(row[metric])
                + ", difference: " + str(row["diff"]) + "\n",
                "Ref: " + str(ref) + "\n",
                "Hyp: " + str(hyp) + "\n",
                "----------------------------------------------------------------\n",
            ])



if __name__ == "__main__":

    # corpusbleu is appended to the metric list: it is evaluated in the
    # same loop but correlated on the system level inside compute_corr.
    metrics = ["sentencebleu", "bleurt", "bertscore", "llama"]
    metrics.append("corpusbleu")

    years = [22, 23]

    # Fix: the original passed index=metrics.append("corpusbleu"), but
    # list.append returns None, so the frame was built with an empty index
    # and rows were only created implicitly via .loc. Append first, then
    # pass the completed list as the index.
    correlations = pd.DataFrame(columns=years, index=metrics)

    for year in years:
        # Participating systems differ between the two WMT editions.
        if year == 22:
            systems = ["dfki-mlt", "msmunich", "slattic", "upc", "baseline", "dfki-slt", "njupt"]
        elif year == 23:
            systems = ["baseline", "casia", "knowcomp", "ttic"]

        human_scores = get_scores(year, systems, "human")

        for metric in metrics:
            metric_scores = get_scores(year, systems, metric)

            correlations.loc[metric, year] = compute_corr(metric, human_scores, metric_scores)
            max_min_diffs(metric, year, systems)

    correlations.to_csv("correlations_metrics_human.csv")


    #try to reproduce original paper results with their z-values

    #human = [-0.24124, -0.28997, -0.31883, -0.12476, -0.33228, -0.16612, -0.32959]
    #bleurt = [0.102, 0.109, 0.830, 0.150, 0.127, 0.740, 0.111]
    #human1 = [2.07467, 2.00780, 0.52048, 0.43730, 0.33916, 0.20652, 0.04112]

    #print(pearsonr(human, bleurt))
    #print(pearsonr(human1, bleurt))
    
 No newline at end of file

automatic_scores.py

0 → 100644
+339 −0
Original line number Diff line number Diff line
import re
from string import Template
import ollama
import pandas as pd
import math
from data import sysdict22, sysdict23
from sacrebleu.metrics import BLEU
import evaluate
import os



#print(df22)
#print(df23)
#print(df22.mean(numeric_only=True))
#print(df23.mean(numeric_only=True))



#print(df22["ref"].tolist())


def get_data():
    """Read references and MT system outputs (hyps) from csv file.

    Parameters:

    Returns:
    ref22 (list of str): 2022 reference translations
    ref23 (list of str): 2023 reference translations
    hyp22 (dict): 2022 MT outputs per system
    hyp23 (dict): 2023 MT outputs per system

   """

    df22 = pd.read_csv("wmt22_outputs_meanscores.csv")
    df23 = pd.read_csv("wmt23_outputs_meanscores.csv")

    ref22 = df22["ref"].tolist()
    ref23 = df23["ref"].tolist()

    hyp22 = get_hyp(df22, sysdict22.keys())
    hyp23 = get_hyp(df23, sysdict23.keys())

    # Initialize csv files for all human and automatic scores with the
    # sentence id columns (first two columns of the outputs file).
    # Fix: each file is checked independently; previously the 2023 file
    # was only created when the 2022 file was also missing.
    for path, df in (("wmt22_all_scores.csv", df22),
                     ("wmt23_all_scores.csv", df23)):
        if not os.path.exists(path):
            df.iloc[:, :2].to_csv(path, index=None)

    return ref22, ref23, hyp22, hyp23


def get_hyp(df, systems):
    """Map empty hyps (NaN) to the empty string.

    Parameters:
    df (DataFrame): df with all hyps and refs
    systems (list of str): names of systems

    Returns:
    hyp (dict): cleaned hyps per system

   """
    hyp = {}
    for system in systems:
        values = []
        # pandas reads missing csv cells as float NaN.
        for value in df[system].tolist():
            if isinstance(value, float) and math.isnan(value):
                values.append("")
            else:
                values.append(value)
        hyp[system] = values
    return hyp

def read_df(year):
    """Read already saved scores from csv file.

    Parameters:
    year (int): WMT year (22 or 23)

    Returns:
    DataFrame: df with previously computed scores per system

    Raises:
    ValueError: if year is not 22 or 23

   """

    if year == 22:
        path = "wmt22_all_scores.csv"
    elif year == 23:
        path = "wmt23_all_scores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    return pd.read_csv(path)

def save_df(year, df):
    """Save score df as csv file.

    Parameters:
    year (int): WMT year (22 or 23)
    df (DataFrame): scores to persist

    Returns:
    None

    Raises:
    ValueError: if year is not 22 or 23

   """

    if year == 22:
        path = "wmt22_all_scores.csv"
    elif year == 23:
        path = "wmt23_all_scores.csv"
    else:
        # Fix: previously an unsupported year crashed with an unrelated
        # UnboundLocalError on `path`; fail with a clear message instead.
        raise ValueError(f"Unsupported WMT year: {year} (expected 22 or 23)")

    df.to_csv(path, index=None)

def bleu_scores(year, refs, hyps):
    """Compute corpus and sentenceBLEU scores and add to score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)
    corpus_metric = BLEU()
    # effective_order=True is the sacrebleu setting for sentence-level BLEU.
    sentence_metric = BLEU(effective_order=True)

    for system, outputs in hyps.items():
        if system == "ref":
            continue

        corpus_result = corpus_metric.corpus_score(hypotheses=outputs, references=[refs])
        # The corpus-level score is a single number; store it in row 0 of
        # an otherwise empty column.
        df[system + "-corpusbleu"] = None
        df.loc[0, system + "-corpusbleu"] = corpus_result.score

        sentence_values = []
        for hypothesis, reference in zip(outputs, refs):
            result = sentence_metric.sentence_score(hypothesis=hypothesis, references=[reference])
            sentence_values.append(result.score)

        df[system + "-sentencebleu"] = sentence_values

        # Persist after each system so partial progress survives interruption.
        save_df(year, df)

def bleurt_scores(year, refs, hyps):
    """Compute BLEURT scores and add to score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)
    # BLEURT-20 checkpoint loaded through the HuggingFace evaluate wrapper.
    scorer = evaluate.load("bleurt", "BLEURT-20", module_type="metric")

    for system, outputs in hyps.items():
        if system == "ref":
            continue
        result = scorer.compute(predictions=outputs, references=refs)
        df[system + "-bleurt"] = result["scores"]
        print(system + " done")

    save_df(year, df)

def bertscores(year, refs, hyps):
    """Compute BERTscore and add to score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)
    scorer = evaluate.load("bertscore")
    for system, outputs in hyps.items():
        if system == "ref":
            continue
        # lang="de": references and hypotheses are German text; the F1
        # component of BERTScore is stored per segment.
        result = scorer.compute(predictions=outputs, references=refs, lang="de")
        df[system + "-bertscore"] = result["f1"]
    save_df(year, df)




def generate_prompt(template, reference, candidate_translation):
    """Fill the prompt template with reference and hypothesis text.

    Parameters:
    template (string.Template): prompt template with placeholders
    reference (str): single reference translation text
    candidate_translation (str): single candidate translation text

    Returns:
    (str): prompt text with reference and hypothesis filled in

   """
    return template.substitute(
        reference=reference,
        candidate_translation=candidate_translation,
    )


def extract_score(response_text):
    """Extract the numeric score from a Llama generated response.

    Parameters:
    response_text (str): generated response text

    Returns:
    (int or None): score between 0 and 6 if found, else None

   """
    # The model is instructed to answer in the format "Score: X" where X
    # is a single digit between 0 and 6.
    found = re.search(r'Score:\s*([0-6])', response_text)
    return int(found.group(1)) if found else None
    
def llama(year, refs, hyps):
    """Generate scores with Llama3 and add to score csv file

    Each hypothesis is scored against its reference on a 0-6 quality
    scale by a locally served Llama3 model (via ollama); extracted scores
    are stored in the "<system>-llama" column of the score csv.

    Parameters:
    year (int):  WMT year (22 or 23)
    refs (list of str): reference translations
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)

    # Direct-assessment style rubric (0-6); the model is told to answer
    # strictly in the form "Score: X" so extract_score can parse it.
    prompt_template = Template(
        "Below you see a German reference sentence and its corresponding candidate translation in German that has been translated from Swiss-German Sign Language."
        "Score the candidate sentence translation with regard to the reference."
        "Assess the translation quality on a discrete scale using the quality levels described as follows:\n"
        "0: Nonsense/No meaning preserved: Nearly all information is lost between the translation and reference. Grammar is irrelevant.\n"
        "1: In between scores 0 and 2\n"
        "2: Some Meaning Preserved: The translation preserves some of the meaning of the references but misses significant parts. The narrative is hard to follow due to fundamental errors. Grammar may be poor.\n"
        "3: In between scores 2 and 4\n"
        "4: Most Meaning Preserved and Few Grammar Mistakes: The translation retains most of the meaning of the reference. It may have some grammar mistakes or minor inconsistencies.\n"
        "5: In between scores 4 and 6\n"
        "6: Perfect Meaning and Grammar: The meaning of the translation is completely consistent with the reference. The grammar is also correct.\n"
        "Please output only a single score in the format 'Score: X' (where 'X' is the number between 0 and 6).\n"
        "Reference: '${reference}'\n"
        "Candidate Translation: '${candidate_translation}'\n"
    )

    for sys in hyps.keys():
        if not sys == "ref":
        #if sys=="msmunich":
            print(sys)
            i=0
            llama_scores = []

            for hyp, ref in zip(hyps[sys], refs):
                i+=1

                prompt = generate_prompt(prompt_template, ref, hyp)
                # temperature 0 and a fixed seed for reproducible generations
                response = ollama.generate(model='llama3', options={'temperature': 0, 'seed': 42}, prompt=prompt)

                print(i, "Full response:", response["response"])
                #print(ref)
                #print(hyp)
                # extract_score may return None when no score is found
                score = extract_score(response["response"])
                llama_scores.append(score)
                print(i, "Extracted score:", score)

            # Checkpoint after every system: scoring is slow, so partial
            # results are persisted as soon as they are complete.
            df[sys+"-llama"] = llama_scores
            save_df(year, df)

def add_human_scores(year, hyps):
    """Add human evaluation scores to the automatic score csv file.

    Parameters:
    year (int):  WMT year (22 or 23)
    hyps (dict): MT outputs per system

    Returns:
    None

   """
    df = read_df(year)

    if year == 22:
        outputs_path = "wmt22_outputs_meanscores.csv"
    elif year == 23:
        outputs_path = "wmt23_outputs_meanscores.csv"

    df_outputs = pd.read_csv(outputs_path)

    # Human judgements live in "<system>-score" columns of the outputs file.
    for system in hyps:
        if system != "ref":
            df[system + "-human"] = df_outputs[system + "-score"]

    save_df(year, df)

if __name__ == "__main__":

  # Load references and hypotheses once, then run every metric for both
  # WMT years. Each scoring function persists its results to the score
  # csv files as it completes, so the steps run independently.
  refs22, refs23, hyps22, hyps23 = get_data()

  bleu_scores(22, refs22, hyps22)
  bleu_scores(23, refs23, hyps23)

  bleurt_scores(23, refs23, hyps23)
  bleurt_scores(22, refs22, hyps22)

  bertscores(22, refs22, hyps22)
  bertscores(23, refs23, hyps23)

  llama(22, refs22, hyps22)
  llama(23, refs23, hyps23)

  add_human_scores(22, hyps22)
  add_human_scores(23, hyps23)


  
  
+6 −0
Original line number Diff line number Diff line
,22,23
sentencebleu,"(0.6972372255859451, 0.0)","(0.6045151292382057, 5.284142602012019e-198)"
bleurt,"(0.6450140638927215, 0.0)","(0.2629819627133834, 9.680483369594377e-33)"
bertscore,"(0.3978383262147786, 6.312169999256138e-130)","(0.12834061837900057, 9.654011877530597e-09)"
llama,"(0.4557267366926947, 7.816766094103858e-175)","(0.22368324745366885, 6.430425293212174e-24)"
corpusbleu,"(0.500830566569911, 0.2522549253401538)","(0.9752122826139026, 0.02478771738609753)"

data.py

0 → 100644
+0 −0

File added.

Preview size limit exceeded, changes collapsed.

max_diff_instances.txt

0 → 100644
+0 −0

File added.

Preview size limit exceeded, changes collapsed.