Commit 2ec89e50 authored by schaper
Browse files
parents 3d58bf70 69f81891
Loading
Loading
Loading
Loading
+15 −4
Original line number Diff line number Diff line
from STSB_Dataset import STSB_Dataset
import pandas as pd
import spacy
import string
import random

df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe

@@ -53,8 +55,15 @@ for sent2 in sentences2_nlp:
    dependency_hierarchy2.append(dep_hierarchy2)


def get_random_string(length):
    """Return a string of ``length`` randomly chosen lowercase ASCII letters.

    Each character is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``. A ``length`` of zero or less yields the
    empty string, because ``range`` produces no iterations.
    """
    alphabet = string.ascii_lowercase
    picked = []
    # One draw per output character, in order, from the module-level RNG.
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def mask_token(pos_tag):

    df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe
    
    for e in range(len(dependency_hierarchy1)):
        mask_token = None
        for layer in dependency_hierarchy1[e]:
@@ -63,7 +72,7 @@ def mask_token(pos_tag):
            for i in range(len(layer)):
                if layer[i].pos_ == pos_tag:
                    mask_token = layer[i].text
                    df.iloc[e, 0] = df.iloc[e, 0].replace(mask_token, "[MASK]")
                    df.iloc[e, 0] = df.iloc[e, 0].replace(mask_token, get_random_string(random.randrange(1, 9)))
                    break

    for e in range(len(dependency_hierarchy2)):
@@ -74,13 +83,15 @@ def mask_token(pos_tag):
            for i in range(len(layer)):
                if layer[i].pos_ == pos_tag:
                    mask_token = layer[i].text
                    df.iloc[e, 1] = df.iloc[e, 1].replace(mask_token, "[MASK]")
                    df.iloc[e, 1] = df.iloc[e, 1].replace(mask_token, get_random_string(random.randrange(1, 9)))
                    break
    return df
            

def mask_first(pos_tag):

    df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe

    i = -1
    del_rows = []

@@ -91,7 +102,7 @@ def mask_first(pos_tag):

        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 0] = df.iloc[i, 0].replace(pos_tag_tokens[e], "[MASK]")
            df.iloc[i, 0] = df.iloc[i, 0].replace(pos_tag_tokens[e], get_random_string(random.randrange(1, 9)))
        else:
            if i not in del_rows:
                del_rows.append(i)
@@ -104,7 +115,7 @@ def mask_first(pos_tag):

        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 1] = df.iloc[i, 1].replace(pos_tag_tokens[e], "[MASK]")
            df.iloc[i, 1] = df.iloc[i, 1].replace(pos_tag_tokens[e], random.randrange(1, 9))
        else:
            if i not in del_rows:
                del_rows.append(i)