Commit 119d02c5 authored by schnurr's avatar schnurr
Browse files

Use random strings

parent f4e0d08d
Loading
Loading
Loading
Loading
+11 −5
Original line number Diff line number Diff line
from STSB_Dataset import STSB_Dataset
import pandas as pd
import spacy

import string
import random

# Load the STS benchmark CSV through the project's STSB_Dataset wrapper.
# The masking functions in this file overwrite cells of this module-level
# frame in place via positional columns 0 and 1.
# NOTE(review): columns 0 and 1 presumably hold the two sentences of each
# pair -- confirm against STSB_Dataset.
df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe

@@ -54,6 +55,11 @@ for sent2 in sentences2_nlp:
    dependency_hierarchy2.append(dep_hierarchy2)


def get_random_string(length):
    """Return a random string of lowercase ASCII letters.

    Args:
        length: Number of characters to generate. Zero or a negative
            value yields an empty string.

    Returns:
        A ``str`` of ``length`` characters, each drawn independently from
        ``string.ascii_lowercase``.
    """
    letters = string.ascii_lowercase
    # ``random`` is a deterministic PRNG: fine for generating filler
    # tokens, not for anything security-sensitive.
    return ''.join(random.choice(letters) for _ in range(length))


def mask_token(pos_tag):
    
@@ -65,7 +71,7 @@ def mask_token(pos_tag):
            for i in range(len(layer)):
                if layer[i].pos_ == pos_tag:
                    mask_token = layer[i].text
                    df.iloc[e, 0] = df.iloc[e, 0].replace(mask_token, "[MASK]")
                    df.iloc[e, 0] = df.iloc[e, 0].replace(mask_token, get_random_string(random.randrange(1, 9)))
                    break

    for e in range(len(dependency_hierarchy2)):
@@ -76,7 +82,7 @@ def mask_token(pos_tag):
            for i in range(len(layer)):
                if layer[i].pos_ == pos_tag:
                    mask_token = layer[i].text
                    df.iloc[e, 1] = df.iloc[e, 1].replace(mask_token, "[MASK]")
                    df.iloc[e, 1] = df.iloc[e, 1].replace(mask_token, get_random_string(random.randrange(1, 9)))
                    break
    return df
            
@@ -94,7 +100,7 @@ def mask_first(pos_tag):

        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 0] = df.iloc[i, 0].replace(pos_tag_tokens[e], "[MASK]")
            df.iloc[i, 0] = df.iloc[i, 0].replace(pos_tag_tokens[e], get_random_string(random.randrange(1, 9)))
        else:
            if i not in del_rows:
                del_rows.append(i)
@@ -107,7 +113,7 @@ def mask_first(pos_tag):

        if pos_tag in pos_tags_sent:
            e = pos_tags_sent.index(pos_tag)
            df.iloc[i, 1] = df.iloc[i, 1].replace(pos_tag_tokens[e], "[MASK]")
            df.iloc[i, 1] = df.iloc[i, 1].replace(pos_tag_tokens[e], random.randrange(1, 9))
        else:
            if i not in del_rows:
                del_rows.append(i)