Commit 62d983e6 authored by schnurr
Browse files

Add mask_first_token method

parent c3730278
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
@@ -118,3 +118,25 @@ def mask_first(pos_tag):
            continue
        
    return df


def mask_first_token():
    """Return the STS-B test dataframe with each sentence's first token masked.

    Loads ``data/stsbenchmark/sts-test.csv`` through ``STSB_Dataset`` and, for
    every parsed sentence in the module-level ``sentences1_nlp`` and
    ``sentences2_nlp``, replaces the first occurrence of that sentence's first
    token text in the matching dataframe cell with a random string of length
    1-8 produced by ``get_random_string``.

    Rows whose parsed sentence yields no token (or an empty token text) are
    left unchanged.

    Returns:
        The dataframe with the masked sentences.
    """
    df = STSB_Dataset('data/stsbenchmark/sts-test.csv').as_dataframe

    # NOTE(review): column 0 is the cell the original code edited for
    # sentence 1; column 1 is assumed to hold sentence 2 -- confirm against
    # the layout of STSB_Dataset.as_dataframe.
    # Row i of the dataframe is assumed to correspond to the i-th parsed
    # sentence, as in the original positional indexing.
    for column, parsed_sentences in ((0, sentences1_nlp), (1, sentences2_nlp)):
        for row, parsed in enumerate(parsed_sentences):
            first_token = next((token.text for token in parsed), None)
            if not first_token:
                # Nothing to mask for an empty sentence.
                continue
            mask = get_random_string(random.randrange(1, 9))
            # count=1: mask only the first occurrence, not every later
            # repetition (or substring match) of the same text.
            df.iloc[row, column] = df.iloc[row, column].replace(
                first_token, mask, 1
            )

    return df