Commit 6ac79a69 authored by Mayumi Ohta

sample scripts for sentencepiece tokenization
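For context, a minimal end-to-end sketch of how these samples fit together (the script and config file names below are placeholders; they are not visible in this diff):

    # download, clean, and SentencePiece-encode the data, then train:
    bash scripts/prepare_iwslt14_sp.sh
    python3 -m joeynmt train configs/iwslt14_deen_sp_transformer.yaml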

parent 01dec5ef
+83 −0
name: "iwslt14-deen-bpe-transformer"

data:
    src: "de"
    trg: "en"
    train: "test/data/iwslt14_sp/train.sp.32000"
    dev: "test/data/iwslt14_sp/valid.sp.32000"
    test: "test/data/iwslt14_sp/test.sp.32000"
    level: "bpe"
    lowercase: True
    max_sent_length: 62
    src_vocab: "test/data/iwslt14_sp/vocab.txt"
    trg_vocab: "test/data/iwslt14_sp/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0
    postprocess: True
    bpe_type: "sentencepiece"
    sacrebleu:
        remove_whitespace: True
        tokenize: "13a"

training:
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999]
    scheduling: "plateau"
    patience: 5
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0003
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    early_stopping_metric: "eval_metric"
    epochs: 100
    validation_freq: 1000
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "models/iwslt14_deen_sp_transformer"
    overwrite: True
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3, 4]
    keep_last_ckpts: 5

model:
    initializer: "xavier"
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    init_gain: 1.0
    bias_initializer: "zeros"
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.3
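Note: with bpe_type: "sentencepiece" and postprocess: True, joeynmt itself reverses the segmentation before computing BLEU. For reference, the equivalent manual step with the SentencePiece CLI would be this sketch (the hypotheses file name is hypothetical):

    spm_decode --model=test/data/iwslt14_sp/spm.32000.model \
               --input_format=piece < hyps.sp.txt > hyps.txt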
+89 −0
name: "jparacrawl-enja-sp-transformer"

data:
  src: "en"
  trg: "ja"
  train: "test/data/jparacrawl/train.sp.32000"
  dev: "test/data/jparacrawl/dev.sp.32000"
  test: "test/data/iwslt17/test.sp.32000"
  random_train_subset: -1
  level: "bpe"
  lowercase: False
  max_sent_length: 250
  src_voc_min_freq: 1
  src_voc_limit: 32000
  trg_voc_min_freq: 1
  trg_voc_limit: 32000
  src_vocab: "test/data/jparacrawl/vocab.en"
  trg_vocab: "test/data/jparacrawl/vocab.ja"

testing:
  beam_size: 6
  alpha: 1.0
  postprocess: True
  bpe_type: "sentencepiece"
  sacrebleu:
    remove_whitespace: False
    tokenize: "ja-mecab"

training:
  random_seed: 42
  optimizer: "adam"
  normalization: "tokens"
  adam_betas: [0.9, 0.98]
  scheduling: "plateau"
  patience: 5
  decrease_factor: 0.7
  loss: "crossentropy"
  learning_rate: 0.001
  learning_rate_min: 1.0e-09
  learning_rate_warmup: 4000
  clip_grad_norm: 1.0
  weight_decay: 0.0
  label_smoothing: 0.1
  batch_multiplier: 16
  batch_size: 5000
  batch_type: "token"
  early_stopping_metric: "eval_metric"
  epochs: 5
  validation_freq: 1000
  logging_freq: 100
  eval_metric: "bleu"
  model_dir: "models/jparacrawl_enja_sp"
  overwrite: True
  shuffle: True
  use_cuda: True
  max_output_length: 100
  print_valid_sents: [2000, 2001, 2002, 2003, 2004]
  keep_last_ckpts: 8

model:
  initializer: "xavier"
  embed_initializer: "xavier"
  embed_init_gain: 1.0
  init_gain: 1.0
  bias_initializer: "zeros"
  tied_embeddings: False
  tied_softmax: False
  encoder:
    type: "transformer"
    num_layers: 6
    num_heads: 8
    embeddings:
      embedding_dim: 512
      scale: True
      dropout: 0.
    hidden_size: 512
    ff_size: 2048
    dropout: 0.3
  decoder:
    type: "transformer"
    num_layers: 6
    num_heads: 8
    embeddings:
      embedding_dim: 512
      scale: True
      dropout: 0.
    hidden_size: 512
    ff_size: 2048
    dropout: 0.3
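Note: tokenize: "ja-mecab" makes sacreBLEU segment Japanese hypotheses with MeCab (requires the sacrebleu[ja] extra). A sketch of scoring outside joeynmt, assuming the paths below and that `joeynmt test` writes one hypothesis file per data set under --output_path:

    python3 -m joeynmt test configs/jparacrawl_enja_sp.yaml --output_path models/jparacrawl_enja_sp/hyps
    sacrebleu test/data/iwslt17/test.ja --tokenize ja-mecab < models/jparacrawl_enja_sp/hyps.test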
+117 −0
#!/usr/bin/env bash

# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
# Adapted from https://github.com/pytorch/fairseq/blob/master/examples/translation/prepare-iwslt14.sh

git clone https://github.com/moses-smt/mosesdecoder.git

MOSES="$(pwd)/mosesdecoder"

SCRIPTS=${MOSES}/scripts
TOKENIZER=${SCRIPTS}/tokenizer/tokenizer.perl
LC=${SCRIPTS}/tokenizer/lowercase.perl
CLEAN=${SCRIPTS}/training/clean-corpus-n.perl
URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
GZ=de-en.tgz

vocab_size=32000
src=de
tgt=en
lang=de-en
prep="../test/data/iwslt14_sp"
tmp=${prep}/tmp
orig=orig

mkdir -p ${orig} ${tmp} ${prep}

echo "Downloading data from ${URL}..."
cd ${orig}
curl -O "${URL}"

if [ -f ${GZ} ]; then
    echo "Data successfully downloaded."
else
    echo "Data download failed."
    exit 1
fi

tar zxvf ${GZ}
cd ..

echo "pre-processing train data..."
for l in ${src} ${tgt}; do
    f=train.tags.$lang.$l
    tok=train.tags.$lang.tok.$l

    cat ${orig}/${lang}/${f} | \
    grep -v '<url>' | \
    grep -v '<talkid>' | \
    grep -v '<keywords>' | \
    sed -e 's/<title>//g' | \
    sed -e 's/<\/title>//g' | \
    sed -e 's/<description>//g' | \
    sed -e 's/<\/description>//g' | \
    perl ${TOKENIZER} -threads 8 -l $l > ${tmp}/${tok}
    echo ""
done
perl ${CLEAN} -ratio 1.5 ${tmp}/train.tags.${lang}.tok ${src} ${tgt} ${tmp}/train.tags.${lang}.clean 1 80
for l in ${src} ${tgt}; do
    perl ${LC} < ${tmp}/train.tags.${lang}.clean.${l} > ${tmp}/train.tags.${lang}.${l}
done

echo "pre-processing valid/test data..."
for l in ${src} ${tgt}; do
    for o in `ls ${orig}/${lang}/IWSLT14.TED*.${l}.xml`; do
        fname=${o##*/}
        f=${tmp}/${fname%.*}
        echo $o $f
        grep '<seg id' $o | \
            sed -e 's/<seg id="[0-9]*">\s*//g' | \
            sed -e 's/\s*<\/seg>\s*//g' | \
            sed -e "s/\’/\'/g" | \
            perl ${TOKENIZER} -threads 8 -l ${l} | \
            perl ${LC} > ${f}
        echo ""
    done
done

echo "creating train, valid, test..."
for l in ${src} ${tgt}; do
    awk '{if (NR%23 == 0)  print $0; }' ${tmp}/train.tags.de-en.${l} > ${tmp}/valid.${l}
    awk '{if (NR%23 != 0)  print $0; }' ${tmp}/train.tags.de-en.${l} > ${tmp}/train.${l}

    cat ${tmp}/IWSLT14.TED.dev2010.de-en.${l} \
        ${tmp}/IWSLT14.TEDX.dev2012.de-en.${l} \
        ${tmp}/IWSLT14.TED.tst2010.de-en.${l} \
        ${tmp}/IWSLT14.TED.tst2011.de-en.${l} \
        ${tmp}/IWSLT14.TED.tst2012.de-en.${l} \
        > ${tmp}/test.${l}
done

echo "learning * joint * SentencePiece..."
cat "${tmp}/train.${src}" "${tmp}/train.${tgt}" | shuf > ${tmp}/train.tmp
spm_train --input="${tmp}/train.tmp" --model_prefix=spm.${vocab_size} --vocab_size=${vocab_size} \
          --character_coverage=1.0 --hard_vocab_limit=false --model_type=unigram \
          --unk_piece='<unk>' --pad_piece='<pad>' --user_defined_symbols='&apos;,&quot;,&#91;,&#93;,&amp;'
rm "${tmp}/train.tmp"

echo "applying SentencePiece..."
for l in ${src} ${tgt}; do
    for p in train valid test; do
        spm_encode --model=spm.${vocab_size}.model --output_format=piece < "${tmp}/${p}.${l}" > "${prep}/${p}.sp.${vocab_size}.${l}"
    done
done

for l in ${src} ${tgt}; do
    for p in train valid test; do
        mv ${tmp}/${p}.${l} ${prep}/
    done
done

mv "spm.${vocab_size}.model" "${prep}/"
mv "spm.${vocab_size}.vocab" "${prep}/"
rm -rf ${MOSES}
rm -rf ${orig}
rm -rf ${tmp}

echo "done."
 No newline at end of file
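A quick sanity check of the learned segmentation, as a sketch (the training corpus is lowercased, so use a lowercased sentence):

    echo "das ist ein test ." | spm_encode --model=test/data/iwslt14_sp/spm.32000.model --output_format=piece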
+161 −0
#!/usr/bin/env bash

joey_dir="${HOME}/joeynmt"
data_dir="${joey_dir}/test/data"
#data_dir="../test/data"
src="en"
trg="ja"


# Prepare JParaCrawl
echo "Prepare JParaCrawl (Train-Dev Data)"

jparacrawl_dir="${data_dir}/jparacrawl"

if [ ! -d "${jparacrawl_dir}" ]; then
  mkdir ${jparacrawl_dir}
fi
cd ${jparacrawl_dir}

jparacrawl_url="http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/${src}-${trg}.tar.gz"

if [ ! -f "${src}-${trg}.tar.gz" ]; then
  echo -e "\tDownloading training data from ${jparacrawl_url}..."
  wget ${jparacrawl_url}
fi

if [ ! -d "${src}-${trg}" ]; then
  tar xzvf "./${src}-${trg}.tar.gz"
  #rm "./${src}-${trg}.tar.gz"
fi

if [ ! -f "train.en" ]; then
  echo -e "\tPreprocessing training data ..."
  python ${joey_dir}/scripts/preprocess_jparacrawl.py --data_dir="${jparacrawl_dir}" --dev_size=5000 --seed=12345
  wc -l train.* dev.*
fi

# train SentencePiece
model_type="unigram"
vocab_size=32000
character_coverage=1.0
if [ -f "train.en" ] && [ ! -f "train.sp.${vocab_size}.en" ]; then
  echo -e "\tLearning SentencePiece..."
  for l in ${src} ${trg}; do
    if [ ${l} == "ja" ]; then
      character_coverage=0.995
    fi
    spm_train --input="train.${l}" --model_prefix=spm.${l}.${vocab_size} --vocab_size=${vocab_size} \
            --character_coverage=${character_coverage} --hard_vocab_limit=false --model_type=${model_type} \
            --unk_piece='<unk>' --pad_piece='<pad>' --input_sentence_size=1000000 --shuffle_input_sentence=true

    # vocab file
    cut -f1 -d$'\t' spm.${l}.${vocab_size}.vocab > vocab.${l}

    # apply SentencePiece
    echo -e "\tApplying SentencePiece..."

    for p in train dev; do
      spm_encode --model=spm.${l}.${vocab_size}.model --output_format=piece < ${p}.${l} > ${p}.sp.${vocab_size}.${l}
    done
  done
fi

cd ${data_dir}
# Prepare IWSLT17
echo "Prepare IWSLT17 (Test Data)"

iwslt17_dir="${data_dir}/iwslt17"

if [ ! -d "${iwslt17_dir}" ]; then
  mkdir ${iwslt17_dir}
fi
cd ${iwslt17_dir}

iwslt17_url1="https://wit3.fbk.eu/archive/2017-01-ted-test/texts/${trg}/${src}/${trg}-${src}.tgz"
iwslt17_url2="https://wit3.fbk.eu/archive/2017-01-ted-test/texts/${src}/${trg}/${src}-${trg}.tgz"

## ja-en
if [ ! -f "${trg}-${src}.tgz" ]; then
  echo -e "\tDownloading test data from ${iwslt17_url1}..."
  wget ${iwslt17_url1}
fi

if [ ! -d "${trg}-${src}" ]; then
  tar xzvf ./${trg}-${src}.tgz
  #rm ./${trg}-${src}.tgz
fi

## en-ja
if [ ! -f "${src}-${trg}.tgz" ]; then
  echo -e "\tDownloading test data from ${iwslt17_url2}..."
  wget ${iwslt17_url2}
fi

if [ ! -d "${src}-${trg}" ]; then
  tar xzvf ./${src}-${trg}.tgz
  #rm ./${src}-${trg}.tgz
fi

if [ ! -f "test.en" ] && [ ! -f "test.sp.${vocab_size}.en" ]; then
  echo -e "\tPreprocessing test data..."
  for l in ${src} ${trg}; do
    lang="${src}-${trg}"
    if [ ${l} == "ja" ]; then
      lang="${trg}-${src}"
    fi
    for o in `ls ${iwslt17_dir}/${lang}/IWSLT17.TED*.${lang}.${l}.xml`; do
        fname=${o##*/}
        f=${iwslt17_dir}/${lang}/${fname%.*}
        echo $o $f
        grep '<seg id' $o | \
            sed -e 's/<seg id="[0-9]*">\s*//g' | \
            sed -e 's/\s*<\/seg>\s*//g' | \
            sed -e "s/\’/\'/g" > ${f}
        echo ""
    done
    cat ${lang}/IWSLT17.TED*.${lang}.${l} > test.${l}

    # apply SentencePiece
    echo -e "\tApplying SentencePiece..."
    spm_encode --model=${jparacrawl_dir}/spm.${l}.${vocab_size}.model --output_format=piece < test.${l} > test.sp.${vocab_size}.${l}
  done
fi


cd ${data_dir}
# Prepare KFTT
echo "Prepare KFTT (Fine Tuning)"

kftt_dir="${data_dir}/kftt"

if [ ! -d "${kftt_dir}" ]; then
  mkdir ${kftt_dir}
fi
cd ${kftt_dir}

kftt_url=http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz
if [ ! -f "kftt-data-1.0.tar.gz" ]; then
  echo -e "\tDownloading kftt data from ${kftt_url}..."
  wget ${kftt_url}
fi

if [ ! -d "kftt-data-1.0" ]; then
  tar zxvf kftt-data-1.0.tar.gz
  #rm kftt-data-1.0.tar.gz
fi

# apply SentencePiece
if [ ! -f "test.sp.${vocab_size}.en" ]; then
  echo -e "\tApplying SentencePiece..."
  for l in ${src} ${trg}; do
    for p in train dev test tune; do
      spm_encode --model=${jparacrawl_dir}/spm.${l}.${vocab_size}.model --output_format=piece < "kftt-data-1.0/data/orig/kyoto-${p}.${l}" > ${p}.sp.${vocab_size}.${l}
    done
  done
fi


echo "done."

+48 −0
# coding: utf-8
"""
Preprocess JParaCrawl
"""

import os
import argparse
import pandas as pd
import numpy as np
import unicodedata
from collections import OrderedDict


def prepare(data_dir, size, seed=None):
    dtype = OrderedDict({'source': str, 'probability': float, 'en': str, 'ja': str})
    df = pd.read_csv(os.path.join(data_dir, 'en-ja', 'en-ja.bicleaner05.txt'), header=None,
                     names=list(dtype.keys()), sep='\t', encoding='utf8', quoting=3,
                     keep_default_na=False, na_values='', dtype=dtype)
    df = df.drop_duplicates(subset=['en', 'ja'])
    # NOTE: the original filter characters were lost in transmission; assuming the
    # intent is to drop rows containing the Unicode replacement character
    df = df[~df['en'].str.contains('\ufffd') & ~df['ja'].str.contains('\ufffd')]
    df = df[['en', 'ja']].applymap(lambda x: unicodedata.normalize('NFKC', x))
    df = df.dropna(how='any')

    if seed is not None:
        np.random.seed(seed)
    test_index = np.random.choice(df.index, size=size, replace=False)
    train_index = np.setdiff1d(df.index, test_index)
    for lang in ['en', 'ja']:
        for data_set, drop_index in zip(['train', 'dev'], [test_index, train_index]):
            df[lang].drop(index=drop_index, inplace=False).to_csv(os.path.join(data_dir, data_set+'.'+lang),
                          header=False, index=False, sep='\t', encoding='utf8', quoting=3)


def main():
    PATH = os.path.dirname(os.path.abspath(__file__))

    ap = argparse.ArgumentParser("Preprocess JParaCrawl")
    ap.add_argument("--data_dir", type=str, default=os.path.join(PATH, "../test/data/jparacrawl"),
                    help="path to data dir. default: ../test/data/jparacrawl")
    ap.add_argument("--dev_size", type=int, default=5000, help="development set size")
    ap.add_argument("--seed", type=int, default=12345, help="random seed for train-dev-split")
    args = ap.parse_args()

    prepare(args.data_dir, args.dev_size, args.seed)


if __name__ == "__main__":
    main()
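Usage, as invoked by the JParaCrawl shell script above:

    python scripts/preprocess_jparacrawl.py --data_dir=test/data/jparacrawl --dev_size=5000 --seed=12345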