Commit 1a46e724 authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files
parents f200726e 52bd67ce
Loading
Loading
Loading
Loading
+147 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=grid_search
#SBATCH --partition=dev_gpu_4
#SBATCH --mem=32G
#SBATCH --cpus-per-task=64
#SBATCH --gres=gpu:1
#SBATCH --time=30:00
#SBATCH --output=gridsearch_study_lower_ressources_4.out
#SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

#source ~/.bashrc
cd "$HOME/diachronic-llms-automatic-dictionary-induction"

# Load required modules
module load devel/cuda/12.4

# Set library path (default to empty so 'set -u' does not trip when the
# variable is not already exported by the environment)
export LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}"

# Fixed embedding dimension used by every grid-search run
DIM=300

# Hyper-parameter grid (embedding dimension stays fixed; see DIM above)
MIN_COUNTS=(2 5 10)   # fastText -minCount candidates
MIN_NS=(2 3)          # fastText -minn candidates (shortest char n-gram)
MAX_NS=(2 5 10)       # fastText -maxn candidates (longest char n-gram)

# Corpora paths
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/nob_corpora/clean_corpus_nob.txt"

# Dictionaries (seed dict for alignment; single-/multi-word dev dicts for evaluation)
SEED_DICT="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt"
TEST_DICT_SINGLE="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt"
TEST_DICT_MULTI="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt"

# Output directories - not saved to data/embeddings since those will likely be throwaway embeddings
EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/embeddings"
ALIGN_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/aligned"
MODEL_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/classifier"
RESULTS_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/translation_candidates"

# BUG FIX: these directories were never created anywhere in the script;
# fastText/ClassyMap fail when writing into a missing directory.
mkdir -p "$EMB_DIR" "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR"

# Train the target-side (Norwegian Bokmål) base model ONCE, outside the grid.
# BUG FIX: the original passed $mc/$mn/$mx here, but those loop variables are
# only assigned inside the grid loops further down, so fastText received empty
# -minCount/-minn/-maxn arguments. Use explicit fastText defaults instead
# (minCount=5, minn=3, maxn=6 per the fastText CLI documentation).
NOB_MINCOUNT=5
NOB_MINN=3
NOB_MAXN=6
srun fastText/fasttext skipgram \
         -input "$NOB_CORPUS" \
         -output "${EMB_DIR}/nob_base_model" \
         -dim "$DIM" \
         -minCount "$NOB_MINCOUNT" \
         -minn "$NOB_MINN" \
         -maxn "$NOB_MAXN" \
         -thread 64

echo "Starting n-gram based FastText grid search with ClassyMap – single dimension = $DIM"

# Full sweep: 3 minCount x 2 minn x 3 maxn = 18 grid points.
for mc in "${MIN_COUNTS[@]}"; do
  for mn in "${MIN_NS[@]}"; do
    for mx in "${MAX_NS[@]}"; do

      echo "---------------------------------------------"
      echo "dim=$DIM | minCount=$mc | minn=$mn | maxn=$mx"
      echo "---------------------------------------------"

      # Train the source-side (North Sami) embeddings for this grid point.
      #conda activate diac_lm
      srun fastText/fasttext skipgram \
         -input "$SME_CORPUS" \
         -output "${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
         -dim "$DIM" \
         -minCount "$mc" \
         -minn "$mn" \
         -maxn "$mx" \
         -thread 64

      SME_EMB="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec"
      # NOTE(review): SME_EMB points at a .vec file but NOB_EMB has no
      # extension — confirm whether this should be nob_base_model.vec.
      NOB_EMB="${EMB_DIR}/nob_base_model"
      SME_EMB_BIN="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.bin"
      # NOB_EMB_BIN="${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.bin"

      SME_LOW_FREQ="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/sme_low_freq_words${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt"
      # NOB_LOW_FREQ="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/nob_low_freq_words${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt"

      # Tokenize, lowercase, count word frequencies in the sme corpus.
      # NOTE(review): the awk filter keeps words occurring >= 3 times although
      # the variable is named *_LOW_FREQ — confirm whether '<' was intended.
      tr -cs '[:alpha:]' '\n' < "$SME_CORPUS" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | awk '$1 >= 3 {print $2}' > "$SME_LOW_FREQ"
      # tr -cs '[:alpha:]' '\n' < "$NOB_CORPUS" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | awk '$1 >= 3 {print $2}' > "$NOB_LOW_FREQ"

      # Back-fill embeddings for the selected words via fastText subwords,
      # writing to a temp file first so a crash never clobbers SME_EMB.
      TMP_SME_EMB="${SME_EMB}.tmp"
      python "$HOME/diachronic-llms-automatic-dictionary-induction/src/add_n_grams.py" \
        --model_path "$SME_EMB_BIN" \
        --embedding_in "$SME_EMB" \
        --embedding_out "$TMP_SME_EMB" \
        --phrases "$SME_LOW_FREQ"
      mv -- "$TMP_SME_EMB" "$SME_EMB"

      # TMP_NOB_EMB="${NOB_EMB}.tmp"
      # python $HOME/diachronic-llms-automatic-dictionary-induction/src/add_n_grams.py --model_path $SME_EMB_BIN --embedding_in $SME_EMB --embedding_out $TMP_NOB_EMB --phrases "$NOB_LOW_FREQ"
      # mv $TMP_NOB_EMB $SME_EMB

      # ClassyMap training: aligns the sme/nob spaces on the seed dictionary
      # and saves the trained classifier for the evaluations below.
      # BUG FIX: the idstring used undefined lowercase ${dim}, so every grid
      # point got the identical id "EXP_dim". Use ${DIM} plus the grid
      # parameters, matching the idstrings of the eval calls.
      srun python3 ClassyMap/src/classymap.py \
        --train_dict "$SEED_DICT" \
        --in_src  "$SME_EMB" \
        --in_tar  "$NOB_EMB" \
        --src_lid sme \
        --tar_lid nob \
        --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
        --out_src "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --out_tar "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --model_filename "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl" \
        --use_mnns_pooler 1

      # ClassyMap evaluation (Single Dict)
      srun python3 ClassyMap/src/eval.py \
        "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --dictionary "$TEST_DICT_SINGLE" \
        --output_file "${RESULTS_DIR}/single_candidates_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" \
        --src_lid sme \
        --tar_lid nob \
        --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
        --retrieval csls \
        --seed 42 \
        --cuda \
        --super \
        --model "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl"

      # ClassyMap evaluation (Multi Dict)
      srun python3 ClassyMap/src/eval.py \
        "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --dictionary "$TEST_DICT_MULTI" \
        --output_file "${RESULTS_DIR}/multi_candidates_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" \
        --src_lid sme \
        --tar_lid nob \
        --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
        --retrieval csls \
        --seed 42 \
        --cuda \
        --super \
        --model "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl"

      echo "Finished: dim=$DIM, mc=$mc, minn=$mn, maxn=$mx"
      echo
    done
  done
done

# Confucius says:
# "He who sows many n-grams will reap many tokens."
+20 −12
Original line number Diff line number Diff line
@@ -4,13 +4,13 @@ import numpy as np
import os
from tqdm import tqdm
import re
import argparse

def emb_separator_to_tab(in_file:str, out_file:str): 
    """
    Converts the separator between words and embedding vectors in a Word2Vec embedding file to \t. 
    Args: 
        in_file: Path to the Word2Vec embedding file to modify.

    Since the default embedding format uses \t to separate words, and space to separate phrases and vectors, this is also swapped. 
    """
    
@@ -39,11 +39,9 @@ def emb_separator_to_tab(in_file:str, out_file:str):
    # Replace the original file with the modified file
    print("Successfully changed separator from space to tab. ")


def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embedding_path_out:str): 
    """
    Creates embeddings for (potentially multi-word) phrases and adds those to a new vocabulary file.

    Args: 
        phrases: A list of strings to be added to the file containing word(/phrase) - embedding pairs. 
        model_path: The path to a fastText model. Should be a .bin model, since this also contains subwords. 
@@ -58,13 +56,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe
    with open(embedding_path_out, 'r') as fr: 
        # Test if file format is correct              
        header = fr.readline()
        sep = '\t'
        assert len(header.split()) == 2, "Please use a file with a header consisting of <vocab size> <embedding dim> as input"
        line = fr.readline()
        assert len(line.split('\t')) > 1, "Found a non-tab-separated-file, please use the emb_separator_to_tab() function first"
        if len(line.split(sep)) == 1: 
            print("Found a non-tab-separated-file, please use the emb_separator_to_tab() function first if you want to work with multi-word phrases")
            sep = None
        # Add words to vocab
        fr.seek(0)
        for line in tqdm(fr, desc="Creating vocab: "): 
            vocab.add(line.split('\t')[0])
            if sep is not None: 
                vocab.add(line.split(sep)[0])
            else: 
                vocab.add(line.split()[0])

    # Add new phrases to file
    c_new = 0
@@ -91,15 +95,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe

    print(f"Newly added {c_new} of {len(phrases)} phrases to file {embedding_path_in}, wrote into {embedding_path_out}")


if __name__ == "__main__":
    # Read nob ordbok (monolingual dictionary) file
    with open("data/ordbank_nob/lemma.txt", 'r', encoding='ISO-8859-1') as fr: 
    parser = argparse.ArgumentParser(description="Process embeddings and add phrases.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to fastText model (.bin file)")
    parser.add_argument("--embedding_in", type=str, required=True, help="Path to input embedding file (.vec)")
    parser.add_argument("--embedding_out", type=str, required=True, help="Path to output embedding file")
    parser.add_argument("--phrases", type=str, required=False, default="data/ordbank_nob/lemma.txt", help="Path to phrases file")
    args = parser.parse_args()

    with open(args.phrases, 'r', encoding='ISO-8859-1') as fr: 
        lemmata = []
        fr.readline() # Ignore header
        for line in fr: 
            lemmata.append(line.split('\t')[2])
            lemmata.append(line.strip())
        print(lemmata[:20])

    emb_separator_to_tab("embeddings/model_nob.vec", "embeddings/model_nob_tab_sep.vec") 
    add_phrases_to_voc(lemmata, "embeddings/model_nob.bin", "embeddings/model_nob_tab_sep.vec", "embeddings/model_nob_added_ordbank.vec")
    #emb_separator_to_tab(args.embedding_in, args.embedding_out) 
    add_phrases_to_voc(lemmata, args.model_path, args.embedding_in, args.embedding_out)