Loading slurm_scripts/run_grid_search_add_back_in.sh 0 → 100644 +147 −0 Original line number Diff line number Diff line #!/usr/bin/env bash #SBATCH --job-name=grid_search #SBATCH --partition=dev_gpu_4 #SBATCH --mem=32G #SBATCH --cpus-per-task=64 #SBATCH --gres=gpu:1 #SBATCH --time=30:00 #SBATCH --output=gridsearch_study_lower_ressources_4.out #SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de #SBATCH --mail-type=ALL #SBATCH --ntasks=1 #source ~/.bashrc cd $HOME/diachronic-llms-automatic-dictionary-induction # Load required modules module load devel/cuda/12.4 # Set library path export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Fixed dimension DIM=300 # Grid MIN_COUNTS=(2 5 10) MIN_NS=(2 3) MAX_NS=(2 5 10) # Corpora paths SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt" NOB_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/nob_corpora/clean_corpus_nob.txt" # Dictionaries SEED_DICT=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt TEST_DICT_SINGLE=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt TEST_DICT_MULTI=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt # Output directories - not saved to data/embeddings since those will likely by throwaway embeddings EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/embeddings" ALIGN_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/aligned" MODEL_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/classifier" RESULTS_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/translation_candidates" # Train base nob model srun fastText/fasttext skipgram \ -input "$NOB_CORPUS" \ -output "${EMB_DIR}/nob_base_model" \ -dim "$DIM" \ -minCount "$mc" \ -minn "$mn" \ -maxn "$mx" \ 
-thread 64 echo "Starting n-gram based FastText grid search with ClassyMap – single dimension = $DIM" for mc in "${MIN_COUNTS[@]}"; do for mn in "${MIN_NS[@]}"; do for mx in "${MAX_NS[@]}"; do echo "---------------------------------------------" echo "dim=$DIM | minCount=$mc | minn=$mn | maxn=$mx" echo "---------------------------------------------" #conda activate diac_lm srun fastText/fasttext skipgram \ -input "$SME_CORPUS" \ -output "${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \ -dim "$DIM" \ -minCount "$mc" \ -minn "$mn" \ -maxn "$mx" \ -thread 64 SME_EMB="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" NOB_EMB="${EMB_DIR}/nob_base_model" SME_EMB_BIN="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.bin" # NOB_EMB_BIN="${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.bin" SME_LOW_FREQ="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/sme_low_freq_words${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" # NOB_LOW_FREQ="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/nob_low_freq_words${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" tr -cs '[:alpha:]' '\n' < "$SME_CORPUS" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | awk '$1 >= 3 {print $2}' > "$SME_LOW_FREQ" # tr -cs '[:alpha:]' '\n' < "$NOB_CORPUS" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | awk '$1 >= 3 {print $2}' > "$NOB_LOW_FREQ" TMP_SME_EMB="${SME_EMB}.tmp" python $HOME/diachronic-llms-automatic-dictionary-induction/src/add_n_grams.py --model_path $SME_EMB_BIN --embedding_in $SME_EMB --embedding_out $TMP_SME_EMB --phrases "$SME_LOW_FREQ" mv $TMP_SME_EMB $SME_EMB # TMP_NOB_EMB="${NOB_EMB}.tmp" # python $HOME/diachronic-llms-automatic-dictionary-induction/src/add_n_grams.py --model_path $SME_EMB_BIN --embedding_in $SME_EMB --embedding_out $TMP_NOB_EMB --phrases "$NOB_LOW_FREQ" # mv $TMP_NOB_EMB $SME_EMB # ClassyMap training srun python3 ClassyMap/src/classymap.py \ --train_dict "$SEED_DICT" \ --in_src "$SME_EMB" \ --in_tar "$NOB_EMB" 
\ --src_lid sme \ --tar_lid nob \ --idstring "EXP_dim${dim}" \ --out_src "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ --out_tar "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ --model_filename "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl" \ --use_mnns_pooler 1 # ClassyMap evaluation (Single Dict) srun python3 ClassyMap/src/eval.py \ "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ --dictionary "$TEST_DICT_SINGLE" \ --output_file "${RESULTS_DIR}/single_candidates_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" \ --src_lid sme \ --tar_lid nob \ --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \ --retrieval csls \ --seed 42 \ --cuda \ --super \ --model "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl" # ClassyMap evaluation (Multi Dict) srun python3 ClassyMap/src/eval.py \ "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ --dictionary "$TEST_DICT_MULTI" \ --output_file "${RESULTS_DIR}/multi_candidates_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" \ --src_lid sme \ --tar_lid nob \ --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \ --retrieval csls \ --seed 42 \ --cuda \ --super \ --model "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl" echo "Finished: dim=$DIM, mc=$mc, minn=$mn, maxn=$mx" echo done done done # Konfuzius sagt: # "Wer viele N-Grams sät, wird viele Token ernten." src/add_n_grams.py +20 −12 Original line number Diff line number Diff line Loading @@ -4,13 +4,13 @@ import numpy as np import os from tqdm import tqdm import re import argparse def emb_separator_to_tab(in_file:str, out_file:str): """ Converts the separator between words and embedding vectors in a Word2Vec embedding file to \t. 
Args: in_file: Path to the Word2Vec embedding file to modify. Since the default embedding format uses \t to separate words, and space to separate phrases and vectors, this is also swapped. """ Loading Loading @@ -39,11 +39,9 @@ def emb_separator_to_tab(in_file:str, out_file:str): # Replace the original file with the modified file print("Successfully changed separator from space to tab. ") def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embedding_path_out:str): """ Creates embeddings for (potentially multi-word) phrases and adds those to a new vocabulary file. Args: phrases: A list of strings to be added to the file containing word(/phrase) - embedding pairs. model_path: The path to a fastText model. Should be a .bin model, since this also contains subwords. Loading @@ -58,13 +56,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe with open(embedding_path_out, 'r') as fr: # Test if file format is correct header = fr.readline() sep = '\t' assert len(header.split()) == 2, "Please use a file with a header consisting of <vocab size> <embedding dim> as input" line = fr.readline() assert len(line.split('\t')) > 1, "Found a non-tab-separated-file, please use the emb_separator_to_tab() function first" if len(line.split(sep)) == 1: print("Found a non-tab-separated-file, please use the emb_separator_to_tab() function first if you want to work with multi-word phrases") sep = None # Add words to vocab fr.seek(0) for line in tqdm(fr, desc="Creating vocab: "): vocab.add(line.split('\t')[0]) if sep is not None: vocab.add(line.split(sep)[0]) else: vocab.add(line.split()[0]) # Add new phrases to file c_new = 0 Loading @@ -91,15 +95,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe print(f"Newly added {c_new} of {len(phrases)} phrases to file {embedding_path_in}, wrote into {embedding_path_out}") if __name__ == "__main__": # Read nob ordbok (monolingual dictionary) file with 
open("data/ordbank_nob/lemma.txt", 'r', encoding='ISO-8859-1') as fr: parser = argparse.ArgumentParser(description="Process embeddings and add phrases.") parser.add_argument("--model_path", type=str, required=True, help="Path to fastText model (.bin file)") parser.add_argument("--embedding_in", type=str, required=True, help="Path to input embedding file (.vec)") parser.add_argument("--embedding_out", type=str, required=True, help="Path to output embedding file") parser.add_argument("--phrases", type=str, required=False, default="data/ordbank_nob/lemma.txt", help="Path to phrases file") args = parser.parse_args() with open(args.phrases, 'r', encoding='ISO-8859-1') as fr: lemmata = [] fr.readline() # Ignore header for line in fr: lemmata.append(line.split('\t')[2]) lemmata.append(line.strip()) print(lemmata[:20]) emb_separator_to_tab("embeddings/model_nob.vec", "embeddings/model_nob_tab_sep.vec") add_phrases_to_voc(lemmata, "embeddings/model_nob.bin", "embeddings/model_nob_tab_sep.vec", "embeddings/model_nob_added_ordbank.vec") #emb_separator_to_tab(args.embedding_in, args.embedding_out) add_phrases_to_voc(lemmata, args.model_path, args.embedding_in, args.embedding_out) Loading
#!/usr/bin/env bash
#SBATCH --job-name=grid_search
#SBATCH --partition=dev_gpu_4
#SBATCH --mem=32G
#SBATCH --cpus-per-task=64
#SBATCH --gres=gpu:1
#SBATCH --time=30:00
#SBATCH --output=gridsearch_study_lower_ressources_4.out
#SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

# Grid search over fastText subword hyperparameters (minCount / minn / maxn)
# for the sme embeddings.  Each candidate is aligned against one fixed nob
# base model with ClassyMap and evaluated on the single- and multi-word dev
# dictionaries.  The embedding dimension is held constant at $DIM.

#source ~/.bashrc
cd "$HOME/diachronic-llms-automatic-dictionary-induction" || exit 1

# Load required modules (before strict mode: module shell functions may
# reference unset variables)
module load devel/cuda/12.4

set -euo pipefail

# Set library path (the :- guard keeps set -u happy when LD_LIBRARY_PATH
# is unset in the batch environment)
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}

REPO="$HOME/diachronic-llms-automatic-dictionary-induction"

# Fixed dimension
DIM=300

# Grid
MIN_COUNTS=(2 5 10)
MIN_NS=(2 3)
MAX_NS=(2 5 10)
# NOTE(review): the grid contains minn=3 / maxn=2, for which fastText extracts
# no character n-grams at all — confirm that combination is intended.

# Fixed hyperparameters for the nob base model.  BUGFIX: the original script
# expanded $mc/$mn/$mx here, but those loop variables are only defined further
# down, so the flags were passed empty values.  fastText's documented defaults
# (minCount 5, minn 3, maxn 6) are used explicitly instead.
BASE_MIN_COUNT=5
BASE_MINN=3
BASE_MAXN=6

# Corpora paths
SME_CORPUS="$REPO/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="$REPO/data/nob_corpora/clean_corpus_nob.txt"

# Dictionaries
SEED_DICT="$REPO/data/clean_dict_sme_nob/clean_single_seed_dict.txt"
TEST_DICT_SINGLE="$REPO/data/clean_dict_sme_nob/clean_single_dev_dict.txt"
TEST_DICT_MULTI="$REPO/data/clean_dict_sme_nob/clean_multi_dev_dict.txt"

# Output directories - not saved to data/embeddings since those will likely be
# throwaway embeddings
EMB_DIR="$REPO/slurm_scripts/ngram_study/embeddings"
ALIGN_DIR="$REPO/slurm_scripts/ngram_study/aligned"
MODEL_DIR="$REPO/slurm_scripts/ngram_study/classifier"
RESULTS_DIR="$REPO/slurm_scripts/ngram_study/translation_candidates"
mkdir -p "$EMB_DIR" "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR"

# Train base nob model once; it is shared by every grid point.
srun fastText/fasttext skipgram \
    -input "$NOB_CORPUS" \
    -output "${EMB_DIR}/nob_base_model" \
    -dim "$DIM" \
    -minCount "$BASE_MIN_COUNT" \
    -minn "$BASE_MINN" \
    -maxn "$BASE_MAXN" \
    -thread 64

# fastText writes <output>.vec and <output>.bin; ClassyMap reads text vectors,
# so point at the .vec file.  BUGFIX: the original passed the extension-less
# path as --in_tar, which does not exist on disk.
NOB_EMB="${EMB_DIR}/nob_base_model.vec"

# Word list handed to add_n_grams.py.  Its content does not depend on
# mc/mn/mx, so build it once instead of re-sorting the corpus per grid point.
# NOTE(review): despite the "low_freq" name, the awk filter keeps words that
# occur >= 3 times — confirm the threshold direction is intended.
SME_LOW_FREQ="$REPO/slurm_scripts/ngram_study/sme_low_freq_words_dim${DIM}.txt"
tr -cs '[:alpha:]' '\n' < "$SME_CORPUS" \
    | tr '[:upper:]' '[:lower:]' \
    | sort \
    | uniq -c \
    | awk '$1 >= 3 {print $2}' > "$SME_LOW_FREQ"
# NOTE(review): the original also carried a commented-out nob analogue of this
# step whose `mv` target overwrote $SME_EMB instead of the nob embeddings;
# fix that before re-enabling it.

echo "Starting n-gram based FastText grid search with ClassyMap – single dimension = $DIM"

for mc in "${MIN_COUNTS[@]}"; do
  for mn in "${MIN_NS[@]}"; do
    for mx in "${MAX_NS[@]}"; do
      echo "---------------------------------------------"
      echo "dim=$DIM | minCount=$mc | minn=$mn | maxn=$mx"
      echo "---------------------------------------------"
      #conda activate diac_lm

      # One tag string per grid point; matches the original file-name scheme.
      TAG="dim${DIM}_mc${mc}_minn${mn}_maxn${mx}"

      # Train the sme embeddings for this grid point.
      srun fastText/fasttext skipgram \
          -input "$SME_CORPUS" \
          -output "${EMB_DIR}/sme_${TAG}" \
          -dim "$DIM" \
          -minCount "$mc" \
          -minn "$mn" \
          -maxn "$mx" \
          -thread 64

      SME_EMB="${EMB_DIR}/sme_${TAG}.vec"
      SME_EMB_BIN="${EMB_DIR}/sme_${TAG}.bin"

      # Back-fill vectors for the collected words using the .bin model's
      # subword information, then replace the .vec file in place.
      TMP_SME_EMB="${SME_EMB}.tmp"
      python "$REPO/src/add_n_grams.py" \
          --model_path "$SME_EMB_BIN" \
          --embedding_in "$SME_EMB" \
          --embedding_out "$TMP_SME_EMB" \
          --phrases "$SME_LOW_FREQ"
      mv -- "$TMP_SME_EMB" "$SME_EMB"

      # ClassyMap training.  BUGFIX: --idstring previously interpolated the
      # undefined lowercase $dim, tagging every run as plain "EXP_dim"; use
      # the full grid tag, consistent with the evaluation calls below.
      srun python3 ClassyMap/src/classymap.py \
          --train_dict "$SEED_DICT" \
          --in_src "$SME_EMB" \
          --in_tar "$NOB_EMB" \
          --src_lid sme \
          --tar_lid nob \
          --idstring "EXP_${TAG}" \
          --out_src "${ALIGN_DIR}/aligned_sme_${TAG}.vec" \
          --out_tar "${ALIGN_DIR}/aligned_nob_${TAG}.vec" \
          --model_filename "${MODEL_DIR}/classifier_model_${TAG}.pkl" \
          --use_mnns_pooler 1

      # ClassyMap evaluation: one run per dev dictionary (single / multi).
      for DICT_KIND in single multi; do
        if [[ "$DICT_KIND" == single ]]; then
          DICT="$TEST_DICT_SINGLE"
        else
          DICT="$TEST_DICT_MULTI"
        fi
        srun python3 ClassyMap/src/eval.py \
            "${ALIGN_DIR}/aligned_sme_${TAG}.vec" \
            "${ALIGN_DIR}/aligned_nob_${TAG}.vec" \
            --dictionary "$DICT" \
            --output_file "${RESULTS_DIR}/${DICT_KIND}_candidates_${TAG}.txt" \
            --src_lid sme \
            --tar_lid nob \
            --idstring "EXP_${TAG}" \
            --retrieval csls \
            --seed 42 \
            --cuda \
            --super \
            --model "${MODEL_DIR}/classifier_model_${TAG}.pkl"
      done

      echo "Finished: dim=$DIM, mc=$mc, minn=$mn, maxn=$mx"
      echo
    done
  done
done

# Confucius says:
# "He who sows many n-grams will reap many tokens."
src/add_n_grams.py +20 −12 Original line number Diff line number Diff line Loading @@ -4,13 +4,13 @@ import numpy as np import os from tqdm import tqdm import re import argparse def emb_separator_to_tab(in_file:str, out_file:str): """ Converts the separator between words and embedding vectors in a Word2Vec embedding file to \t. Args: in_file: Path to the Word2Vec embedding file to modify. Since the default embedding format uses \t to separate words, and space to separate phrases and vectors, this is also swapped. """ Loading Loading @@ -39,11 +39,9 @@ def emb_separator_to_tab(in_file:str, out_file:str): # Replace the original file with the modified file print("Successfully changed separator from space to tab. ") def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embedding_path_out:str): """ Creates embeddings for (potentially multi-word) phrases and adds those to a new vocabulary file. Args: phrases: A list of strings to be added to the file containing word(/phrase) - embedding pairs. model_path: The path to a fastText model. Should be a .bin model, since this also contains subwords. 
Loading @@ -58,13 +56,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe with open(embedding_path_out, 'r') as fr: # Test if file format is correct header = fr.readline() sep = '\t' assert len(header.split()) == 2, "Please use a file with a header consisting of <vocab size> <embedding dim> as input" line = fr.readline() assert len(line.split('\t')) > 1, "Found a non-tab-separated-file, please use the emb_separator_to_tab() function first" if len(line.split(sep)) == 1: print("Found a non-tab-separated-file, please use the emb_separator_to_tab() function first if you want to work with multi-word phrases") sep = None # Add words to vocab fr.seek(0) for line in tqdm(fr, desc="Creating vocab: "): vocab.add(line.split('\t')[0]) if sep is not None: vocab.add(line.split(sep)[0]) else: vocab.add(line.split()[0]) # Add new phrases to file c_new = 0 Loading @@ -91,15 +95,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe print(f"Newly added {c_new} of {len(phrases)} phrases to file {embedding_path_in}, wrote into {embedding_path_out}") if __name__ == "__main__": # Read nob ordbok (monolingual dictionary) file with open("data/ordbank_nob/lemma.txt", 'r', encoding='ISO-8859-1') as fr: parser = argparse.ArgumentParser(description="Process embeddings and add phrases.") parser.add_argument("--model_path", type=str, required=True, help="Path to fastText model (.bin file)") parser.add_argument("--embedding_in", type=str, required=True, help="Path to input embedding file (.vec)") parser.add_argument("--embedding_out", type=str, required=True, help="Path to output embedding file") parser.add_argument("--phrases", type=str, required=False, default="data/ordbank_nob/lemma.txt", help="Path to phrases file") args = parser.parse_args() with open(args.phrases, 'r', encoding='ISO-8859-1') as fr: lemmata = [] fr.readline() # Ignore header for line in fr: lemmata.append(line.split('\t')[2]) lemmata.append(line.strip()) 
print(lemmata[:20]) emb_separator_to_tab("embeddings/model_nob.vec", "embeddings/model_nob_tab_sep.vec") add_phrases_to_voc(lemmata, "embeddings/model_nob.bin", "embeddings/model_nob_tab_sep.vec", "embeddings/model_nob_added_ordbank.vec") #emb_separator_to_tab(args.embedding_in, args.embedding_out) add_phrases_to_voc(lemmata, args.model_path, args.embedding_in, args.embedding_out)