#!/usr/bin/env bash
#SBATCH --job-name=embedding_types
#SBATCH --partition=gpu_4
#SBATCH --mem=16G
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
#SBATCH --time=3:00:00
#SBATCH --output=embedding_type_study.out
#SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1
#
# slurm_scripts/run_embedding_type_study.sh
#
# Embedding type study: for every (SME, NOB) embedding pair listed in PAIRS,
#   1) map both embedding files with vecmap/map_embeddings.py
#      (--semi_supervised, seeded with SEED_DICT),
#   2) evaluate the mapped embeddings with vecmap/eval_translation.py
#      (CSLS retrieval) against the single-translation dev dictionary,
#   3) evaluate them against the multi-translation dev dictionary.
# The stdout of each evaluation is written to
#   embedding_type_study/record_file_{single,multi}_<SME file name without .vec>.txt
# relative to the directory the job runs in.
#
# Exit status: 0 if every step of every pair succeeded, 1 otherwise.

# shellcheck disable=SC1090  # ~/.bashrc is user-specific and cannot be followed
source ~/.bashrc
# Abort early: without the environment every python call below would fail or
# silently run with the wrong interpreter.
conda activate diac_lm || {
  echo "ERROR: could not activate conda environment diac_lm" >&2
  exit 1
}

# All project paths are derived from a single root so they cannot drift apart.
readonly PROJECT_ROOT="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction"
readonly VECMAP_DIR="$PROJECT_ROOT/vecmap"
readonly EMBEDDINGS_DIR="$PROJECT_ROOT/data/embeddings"
readonly DICT_DIR="$PROJECT_ROOT/data/clean_dict_sme_nob"
readonly OUTPUT_DIR="embedding_type_study"

readonly SEED_DICT="$DICT_DIR/clean_single_seed_dict.txt"
readonly TEST_DICT_SINGLE="$DICT_DIR/clean_single_dev_dict.txt"
readonly TEST_DICT_MULTI="$DICT_DIR/clean_multi_dev_dict.txt"

# Mapped embeddings are written to fixed scratch files that every pair
# overwrites in turn.
readonly SME_MAPPED="$EMBEDDINGS_DIR/sme_mapped_temp"
readonly NOB_MAPPED="$EMBEDDINGS_DIR/nob_mapped_temp"

# Clean NOB embeddings + the matching SME embeddings, as "SME file|NOB file".
# NOTE(review): the GloVe entry pairs an SME "*_vectors.vec" file with a NOB
# "*_vocab.txt" file; that looks like a vocabulary file rather than an
# embedding file -- confirm the intended NOB GloVe vectors file name.
readonly -a PAIRS=(
  "combined_all_fasttext_300_cbow.vec|clean_corpus_nob_fasttext_300_cbow.vec"
  "corpus_sme_fasttext_300_skip_gram.vec|clean_corpus_nob_fasttext_300_skip_gram.vec"
  "corpus_sme_glove_300_vectors.vec|clean_corpus_nob_glove_300_vocab.txt"
  "corpus_sme_word2vec_300.vec|clean_corpus_nob_word2vec_300.vec"
)

#######################################
# Evaluate the currently mapped embeddings against one test dictionary.
# Globals:   VECMAP_DIR, SME_MAPPED, NOB_MAPPED, OUTPUT_DIR (read)
# Arguments: $1 - record label ("single" or "multi")
#            $2 - path to the test dictionary
#            $3 - run name used in the record file name
# Outputs:   evaluation stdout goes to
#            $OUTPUT_DIR/record_file_<label>_<run name>.txt
# Returns:   exit status of the srun evaluation step
#######################################
evaluate_mapping() {
  local label=$1
  local test_dict=$2
  local run_name=$3

  srun python "$VECMAP_DIR/eval_translation.py" \
    "$SME_MAPPED" \
    "$NOB_MAPPED" \
    -d "$test_dict" \
    --cuda --retrieval csls \
    > "$OUTPUT_DIR/record_file_${label}_${run_name}.txt"
}

# The record files are created by shell redirection, which fails if the
# directory does not exist yet.
mkdir -p -- "$OUTPUT_DIR" || {
  echo "ERROR: cannot create output directory $OUTPUT_DIR" >&2
  exit 1
}

failures=0

for pair in "${PAIRS[@]}"; do
  # Split "SME|NOB" with parameter expansion instead of forking echo | cut.
  SME_FILE=${pair%%|*}
  NOB_FILE=${pair#*|}

  echo "-----------------------------------------"
  echo "Mapping: $SME_FILE -> $NOB_FILE"
  echo "-----------------------------------------"

  SME_PATH="$EMBEDDINGS_DIR/${SME_FILE}"
  NOB_PATH="$EMBEDDINGS_DIR/${NOB_FILE}"
  run_name=$(basename "${SME_FILE}" .vec)

  if [[ ! -f "$SME_PATH" || ! -f "$NOB_PATH" ]]; then
    echo "ERROR: missing input embeddings: $SME_PATH or $NOB_PATH" >&2
    failures=$((failures + 1))
    continue
  fi

  # 1) Map embeddings (semi-supervised).
  # If this step fails, the scratch files still hold the previous pair's
  # mapping; evaluating them would record wrong numbers under this pair's
  # name, so skip the evaluations.
  if ! srun python "$VECMAP_DIR/map_embeddings.py" \
    --semi_supervised \
    "$SEED_DICT" \
    "$SME_PATH" \
    "$NOB_PATH" \
    "$SME_MAPPED" \
    "$NOB_MAPPED" \
    --cuda; then
    echo "ERROR: mapping failed for $SME_FILE -> $NOB_FILE; skipping evaluation" >&2
    failures=$((failures + 1))
    continue
  fi

  # 2) Evaluate against the single-translation dev dictionary.
  if ! evaluate_mapping single "$TEST_DICT_SINGLE" "$run_name"; then
    echo "ERROR: single evaluation failed for $SME_FILE -> $NOB_FILE" >&2
    failures=$((failures + 1))
  fi

  # 3) Evaluate against the multi-translation dev dictionary.
  if ! evaluate_mapping multi "$TEST_DICT_MULTI" "$run_name"; then
    echo "ERROR: multi evaluation failed for $SME_FILE -> $NOB_FILE" >&2
    failures=$((failures + 1))
  fi

  echo "Fertig mit $SME_FILE -> $NOB_FILE"
  echo
done

# Confucius says:
# "Whoever seeks the best embeddings must first forget the worst ones."

# Report failure to Slurm so the job is not marked as completed successfully
# when some step did not run.
if (( failures > 0 )); then
  echo "ERROR: $failures step(s) failed" >&2
  exit 1
fi
#!/usr/bin/env bash
#SBATCH --job-name=embedding_types
#SBATCH --partition=gpu_4
#SBATCH --mem=16G
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
#SBATCH --time=3:00:00
#SBATCH --output=embedding_type_study.out
#SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1
#
# slurm_scripts/run_embedding_type_study.sh
#
# Embedding type study: for every (SME, NOB) embedding pair listed in PAIRS,
#   1) map both embedding files with vecmap/map_embeddings.py
#      (--semi_supervised, seeded with SEED_DICT),
#   2) evaluate the mapped embeddings with vecmap/eval_translation.py
#      (CSLS retrieval) against the single-translation dev dictionary,
#   3) evaluate them against the multi-translation dev dictionary.
# The stdout of each evaluation is written to
#   embedding_type_study/record_file_{single,multi}_<SME file name without .vec>.txt
# relative to the directory the job runs in.
#
# Exit status: 0 if every step of every pair succeeded, 1 otherwise.

# shellcheck disable=SC1090  # ~/.bashrc is user-specific and cannot be followed
source ~/.bashrc
# Abort early: without the environment every python call below would fail or
# silently run with the wrong interpreter.
conda activate diac_lm || {
  echo "ERROR: could not activate conda environment diac_lm" >&2
  exit 1
}

# All project paths are derived from a single root so they cannot drift apart.
readonly PROJECT_ROOT="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction"
readonly VECMAP_DIR="$PROJECT_ROOT/vecmap"
readonly EMBEDDINGS_DIR="$PROJECT_ROOT/data/embeddings"
readonly DICT_DIR="$PROJECT_ROOT/data/clean_dict_sme_nob"
readonly OUTPUT_DIR="embedding_type_study"

readonly SEED_DICT="$DICT_DIR/clean_single_seed_dict.txt"
readonly TEST_DICT_SINGLE="$DICT_DIR/clean_single_dev_dict.txt"
readonly TEST_DICT_MULTI="$DICT_DIR/clean_multi_dev_dict.txt"

# Mapped embeddings are written to fixed scratch files that every pair
# overwrites in turn.
readonly SME_MAPPED="$EMBEDDINGS_DIR/sme_mapped_temp"
readonly NOB_MAPPED="$EMBEDDINGS_DIR/nob_mapped_temp"

# Clean NOB embeddings + the matching SME embeddings, as "SME file|NOB file".
# NOTE(review): the GloVe entry pairs an SME "*_vectors.vec" file with a NOB
# "*_vocab.txt" file; that looks like a vocabulary file rather than an
# embedding file -- confirm the intended NOB GloVe vectors file name.
readonly -a PAIRS=(
  "combined_all_fasttext_300_cbow.vec|clean_corpus_nob_fasttext_300_cbow.vec"
  "corpus_sme_fasttext_300_skip_gram.vec|clean_corpus_nob_fasttext_300_skip_gram.vec"
  "corpus_sme_glove_300_vectors.vec|clean_corpus_nob_glove_300_vocab.txt"
  "corpus_sme_word2vec_300.vec|clean_corpus_nob_word2vec_300.vec"
)

#######################################
# Evaluate the currently mapped embeddings against one test dictionary.
# Globals:   VECMAP_DIR, SME_MAPPED, NOB_MAPPED, OUTPUT_DIR (read)
# Arguments: $1 - record label ("single" or "multi")
#            $2 - path to the test dictionary
#            $3 - run name used in the record file name
# Outputs:   evaluation stdout goes to
#            $OUTPUT_DIR/record_file_<label>_<run name>.txt
# Returns:   exit status of the srun evaluation step
#######################################
evaluate_mapping() {
  local label=$1
  local test_dict=$2
  local run_name=$3

  srun python "$VECMAP_DIR/eval_translation.py" \
    "$SME_MAPPED" \
    "$NOB_MAPPED" \
    -d "$test_dict" \
    --cuda --retrieval csls \
    > "$OUTPUT_DIR/record_file_${label}_${run_name}.txt"
}

# The record files are created by shell redirection, which fails if the
# directory does not exist yet.
mkdir -p -- "$OUTPUT_DIR" || {
  echo "ERROR: cannot create output directory $OUTPUT_DIR" >&2
  exit 1
}

failures=0

for pair in "${PAIRS[@]}"; do
  # Split "SME|NOB" with parameter expansion instead of forking echo | cut.
  SME_FILE=${pair%%|*}
  NOB_FILE=${pair#*|}

  echo "-----------------------------------------"
  echo "Mapping: $SME_FILE -> $NOB_FILE"
  echo "-----------------------------------------"

  SME_PATH="$EMBEDDINGS_DIR/${SME_FILE}"
  NOB_PATH="$EMBEDDINGS_DIR/${NOB_FILE}"
  run_name=$(basename "${SME_FILE}" .vec)

  if [[ ! -f "$SME_PATH" || ! -f "$NOB_PATH" ]]; then
    echo "ERROR: missing input embeddings: $SME_PATH or $NOB_PATH" >&2
    failures=$((failures + 1))
    continue
  fi

  # 1) Map embeddings (semi-supervised).
  # If this step fails, the scratch files still hold the previous pair's
  # mapping; evaluating them would record wrong numbers under this pair's
  # name, so skip the evaluations.
  if ! srun python "$VECMAP_DIR/map_embeddings.py" \
    --semi_supervised \
    "$SEED_DICT" \
    "$SME_PATH" \
    "$NOB_PATH" \
    "$SME_MAPPED" \
    "$NOB_MAPPED" \
    --cuda; then
    echo "ERROR: mapping failed for $SME_FILE -> $NOB_FILE; skipping evaluation" >&2
    failures=$((failures + 1))
    continue
  fi

  # 2) Evaluate against the single-translation dev dictionary.
  if ! evaluate_mapping single "$TEST_DICT_SINGLE" "$run_name"; then
    echo "ERROR: single evaluation failed for $SME_FILE -> $NOB_FILE" >&2
    failures=$((failures + 1))
  fi

  # 3) Evaluate against the multi-translation dev dictionary.
  if ! evaluate_mapping multi "$TEST_DICT_MULTI" "$run_name"; then
    echo "ERROR: multi evaluation failed for $SME_FILE -> $NOB_FILE" >&2
    failures=$((failures + 1))
  fi

  echo "Fertig mit $SME_FILE -> $NOB_FILE"
  echo
done

# Confucius says:
# "Whoever seeks the best embeddings must first forget the worst ones."

# Report failure to Slurm so the job is not marked as completed successfully
# when some step did not run.
if (( failures > 0 )); then
  echo "ERROR: $failures step(s) failed" >&2
  exit 1
fi