Loading slurm_scripts/embeddings_grid_search.sh 0 → 100644 +113 −0 Original line number Diff line number Diff line okay, Finn can you have a look whether this looks plausibel? #!/usr/bin/env bash #SBATCH --job-name=grid_dims_classymap #SBATCH --partition=dev_gpu_4_a100 #SBATCH --mem=16G #SBATCH --cpus-per-task=32 #SBATCH --gres=gpu:1 #SBATCH --cpus-per-task=1 #SBATCH --time=30:00 #SBATCH --output=grid_dims_classymap.out #SBATCH --mail-user=hillengass@cl.uni-heidelberg.de #SBATCH --mail-type=ALL #SBATCH --ntasks=1 #source ~/.bashrc #conda activate diac_lm # Load required modules module load devel/cuda/12.4 # Set library path export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH SME_CORPUS="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt" NOB_CORPUS="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction/data/sme_nob_corpora/clean_corpus_nob.txt" SEED_DICT=~/diac_lm_n/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt TEST_DICT_SINGLE=~/diac_lm_n/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt TEST_DICT_MULTI=~/diac_lm_n/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt EMB_DIR="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction/data/embeddings" # Save to subfolder of slurm_scipts, not embeddings since those will probably be throwaway results ALIGN_DIR="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction/slurm_scripts/dim_study/aligned" MODEL_DIR="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction/slurm_scripts/dim_study/classifier" RESULTS_DIR="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction/slurm_scripts/dim_study/translation_candidates" DIMS=(50 100 300 500) for dim in "${DIMS[@]}"; do echo "---------------------------------------------" echo "Generating embeddings for dimension = $dim" echo 
"---------------------------------------------" # Generate FastText embeddings for SME srun fastText/fasttext skipgram \ -input "$SME_CORPUS" \ -output "${EMB_DIR}/sme_dim${dim}_skip_gram" \ -dim "$dim" \ -thread 64 # Generate FastText embeddings for NOB srun fastText/fasttext skipgram \ -input "$NOB_CORPUS" \ -output "${EMB_DIR}/nob_dim${dim}_skip_gram" \ -dim "$dim" \ -thread 64 # Paths to .vec version of embeddings SME_EMB="${EMB_DIR}/sme_dim${dim}_skip_gram.vec" NOB_EMB="${EMB_DIR}/nob_dim${dim}_skip_gram.vec" echo "Mapping embeddings for dimension = $dim using ClassyMap..." # ClassyMap training srun python3 ClassyMap/src/classymap.py \ --train_dict "$SEED_DICT" \ --in_src "$SME_EMB" \ --in_tar "$NOB_EMB" \ --src_lid sme \ --tar_lid nob \ --out_src "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \ --out_tar "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \ --model_filename "${MODEL_DIR}/classifier_model_dim${dim}.pkl" \ --use_mnns_pooler 1 \ --cuda # ClassyMap evaluation srun python3 ClassyMap/src/eval.py \ "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \ "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \ --dictionary "$TEST_DICT_SINGLE" \ --output_file "${RESULTS_DIR}/single_translation_candidates_dim${dim}.txt" \ --src_lid sme \ --tar_lid nob \ --idstring "EXP_dim${dim}" \ --retrieval csls \ --seed 42 \ --cuda \ --super \ --model "${MODEL_DIR}/classifier_model_dim${dim}.pkl" # ClassyMap evaluation srun python3 ClassyMap/src/eval.py \ "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \ "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \ --dictionary "$TEST_DICT_MULTI" \ --output_file "${RESULTS_DIR}/multi_translation_candidates_dim${dim}.txt" \ --src_lid sme \ --tar_lid nob \ --idstring "EXP_dim${dim}" \ --retrieval csls \ --seed 42 \ --cuda \ --super \ --model "${MODEL_DIR}/classifier_model_dim${dim}.pkl" done # Konfuzius sagt: # "Wer vier Dimensionen testet, braucht eine fünfte für die Geduld." 
No newline at end of file slurm_scripts/gen_emb.sh 0 → 100644 +36 −0 Original line number Diff line number Diff line #!/usr/bin/env bash #SBATCH --job-name=grid_search #SBATCH --partition=dev_gpu_4_a100 #SBATCH --mem=50G #SBATCH --cpus-per-task=10 #SBATCH --gres=gpu:1 #SBATCH --time=00:15:00 #SBATCH --output=gen_sme_emb.out #SBATCH --mail-user=hillengass@cl.uni-heidelberg.de #SBATCH --mail-type=ALL #SBATCH --ntasks=1 #source ~/.bashrc cd $HOME/diachronic-llms-automatic-dictionary-induction # Load required modules module load devel/cuda/12.4 # Set library path export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Fixed dimension DIM=300 # Corpora paths SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt" EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings" srun fastText/fasttext skipgram \ -input "$SME_CORPUS" \ -output "final_sme_corpus_fasttext_300_skipgram" \ -dim "$DIM" \ -thread 64 slurm_scripts/run_grid_search.sh +6 −6 Original line number Diff line number Diff line #!/usr/bin/env bash #SBATCH --job-name=grid_search #SBATCH --partition=gpu_4 #SBATCH --partition=gpu_8 #SBATCH --mem=50G #SBATCH --cpus-per-task=10 #SBATCH --gres=gpu:1 #SBATCH --time=15:00:00 #SBATCH --output=gridsearch_study_lower_ressources_4.out #SBATCH --time=00:10:00 #SBATCH --output=gridsearch_study_5_2_only.out #SBATCH --mail-user=hillengass@cl.uni-heidelberg.de #SBATCH --mail-type=ALL #SBATCH --ntasks=1 Loading @@ -23,9 +23,9 @@ export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH DIM=300 # Grid MIN_COUNTS=(2 5 10) MIN_NS=(2 3) MAX_NS=(2 5 10) MIN_COUNTS=(2) MIN_NS=(3) MAX_NS=(2) # Corpora paths SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt" Loading slurm_scripts/run_grid_search_man.sh 0 → 100644 +229 −0 Original line number Diff line number Diff line #!/usr/bin/env bash #SBATCH --job-name=grid_search #SBATCH --partition=gpu_8 #SBATCH 
--mem=80G #SBATCH --gres=gpu:1 #SBATCH --time=00:50:00 #SBATCH --output=best_cov_%j.out #SBATCH --mail-user=hillengass@cl.uni-heidelberg.de #SBATCH --mail-type=ALL #SBATCH --ntasks=1 #source ~/.bashrc cd $HOME/diachronic-llms-automatic-dictionary-induction # Load required modules module load devel/cuda/12.4 # Set library path export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Fixed dimension DIM=300 # Grid mc=10 #(2 5 10) mn=3 #(2 3) mx=5 #(2 5 10) # Grid mcs=2 #(2 5 10) mns=2 #(2 3) mxs=10 #(2 5 10) # Corpora paths SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt" NOB_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/nob_corpora/clean_corpus_nob.txt" # Dictionaries SEED_DICT=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt TEST_DICT_SINGLE=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt TEST_DICT_MULTI=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt # Output directories - not saved to data/embeddings since those will likely by throwaway embeddings EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/embeddings" ALIGN_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/aligned" MODEL_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/classifier" RESULTS_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/translation_candidates" # Standard embeddings SME_EMB_STANDARD="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings/combined_all_fasttext_300_skip_gram.vec" NOB_EMB_STANDARD="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings/clean_corpus_nob_fasttext_300_skip_gram.vec" # echo "---------------------------------------------" # echo "testing SME = Standard, NOB = 
minCount=$mc | minn=$mn | maxn=$mx" # echo "---------------------------------------------" # SME_EMB=$SME_EMB_STANDARD # NOB_EMB="${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" # # ClassyMap training # srun python3 ClassyMap/src/classymap.py \ # --train_dict "$SEED_DICT" \ # --in_src "$SME_EMB" \ # --in_tar "$NOB_EMB" \ # --src_lid sme \ # --tar_lid nob \ # --idstring "EXP_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score" \ # --out_src "${ALIGN_DIR}/aligned_sme_standard.vec" \ # --out_tar "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ # --model_filename "${MODEL_DIR}/classifier_model_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.pkl" \ # --use_mnns_pooler 1 # # ClassyMap evaluation (Single Dict) # srun python3 ClassyMap/src/eval.py \ # "${ALIGN_DIR}/aligned_sme_standard.vec" \ # "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ # --dictionary "$TEST_DICT_SINGLE" \ # --output_file "${RESULTS_DIR}/single_candidates__sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.txt" \ # --src_lid sme \ # --tar_lid nob \ # --idstring "EXP_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score" \ # --retrieval csls \ # --seed 42 \ # --super \ # --cuda \ # --model "${MODEL_DIR}/classifier_model_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.pkl" \ # # ClassyMap evaluation (Multi Dict) # srun python3 ClassyMap/src/eval.py \ # "${ALIGN_DIR}/aligned_sme_standard.vec" \ # "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ # --dictionary "$TEST_DICT_MULTI" \ # --output_file "${RESULTS_DIR}/multi_candidates_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.txt" \ # --src_lid sme \ # --tar_lid nob \ # --idstring "EXP_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score" \ # --retrieval csls \ # --seed 42 \ # --cuda \ # --super \ # --model "${MODEL_DIR}/classifier_model_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.pkl" \ # 
echo "Finished: SME = Standard, NOB = minCount=$mc | minn=$mn | maxn=$mx" # echo "Konfuzius sagt: \"Wo Hyperparameter variieren, wächst der Baum der Weisheit.\"" # echo "---------------------------------------------" # echo "testing SME = minCount=$mc | minn=$mn | maxn=$mx, NOB = Standard" # echo "---------------------------------------------" # SME_EMB="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" # NOB_EMB=$NOB_EMB_STANDARD # # ClassyMap training # srun python3 ClassyMap/src/classymap.py \ # --train_dict "$SEED_DICT" \ # --in_src "$SME_EMB" \ # --in_tar "$NOB_EMB" \ # --src_lid sme \ # --tar_lid nob \ # --idstring "EXP_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard" \ # --out_src "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ # --out_tar "${ALIGN_DIR}/aligned_nob_standard" \ # --model_filename "${MODEL_DIR}/classifier_model_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard.pkl" \ # --use_mnns_pooler 1 \ # # ClassyMap evaluation (Single Dict) # srun python3 ClassyMap/src/eval.py \ # "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ # "${ALIGN_DIR}/aligned_nob_standard" \ # --dictionary "$TEST_DICT_SINGLE" \ # --output_file "${RESULTS_DIR}/single_candidates_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard_test.txt" \ # --src_lid sme \ # --tar_lid nob \ # --idstring "EXP_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard" \ # --retrieval csls \ # --seed 42 \ # --cuda \ # --super \ # --model "${MODEL_DIR}/classifier_model_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard.pkl" \ # # ClassyMap evaluation (Multi Dict) # srun python3 ClassyMap/src/eval.py \ # "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \ # "${ALIGN_DIR}/aligned_nob_standard" \ # --dictionary "$TEST_DICT_MULTI" \ # --output_file "${RESULTS_DIR}/multi_candidates_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard_test.txt" \ # --src_lid sme \ # --tar_lid nob \ # --idstring 
"EXP_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard" \ # --retrieval csls \ # --seed 42 \ # --cuda \ # --super \ # --model "${MODEL_DIR}/classifier_model_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard.pkl" \ # echo "Finished: SME = minCount=$mc | minn=$mn | maxn=$mx, NOB = Standard" # echo "Konfuzius sagt: \"Die GPU mag mächtig sein, doch auch sie folgt den Gesetzen der Geduld.\"" echo "---------------------------------------------" echo "testing SME = minCount=$mcs | minn=$mns | maxn=$mxs, NOB = minCount=$mc | minn=$mn | maxn=$mx" echo "---------------------------------------------" SME_EMB=$SME_EMB_STANDARD #"${EMB_DIR}/sme_dim${DIM}_mc${mcs}_minn${mns}_maxn${mxs}.vec" NOB_EMB=$NOB_EMB_STANDARD #"${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" # ClassyMap training srun python3 ClassyMap/src/classymap.py \ --train_dict "$SEED_DICT" \ --in_src "$SME_EMB" \ --in_tar "$NOB_EMB" \ --src_lid sme \ --tar_lid nob \ --idstring "EXP_combined_all_classy" \ --out_src "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \ --out_tar "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \ --model_filename "${MODEL_DIR}/classifier_model_combined_all_classy.pkl" \ --use_mnns_pooler 1 # ClassyMap evaluation (Single Dict) srun python3 ClassyMap/src/eval.py \ "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \ "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \ --dictionary "$TEST_DICT_SINGLE" \ --output_file "${RESULTS_DIR}/single_candidates_combined_all_classy.txt" \ --src_lid sme \ --tar_lid nob \ --idstring "EXP_combined_all_classy" \ --retrieval csls \ --seed 42 \ --cuda \ --super \ --model "${MODEL_DIR}/classifier_model_combined_all_classy.pkl" \ # ClassyMap evaluation (Multi Dict) srun python3 ClassyMap/src/eval.py \ "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \ "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \ --dictionary "$TEST_DICT_MULTI" \ --output_file "${RESULTS_DIR}/multi_candidates_combined_all_classy.txt" \ --src_lid sme \ --tar_lid 
nob \ --idstring "EXP_combined_all_classy" \ --retrieval csls \ --seed 42 \ --cuda \ --super \ --model "${MODEL_DIR}/classifier_model_combined_all_classy.pkl" \ echo "Finished: SME = minCount=$mcs | minn=$mns | maxn=$mxs, NOB = minCount=$mc | minn=$mn | maxn=$mx" echo "Konfuzius sagt: \"Die GPU mag mächtig sein, doch auch sie folgt den Gesetzen der Geduld.\"" Loading
#!/usr/bin/env bash
# slurm_scripts/embeddings_grid_search.sh
# Train FastText embeddings at several dimensionalities for SME and NOB,
# align each pair with ClassyMap, and evaluate against the single- and
# multi-translation dev dictionaries.
#
# Corrections applied:
#  - dropped the review chat remark that sat in front of the shebang (anything
#    before "#!" is not a shebang, and that text would run as a command),
#  - dropped the second, conflicting "#SBATCH --cpus-per-task=1" directive
#    (the last directive wins, leaving fasttext's 64 threads on one CPU),
#  - strict mode, guarded LD_LIBRARY_PATH, and output directories created
#    up front.
#SBATCH --job-name=grid_dims_classymap
#SBATCH --partition=dev_gpu_4_a100
#SBATCH --mem=16G
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:1
#SBATCH --time=30:00
#SBATCH --output=grid_dims_classymap.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

set -euo pipefail

#source ~/.bashrc
#conda activate diac_lm

# Load required modules
module load devel/cuda/12.4

# Set library path; default-empty expansion avoids tripping `set -u`.
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}

# Project root used by every path below.
ROOT="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction"

SME_CORPUS="$ROOT/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="$ROOT/data/sme_nob_corpora/clean_corpus_nob.txt"
SEED_DICT="$ROOT/data/clean_dict_sme_nob/clean_single_seed_dict.txt"
TEST_DICT_SINGLE="$ROOT/data/clean_dict_sme_nob/clean_single_dev_dict.txt"
TEST_DICT_MULTI="$ROOT/data/clean_dict_sme_nob/clean_multi_dev_dict.txt"

EMB_DIR="$ROOT/data/embeddings"
# Dim-study artefacts go under slurm_scripts/, not data/embeddings, because
# they are expected to be throwaway results.
ALIGN_DIR="$ROOT/slurm_scripts/dim_study/aligned"
MODEL_DIR="$ROOT/slurm_scripts/dim_study/classifier"
RESULTS_DIR="$ROOT/slurm_scripts/dim_study/translation_candidates"

mkdir -p "$EMB_DIR" "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR"

DIMS=(50 100 300 500)

for dim in "${DIMS[@]}"; do
  echo "---------------------------------------------"
  echo "Generating embeddings for dimension = $dim"
  echo "---------------------------------------------"

  # FastText skip-gram embeddings for SME; thread count tracks the allocation.
  srun fastText/fasttext skipgram \
    -input "$SME_CORPUS" \
    -output "${EMB_DIR}/sme_dim${dim}_skip_gram" \
    -dim "$dim" \
    -thread "${SLURM_CPUS_PER_TASK:-32}"

  # FastText skip-gram embeddings for NOB.
  srun fastText/fasttext skipgram \
    -input "$NOB_CORPUS" \
    -output "${EMB_DIR}/nob_dim${dim}_skip_gram" \
    -dim "$dim" \
    -thread "${SLURM_CPUS_PER_TASK:-32}"

  # .vec files produced alongside the .bin models.
  SME_EMB="${EMB_DIR}/sme_dim${dim}_skip_gram.vec"
  NOB_EMB="${EMB_DIR}/nob_dim${dim}_skip_gram.vec"

  echo "Mapping embeddings for dimension = $dim using ClassyMap..."

  # ClassyMap training
  srun python3 ClassyMap/src/classymap.py \
    --train_dict "$SEED_DICT" \
    --in_src "$SME_EMB" \
    --in_tar "$NOB_EMB" \
    --src_lid sme \
    --tar_lid nob \
    --out_src "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \
    --out_tar "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \
    --model_filename "${MODEL_DIR}/classifier_model_dim${dim}.pkl" \
    --use_mnns_pooler 1 \
    --cuda

  # Evaluation on the single-translation dev dictionary.
  srun python3 ClassyMap/src/eval.py \
    "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \
    "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \
    --dictionary "$TEST_DICT_SINGLE" \
    --output_file "${RESULTS_DIR}/single_translation_candidates_dim${dim}.txt" \
    --src_lid sme \
    --tar_lid nob \
    --idstring "EXP_dim${dim}" \
    --retrieval csls \
    --seed 42 \
    --cuda \
    --super \
    --model "${MODEL_DIR}/classifier_model_dim${dim}.pkl"

  # Evaluation on the multi-translation dev dictionary.
  srun python3 ClassyMap/src/eval.py \
    "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \
    "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \
    --dictionary "$TEST_DICT_MULTI" \
    --output_file "${RESULTS_DIR}/multi_translation_candidates_dim${dim}.txt" \
    --src_lid sme \
    --tar_lid nob \
    --idstring "EXP_dim${dim}" \
    --retrieval csls \
    --seed 42 \
    --cuda \
    --super \
    --model "${MODEL_DIR}/classifier_model_dim${dim}.pkl"
done

# Konfuzius sagt:
# "Wer vier Dimensionen testet, braucht eine fünfte für die Geduld."
#!/usr/bin/env bash
# slurm_scripts/gen_emb.sh
# Train a single 300-dim FastText skip-gram model on the final SME corpus.
#
# Corrections applied:
#  - job name now matches what the script does (output file was already
#    gen_sme_emb.out while the job was labelled "grid_search"),
#  - the -output path now lands in EMB_DIR: the variable was defined but
#    unused, so the model was silently written to the working directory,
#  - -thread no longer hard-codes 64 while the job only requests 10 CPUs.
# NOTE(review): fasttext is CPU-only; the gpu:1 allocation looks unnecessary
# here — confirm before removing, in case the partition requires it.
#SBATCH --job-name=gen_sme_emb
#SBATCH --partition=dev_gpu_4_a100
#SBATCH --mem=50G
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
#SBATCH --time=00:15:00
#SBATCH --output=gen_sme_emb.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

#source ~/.bashrc

cd "$HOME/diachronic-llms-automatic-dictionary-induction"

# Load required modules
module load devel/cuda/12.4

# Set library path (":-" avoids a `set -u` error when the variable is unset)
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}

# Fixed dimension
DIM=300

# Corpora / output paths
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"
EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings"

mkdir -p "$EMB_DIR"

# Thread count follows the SLURM allocation instead of a hard-coded 64.
srun fastText/fasttext skipgram \
  -input "$SME_CORPUS" \
  -output "${EMB_DIR}/final_sme_corpus_fasttext_300_skipgram" \
  -dim "$DIM" \
  -thread "${SLURM_CPUS_PER_TASK:-10}"
slurm_scripts/run_grid_search.sh +6 −6 Original line number Diff line number Diff line #!/usr/bin/env bash #SBATCH --job-name=grid_search #SBATCH --partition=gpu_4 #SBATCH --partition=gpu_8 #SBATCH --mem=50G #SBATCH --cpus-per-task=10 #SBATCH --gres=gpu:1 #SBATCH --time=15:00:00 #SBATCH --output=gridsearch_study_lower_ressources_4.out #SBATCH --time=00:10:00 #SBATCH --output=gridsearch_study_5_2_only.out #SBATCH --mail-user=hillengass@cl.uni-heidelberg.de #SBATCH --mail-type=ALL #SBATCH --ntasks=1 Loading @@ -23,9 +23,9 @@ export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH DIM=300 # Grid MIN_COUNTS=(2 5 10) MIN_NS=(2 3) MAX_NS=(2 5 10) MIN_COUNTS=(2) MIN_NS=(3) MAX_NS=(2) # Corpora paths SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt" Loading
#!/usr/bin/env bash
# slurm_scripts/run_grid_search_man.sh
# Manually-selected grid point: align the "standard" combined-SME and NOB
# embeddings with ClassyMap and evaluate on the single- and multi-translation
# dev dictionaries.
#
# Corrections applied:
#  - removed the stray line-continuation backslash after the final "--model"
#    argument of each eval call: the one before `echo "Finished..."` made the
#    shell pass `echo` and the message as extra argv to eval.py, so the
#    command got bogus arguments and the message never printed,
#  - strict mode, guarded LD_LIBRARY_PATH, mkdir -p for the output dirs,
#  - the two large commented-out experiment variants (SME-standard × NOB-grid
#    and SME-grid × NOB-standard) were removed; recover them from git history
#    if needed.
#SBATCH --job-name=grid_search
#SBATCH --partition=gpu_8
#SBATCH --mem=80G
#SBATCH --gres=gpu:1
#SBATCH --time=00:50:00
#SBATCH --output=best_cov_%j.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

#source ~/.bashrc

cd "$HOME/diachronic-llms-automatic-dictionary-induction"

# Load required modules
module load devel/cuda/12.4

# Set library path (":-" keeps `set -u` happy when the variable starts unset)
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}

# Fixed dimension
DIM=300

# NOB-side grid point (full grid was: minCount 2/5/10, minn 2/3, maxn 2/5/10)
mc=10
mn=3
mx=5

# SME-side grid point
mcs=2
mns=2
mxs=10

# Corpora paths
# NOTE(review): the corpora are not referenced by any active command below
# (both sides use precomputed .vec files); kept for when embedding training
# is re-enabled — confirm and delete otherwise.
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/nob_corpora/clean_corpus_nob.txt"

# Dictionaries
SEED_DICT="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt"
TEST_DICT_SINGLE="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt"
TEST_DICT_MULTI="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt"

# Output directories - not saved to data/embeddings since those will likely be
# throwaway embeddings
EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/embeddings"
ALIGN_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/aligned"
MODEL_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/classifier"
RESULTS_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/translation_candidates"

mkdir -p "$EMB_DIR" "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR"

# Standard embeddings
SME_EMB_STANDARD="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings/combined_all_fasttext_300_skip_gram.vec"
NOB_EMB_STANDARD="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings/clean_corpus_nob_fasttext_300_skip_gram.vec"

echo "---------------------------------------------"
echo "testing SME = minCount=$mcs | minn=$mns | maxn=$mxs, NOB = minCount=$mc | minn=$mn | maxn=$mx"
echo "---------------------------------------------"

# Both sides currently use the standard embeddings; the grid-point .vec paths
# are kept for reference.
SME_EMB=$SME_EMB_STANDARD #"${EMB_DIR}/sme_dim${DIM}_mc${mcs}_minn${mns}_maxn${mxs}.vec"
NOB_EMB=$NOB_EMB_STANDARD #"${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec"

# ClassyMap training
srun python3 ClassyMap/src/classymap.py \
  --train_dict "$SEED_DICT" \
  --in_src "$SME_EMB" \
  --in_tar "$NOB_EMB" \
  --src_lid sme \
  --tar_lid nob \
  --idstring "EXP_combined_all_classy" \
  --out_src "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \
  --out_tar "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \
  --model_filename "${MODEL_DIR}/classifier_model_combined_all_classy.pkl" \
  --use_mnns_pooler 1

# ClassyMap evaluation (Single Dict)
srun python3 ClassyMap/src/eval.py \
  "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \
  "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \
  --dictionary "$TEST_DICT_SINGLE" \
  --output_file "${RESULTS_DIR}/single_candidates_combined_all_classy.txt" \
  --src_lid sme \
  --tar_lid nob \
  --idstring "EXP_combined_all_classy" \
  --retrieval csls \
  --seed 42 \
  --cuda \
  --super \
  --model "${MODEL_DIR}/classifier_model_combined_all_classy.pkl"

# ClassyMap evaluation (Multi Dict)
srun python3 ClassyMap/src/eval.py \
  "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \
  "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \
  --dictionary "$TEST_DICT_MULTI" \
  --output_file "${RESULTS_DIR}/multi_candidates_combined_all_classy.txt" \
  --src_lid sme \
  --tar_lid nob \
  --idstring "EXP_combined_all_classy" \
  --retrieval csls \
  --seed 42 \
  --cuda \
  --super \
  --model "${MODEL_DIR}/classifier_model_combined_all_classy.pkl"

echo "Finished: SME = minCount=$mcs | minn=$mns | maxn=$mxs, NOB = minCount=$mc | minn=$mn | maxn=$mx"
echo "Konfuzius sagt: \"Die GPU mag mächtig sein, doch auch sie folgt den Gesetzen der Geduld.\""