Commit 7c0740ef authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

upload slurm scripts

parent 7874c592
Loading
Loading
Loading
Loading
+113 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=grid_dims_classymap
#SBATCH --partition=dev_gpu_4_a100
#SBATCH --mem=16G
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:1
#SBATCH --time=00:30:00
#SBATCH --output=grid_dims_classymap.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1
#
# Slurm batch job: for every embedding dimension in DIMS, train fastText
# skip-gram embeddings for SME and NOB, align them with ClassyMap and evaluate
# the alignment on the single- and multi-translation dev dictionaries.
#
# The shebang must be the very first line of the file, otherwise sbatch does
# not accept it as a batch script; free-text notes therefore live down here as
# comments, after the #SBATCH directives.
#
# --cpus-per-task used to be given twice (32 and 1). Only one value can take
# effect, so the duplicate was dropped; 32 is kept because the fastText
# training steps are multi-threaded.
# NOTE(review): confirm that 32 CPUs fit the partition's per-GPU limits.
#
# NOTE(review): Finn, could you check whether this looks plausible?

#source ~/.bashrc
#conda activate diac_lm

# Load required modules
module load devel/cuda/12.4

# Prepend the CUDA libraries to the loader search path. The ${VAR:+...} form
# only appends ":<old value>" when LD_LIBRARY_PATH was already set, so no
# trailing ':' (an empty entry, which the loader treats as the current
# directory) is produced.
export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Root of the project checkout; every path below is derived from it.
PROJECT_ROOT="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction"

# Monolingual training corpora
SME_CORPUS="${PROJECT_ROOT}/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="${PROJECT_ROOT}/data/sme_nob_corpora/clean_corpus_nob.txt"

# Seed dictionary for ClassyMap training and the two dev dictionaries used
# for evaluation
SEED_DICT="${PROJECT_ROOT}/data/clean_dict_sme_nob/clean_single_seed_dict.txt"
TEST_DICT_SINGLE="${PROJECT_ROOT}/data/clean_dict_sme_nob/clean_single_dev_dict.txt"
TEST_DICT_MULTI="${PROJECT_ROOT}/data/clean_dict_sme_nob/clean_multi_dev_dict.txt"

EMB_DIR="${PROJECT_ROOT}/data/embeddings"
# Save to a subfolder of slurm_scripts, not embeddings, since those will
# probably be throwaway results
ALIGN_DIR="${PROJECT_ROOT}/slurm_scripts/dim_study/aligned"
MODEL_DIR="${PROJECT_ROOT}/slurm_scripts/dim_study/classifier"
RESULTS_DIR="${PROJECT_ROOT}/slurm_scripts/dim_study/translation_candidates"

# Create the output directories up front so that a missing folder does not
# make a later training/evaluation step fail when it tries to write.
mkdir -p -- "$EMB_DIR" "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR" \
  || { echo "could not create output directories" >&2; exit 1; }

# Embedding dimensions to compare
DIMS=(50 100 300 500)

# Number of fastText worker threads. Taken from the job's CPU allocation
# instead of a hard-coded 64 so the training never oversubscribes the CPUs
# that Slurm granted; falls back to 1 when --cpus-per-task was not requested.
FASTTEXT_THREADS="${SLURM_CPUS_PER_TASK:-1}"

# For each dimension: train SME and NOB embeddings, align them with ClassyMap,
# then evaluate the alignment on both dev dictionaries.
for dim in "${DIMS[@]}"; do
  echo "---------------------------------------------"
  echo "Generating embeddings for dimension = $dim"
  echo "---------------------------------------------"

  # Generate FastText embeddings for SME.
  # --cpus-per-task is repeated on srun because recent Slurm versions do not
  # pass the sbatch value on to job steps automatically.
  srun --cpus-per-task="$FASTTEXT_THREADS" fastText/fasttext skipgram \
    -input "$SME_CORPUS" \
    -output "${EMB_DIR}/sme_dim${dim}_skip_gram" \
    -dim "$dim" \
    -thread "$FASTTEXT_THREADS" \
    || { echo "fastText training failed for SME (dim=$dim)" >&2; exit 1; }

  # Generate FastText embeddings for NOB
  srun --cpus-per-task="$FASTTEXT_THREADS" fastText/fasttext skipgram \
    -input "$NOB_CORPUS" \
    -output "${EMB_DIR}/nob_dim${dim}_skip_gram" \
    -dim "$dim" \
    -thread "$FASTTEXT_THREADS" \
    || { echo "fastText training failed for NOB (dim=$dim)" >&2; exit 1; }

  # Paths to .vec version of embeddings
  SME_EMB="${EMB_DIR}/sme_dim${dim}_skip_gram.vec"
  NOB_EMB="${EMB_DIR}/nob_dim${dim}_skip_gram.vec"

  echo "Mapping embeddings for dimension = $dim using ClassyMap..."

  # ClassyMap training. Abort on failure: the evaluations below would
  # otherwise read aligned vectors / a classifier left over from an earlier
  # run (or nothing at all).
  srun python3 ClassyMap/src/classymap.py \
    --train_dict "$SEED_DICT" \
    --in_src  "$SME_EMB" \
    --in_tar  "$NOB_EMB" \
    --src_lid sme \
    --tar_lid nob \
    --out_src "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \
    --out_tar "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \
    --model_filename "${MODEL_DIR}/classifier_model_dim${dim}.pkl" \
    --use_mnns_pooler 1 \
    --cuda \
    || { echo "ClassyMap training failed (dim=$dim)" >&2; exit 1; }

  # ClassyMap evaluation (single-translation dev dictionary)
  srun python3 ClassyMap/src/eval.py \
    "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \
    "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \
    --dictionary "$TEST_DICT_SINGLE" \
    --output_file "${RESULTS_DIR}/single_translation_candidates_dim${dim}.txt" \
    --src_lid sme \
    --tar_lid nob \
    --idstring "EXP_dim${dim}" \
    --retrieval csls \
    --seed 42 \
    --cuda \
    --super \
    --model "${MODEL_DIR}/classifier_model_dim${dim}.pkl"

  # ClassyMap evaluation (multi-translation dev dictionary)
  srun python3 ClassyMap/src/eval.py \
    "${ALIGN_DIR}/aligned_sme_dim${dim}.vec" \
    "${ALIGN_DIR}/aligned_nob_dim${dim}.vec" \
    --dictionary "$TEST_DICT_MULTI" \
    --output_file "${RESULTS_DIR}/multi_translation_candidates_dim${dim}.txt" \
    --src_lid sme \
    --tar_lid nob \
    --idstring "EXP_dim${dim}" \
    --retrieval csls \
    --seed 42 \
    --cuda \
    --super \
    --model "${MODEL_DIR}/classifier_model_dim${dim}.pkl"

done

# Confucius says:
# "He who tests four dimensions needs a fifth one for patience."
 No newline at end of file
+36 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=grid_search
#SBATCH --partition=dev_gpu_4_a100
#SBATCH --mem=50G
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
#SBATCH --time=00:15:00
#SBATCH --output=gen_sme_emb.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1
#
# Slurm batch job: train fastText skip-gram embeddings of dimension DIM on the
# SME corpus.

#source ~/.bashrc
# All relative paths below (fastText binary, output file) are resolved from
# the project root, so stop right away if it cannot be entered.
cd "$HOME/diachronic-llms-automatic-dictionary-induction" \
  || { echo "cannot cd to project root" >&2; exit 1; }

# Load required modules
module load devel/cuda/12.4

# Prepend the CUDA libraries to the loader search path; ${VAR:+...} avoids a
# trailing ':' (an empty entry, i.e. the current directory) when
# LD_LIBRARY_PATH was previously unset.
export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Fixed dimension
DIM=300

# Corpora paths
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"

# NOTE(review): EMB_DIR is defined but not used by the command below; the
# embeddings are written relative to the project root. Confirm whether
# "${EMB_DIR}/..." was the intended output location.
EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings"

# Use as many fastText threads as CPUs were allocated (instead of a fixed 64
# on a 10-CPU allocation); falls back to 1 outside of a Slurm allocation.
FASTTEXT_THREADS="${SLURM_CPUS_PER_TASK:-1}"

# --cpus-per-task is repeated on srun because recent Slurm versions do not
# pass the sbatch value on to job steps automatically.
srun --cpus-per-task="$FASTTEXT_THREADS" fastText/fasttext skipgram \
    -input "$SME_CORPUS" \
    -output "final_sme_corpus_fasttext_${DIM}_skipgram" \
    -dim "$DIM" \
    -thread "$FASTTEXT_THREADS"
+6 −6
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=grid_search
#SBATCH --partition=gpu_4
#SBATCH --partition=gpu_8
#SBATCH --mem=50G
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
#SBATCH --time=15:00:00
#SBATCH --output=gridsearch_study_lower_ressources_4.out
#SBATCH --time=00:10:00
#SBATCH --output=gridsearch_study_5_2_only.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1
@@ -23,9 +23,9 @@ export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
DIM=300

# Grid 
MIN_COUNTS=(2 5 10)
MIN_NS=(2 3)
MAX_NS=(2 5 10)
MIN_COUNTS=(2)
MIN_NS=(3)
MAX_NS=(2)

# Corpora paths
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"
+229 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=grid_search
#SBATCH --partition=gpu_8
#SBATCH --mem=80G
#SBATCH --gres=gpu:1
#SBATCH --time=00:50:00
#SBATCH --output=best_cov_%j.out
#SBATCH --mail-user=hillengass@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1
#
# Slurm batch job: ClassyMap training and evaluation for one SME/NOB embedding
# pair. This first part only sets up the environment, the parameters and the
# paths used by the experiment sections further down.

#source ~/.bashrc
# The ClassyMap scripts are invoked with paths relative to the project root,
# so stop right away if it cannot be entered.
cd "$HOME/diachronic-llms-automatic-dictionary-induction" \
  || { echo "cannot cd to project root" >&2; exit 1; }

# Load required modules
module load devel/cuda/12.4

# Prepend the CUDA libraries to the loader search path; ${VAR:+...} avoids a
# trailing ':' (an empty entry, i.e. the current directory) when
# LD_LIBRARY_PATH was previously unset.
export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Fixed dimension
DIM=300

# Grid point reported for NOB in the log banner (minCount / minn / maxn);
# the trailing comments list the values of the full grid.
mc=10
#(2 5 10)
mn=3
#(2 3)
mx=5
#(2 5 10)

# Grid point reported for SME in the log banner (minCount / minn / maxn)
mcs=2
#(2 5 10)
mns=2
#(2 3)
mxs=10
#(2 5 10)

# Corpora paths
# NOTE(review): neither corpus variable is referenced by the commands visible
# in this script — confirm whether they are still needed.
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/nob_corpora/clean_corpus_nob.txt"

# Dictionaries
SEED_DICT=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt
TEST_DICT_SINGLE=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt
TEST_DICT_MULTI=~/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt

# Output directories - not saved to data/embeddings since those will likely be throwaway embeddings
EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/embeddings"
ALIGN_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/aligned"
MODEL_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/classifier"
RESULTS_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/translation_candidates"

# Create the directories the training/evaluation steps write into, so a
# missing folder does not make a step fail when it tries to write.
mkdir -p -- "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR" \
  || { echo "could not create output directories" >&2; exit 1; }

# Standard embeddings
SME_EMB_STANDARD="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings/combined_all_fasttext_300_skip_gram.vec"
NOB_EMB_STANDARD="$HOME/diachronic-llms-automatic-dictionary-induction/data/embeddings/clean_corpus_nob_fasttext_300_skip_gram.vec"



# echo "---------------------------------------------"
# echo "testing SME = Standard, NOB = minCount=$mc | minn=$mn | maxn=$mx"
# echo "---------------------------------------------"

# SME_EMB=$SME_EMB_STANDARD
# NOB_EMB="${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec"

# # ClassyMap training
# srun python3 ClassyMap/src/classymap.py \
#   --train_dict "$SEED_DICT" \
#   --in_src  "$SME_EMB" \
#   --in_tar  "$NOB_EMB" \
#   --src_lid sme \
#   --tar_lid nob \
#   --idstring "EXP_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score" \
#   --out_src "${ALIGN_DIR}/aligned_sme_standard.vec" \
#   --out_tar "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
#   --model_filename "${MODEL_DIR}/classifier_model_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.pkl" \
#   --use_mnns_pooler 1 


# # ClassyMap evaluation (Single Dict)
# srun python3 ClassyMap/src/eval.py \
#   "${ALIGN_DIR}/aligned_sme_standard.vec" \
#   "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
#   --dictionary "$TEST_DICT_SINGLE" \
#   --output_file "${RESULTS_DIR}/single_candidates__sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.txt" \
#   --src_lid sme \
#   --tar_lid nob \
#   --idstring "EXP_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score" \
#   --retrieval csls \
#   --seed 42 \
#   --super \
#   --cuda \
#   --model "${MODEL_DIR}/classifier_model_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.pkl" \

# # ClassyMap evaluation (Multi Dict)
# srun python3 ClassyMap/src/eval.py \
#   "${ALIGN_DIR}/aligned_sme_standard.vec" \
#   "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
#   --dictionary "$TEST_DICT_MULTI" \
#   --output_file "${RESULTS_DIR}/multi_candidates_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.txt" \
#   --src_lid sme \
#   --tar_lid nob \
#   --idstring "EXP_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score" \
#   --retrieval csls \
#   --seed 42 \
#   --cuda \
#   --super \
#   --model "${MODEL_DIR}/classifier_model_sme_standard_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_score.pkl" \

# echo "Finished: SME = Standard, NOB = minCount=$mc | minn=$mn | maxn=$mx"

# echo "Konfuzius sagt: \"Wo Hyperparameter variieren, wächst der Baum der Weisheit.\""



# echo "---------------------------------------------"
# echo "testing SME = minCount=$mc | minn=$mn | maxn=$mx, NOB = Standard"
# echo "---------------------------------------------"

# SME_EMB="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec"
# NOB_EMB=$NOB_EMB_STANDARD

# # ClassyMap training
# srun python3 ClassyMap/src/classymap.py \
#   --train_dict "$SEED_DICT" \
#   --in_src  "$SME_EMB" \
#   --in_tar  "$NOB_EMB" \
#   --src_lid sme \
#   --tar_lid nob \
#   --idstring "EXP_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard" \
#   --out_src "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
#   --out_tar "${ALIGN_DIR}/aligned_nob_standard" \
#   --model_filename "${MODEL_DIR}/classifier_model_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard.pkl" \
#   --use_mnns_pooler 1 \

# # ClassyMap evaluation (Single Dict)
# srun python3 ClassyMap/src/eval.py \
#   "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
#   "${ALIGN_DIR}/aligned_nob_standard" \
#   --dictionary "$TEST_DICT_SINGLE" \
#   --output_file "${RESULTS_DIR}/single_candidates_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard_test.txt" \
#   --src_lid sme \
#   --tar_lid nob \
#   --idstring "EXP_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard" \
#   --retrieval csls \
#   --seed 42 \
#   --cuda \
#   --super \
#   --model "${MODEL_DIR}/classifier_model_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard.pkl" \

# # ClassyMap evaluation (Multi Dict)
# srun python3 ClassyMap/src/eval.py \
#   "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
#   "${ALIGN_DIR}/aligned_nob_standard" \
#   --dictionary "$TEST_DICT_MULTI" \
#   --output_file "${RESULTS_DIR}/multi_candidates_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard_test.txt" \
#   --src_lid sme \
#   --tar_lid nob \
#   --idstring "EXP_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard" \
#   --retrieval csls \
#   --seed 42 \
#   --cuda \
#   --super \
#   --model "${MODEL_DIR}/classifier_model_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}_nob_standard.pkl" \

# echo "Finished: SME = minCount=$mc | minn=$mn | maxn=$mx, NOB = Standard"

# echo "Konfuzius sagt: \"Die GPU mag mächtig sein, doch auch sie folgt den Gesetzen der Geduld.\""



# Active experiment: train ClassyMap on the two "standard" embedding files and
# evaluate the resulting alignment on the single- and multi-translation dev
# dictionaries.
# NOTE(review): the banner and the "Finished" line print the n-gram grid
# settings (mcs/mns/mxs, mc/mn/mx), but the embeddings actually used are
# SME_EMB_STANDARD / NOB_EMB_STANDARD — confirm the log text is still wanted.
echo "---------------------------------------------"
echo "testing SME = minCount=$mcs | minn=$mns | maxn=$mxs, NOB = minCount=$mc | minn=$mn | maxn=$mx"
echo "---------------------------------------------"

SME_EMB=$SME_EMB_STANDARD
#"${EMB_DIR}/sme_dim${DIM}_mc${mcs}_minn${mns}_maxn${mxs}.vec"
NOB_EMB=$NOB_EMB_STANDARD
#"${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec"

# ClassyMap training. Abort on failure: the evaluations below would otherwise
# read aligned vectors / a classifier left over from an earlier run (or
# nothing at all).
srun python3 ClassyMap/src/classymap.py \
  --train_dict "$SEED_DICT" \
  --in_src  "$SME_EMB" \
  --in_tar  "$NOB_EMB" \
  --src_lid sme \
  --tar_lid nob \
  --idstring "EXP_combined_all_classy" \
  --out_src "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \
  --out_tar "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \
  --model_filename "${MODEL_DIR}/classifier_model_combined_all_classy.pkl" \
  --use_mnns_pooler 1 \
  || { echo "ClassyMap training failed" >&2; exit 1; }

# ClassyMap evaluation (Single Dict)
# (The last argument deliberately has no trailing backslash: a dangling
# continuation would glue whatever line follows onto this command.)
srun python3 ClassyMap/src/eval.py \
  "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \
  "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \
  --dictionary "$TEST_DICT_SINGLE" \
  --output_file "${RESULTS_DIR}/single_candidates_combined_all_classy.txt" \
  --src_lid sme \
  --tar_lid nob \
  --idstring "EXP_combined_all_classy" \
  --retrieval csls \
  --seed 42 \
  --cuda \
  --super \
  --model "${MODEL_DIR}/classifier_model_combined_all_classy.pkl"

# ClassyMap evaluation (Multi Dict)
srun python3 ClassyMap/src/eval.py \
  "${ALIGN_DIR}/aligned_sme_combined_all_classy.vec" \
  "${ALIGN_DIR}/aligned_nob_combined_all_classy.vec" \
  --dictionary "$TEST_DICT_MULTI" \
  --output_file "${RESULTS_DIR}/multi_candidates_combined_all_classy.txt" \
  --src_lid sme \
  --tar_lid nob \
  --idstring "EXP_combined_all_classy" \
  --retrieval csls \
  --seed 42 \
  --cuda \
  --super \
  --model "${MODEL_DIR}/classifier_model_combined_all_classy.pkl"

echo "Finished: SME = minCount=$mcs | minn=$mns | maxn=$mxs, NOB = minCount=$mc | minn=$mn | maxn=$mx"

echo "Konfuzius sagt: \"Die GPU mag mächtig sein, doch auch sie folgt den Gesetzen der Geduld.\""