Commit cfc09943 authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files
parents 6c720a9b 261fbda5
Loading
Loading
Loading
Loading
+71 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=embedding_types
#SBATCH --partition=gpu_4
#SBATCH --mem=16G
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
#SBATCH --time=3:00:00
#SBATCH --output=embedding_type_study.out
#SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

# Load the user's shell configuration, then activate the conda environment
# named "diac_lm" for the rest of the job.
. ~/.bashrc
conda activate diac_lm

# Dictionaries live in one directory; name it once and derive the three
# dictionary paths from it.
repo_root=~/diac_lm_n/diachronic-llms-automatic-dictionary-induction
dict_dir="${repo_root}/data/clean_dict_sme_nob"

# Seed dictionary passed to the mapping step.
SEED_DICT="${dict_dir}/clean_single_seed_dict.txt"
# Dev dictionaries passed to the two evaluation steps (single / multi).
TEST_DICT_SINGLE="${dict_dir}/clean_single_dev_dict.txt"
TEST_DICT_MULTI="${dict_dir}/clean_multi_dev_dict.txt"

# Embedding pairs to compare: cleaned NOB embeddings with the matching SME
# embeddings. Each entry has the form "<SME file>|<NOB file>"; both names are
# resolved relative to the repository's data/embeddings directory.
#
# Both sides must be embedding files (*.vec). The GloVe entry previously named
# the NOB *vocabulary* file (..._glove_300_vocab.txt), which is not an
# embedding matrix; it now points at the vectors file, named by analogy with
# the SME side.
# NOTE(review): confirm clean_corpus_nob_glove_300_vectors.vec exists on disk.
# NOTE(review): the CBOW entry uses "combined_all_..." on the SME side while
# the other entries use "corpus_sme_..." — presumably intentional; verify.
PAIRS=(
  "combined_all_fasttext_300_cbow.vec|clean_corpus_nob_fasttext_300_cbow.vec"
  "corpus_sme_fasttext_300_skip_gram.vec|clean_corpus_nob_fasttext_300_skip_gram.vec"
  "corpus_sme_glove_300_vectors.vec|clean_corpus_nob_glove_300_vectors.vec"
  "corpus_sme_word2vec_300.vec|clean_corpus_nob_word2vec_300.vec"
)

# Embedding-type study driver.
#
# For every "<SME file>|<NOB file>" entry in PAIRS:
#   1) run vecmap/map_embeddings.py (--semi_supervised, seeded with SEED_DICT)
#      to write the mapped embeddings to the two *_mapped_temp files,
#   2) run vecmap/eval_translation.py on those mapped files against
#      TEST_DICT_SINGLE, and
#   3) again against TEST_DICT_MULTI,
# redirecting each evaluation's stdout to a record file in RESULTS_DIR.
#
# Reads:  PAIRS, SEED_DICT, TEST_DICT_SINGLE, TEST_DICT_MULTI
# Exits:  1 if the results directory cannot be created or any step failed.
REPO_DIR="$HOME/diac_lm_n/diachronic-llms-automatic-dictionary-induction"
EMB_DIR="$REPO_DIR/data/embeddings"
VECMAP_DIR="$REPO_DIR/vecmap"
# The mapped outputs are overwritten on every iteration.
SME_MAPPED="$EMB_DIR/sme_mapped_temp"
NOB_MAPPED="$EMB_DIR/nob_mapped_temp"
# Relative path: record files land under the job's working directory.
RESULTS_DIR="embedding_type_study"
failed_steps=0

# The output redirections below fail if this directory is missing.
if ! mkdir -p -- "$RESULTS_DIR"; then
    echo "Cannot create results directory: $RESULTS_DIR" >&2
    exit 1
fi

for pair in "${PAIRS[@]}"; do
    # Split "<SME file>|<NOB file>" without spawning echo/cut.
    IFS='|' read -r SME_FILE NOB_FILE <<< "$pair"

    echo "-----------------------------------------"
    echo "Mapping: $SME_FILE -> $NOB_FILE"
    echo "-----------------------------------------"

    SME_PATH="$EMB_DIR/${SME_FILE}"
    NOB_PATH="$EMB_DIR/${NOB_FILE}"
    # Record files are named after the SME embedding file without ".vec".
    run_tag=$(basename "${SME_FILE}" .vec)

    # 1) Map embeddings (semi-supervised).
    # If mapping fails, the *_mapped_temp files still hold the previous
    # pair's output; evaluating them would record wrong numbers under this
    # pair's name, so skip the evaluations.
    if ! srun python "$VECMAP_DIR/map_embeddings.py" \
         --semi_supervised \
         "$SEED_DICT" \
         "$SME_PATH" \
         "$NOB_PATH" \
         "$SME_MAPPED" \
         "$NOB_MAPPED" \
         --cuda
    then
        echo "Mapping failed for $SME_FILE -> $NOB_FILE; skipping evaluation." >&2
        failed_steps=$((failed_steps + 1))
        continue
    fi

    # 2) Evaluate against the single dev dictionary.
    if ! srun python "$VECMAP_DIR/eval_translation.py" \
         "$SME_MAPPED" \
         "$NOB_MAPPED" \
         -d "$TEST_DICT_SINGLE" \
         --cuda --retrieval csls \
         > "$RESULTS_DIR/record_file_single_${run_tag}.txt"
    then
        echo "Single evaluation failed for $SME_FILE -> $NOB_FILE" >&2
        failed_steps=$((failed_steps + 1))
    fi

    # 3) Evaluate against the multi dev dictionary.
    if ! srun python "$VECMAP_DIR/eval_translation.py" \
         "$SME_MAPPED" \
         "$NOB_MAPPED" \
         -d "$TEST_DICT_MULTI" \
         --cuda --retrieval csls \
         > "$RESULTS_DIR/record_file_multi_${run_tag}.txt"
    then
        echo "Multi evaluation failed for $SME_FILE -> $NOB_FILE" >&2
        failed_steps=$((failed_steps + 1))
    fi

    echo "Fertig mit $SME_FILE -> $NOB_FILE"
    echo
done

# Make the job's exit status reflect failures instead of always ending in 0.
if [ "$failed_steps" -gt 0 ]; then
    echo "$failed_steps step(s) failed; see messages above." >&2
    exit 1
fi

# Confucius says:
# "Whoever seeks the best embeddings must first forget the worst ones."