Commit 1a46e724 authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files
parents f200726e 52bd67ce
Loading
Loading
Loading
Loading
+147 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#SBATCH --job-name=grid_search
#SBATCH --partition=dev_gpu_4
#SBATCH --mem=32G
#SBATCH --cpus-per-task=64
#SBATCH --gres=gpu:1
#SBATCH --time=30:00
#SBATCH --output=gridsearch_study_lower_ressources_4.out
#SBATCH --mail-user=hoepfl@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --ntasks=1

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

#source ~/.bashrc
cd "$HOME/diachronic-llms-automatic-dictionary-induction"

# Load required modules
module load devel/cuda/12.4

# Set library path (default to empty so 'set -u' does not trip when the
# variable is not already exported by the environment)
export LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}"

# Fixed embedding dimension used by every grid-search run
DIM=300

# Hyper-parameter grid (embedding dimension stays fixed; see DIM above)
MIN_COUNTS=(2 5 10)   # fastText -minCount candidates
MIN_NS=(2 3)          # fastText -minn candidates (shortest char n-gram)
MAX_NS=(2 5 10)       # fastText -maxn candidates (longest char n-gram)

# Corpora paths
SME_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/sami_corpora/final_sme_corpus.txt"
NOB_CORPUS="$HOME/diachronic-llms-automatic-dictionary-induction/data/nob_corpora/clean_corpus_nob.txt"

# Dictionaries (seed dict for alignment; single-/multi-word dev dicts for evaluation)
SEED_DICT="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_seed_dict.txt"
TEST_DICT_SINGLE="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_single_dev_dict.txt"
TEST_DICT_MULTI="$HOME/diachronic-llms-automatic-dictionary-induction/data/clean_dict_sme_nob/clean_multi_dev_dict.txt"

# Output directories - not saved to data/embeddings since those will likely be throwaway embeddings
EMB_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/embeddings"
ALIGN_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/aligned"
MODEL_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/classifier"
RESULTS_DIR="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/translation_candidates"

# BUG FIX: these directories were never created anywhere in the script;
# fastText/ClassyMap fail when writing into a missing directory.
mkdir -p "$EMB_DIR" "$ALIGN_DIR" "$MODEL_DIR" "$RESULTS_DIR"

# Train the target-side (Norwegian Bokmål) base model ONCE, outside the grid.
# BUG FIX: the original passed $mc/$mn/$mx here, but those loop variables are
# only assigned inside the grid loops further down, so fastText received empty
# -minCount/-minn/-maxn arguments. Use explicit fastText defaults instead
# (minCount=5, minn=3, maxn=6 per the fastText CLI documentation).
NOB_MINCOUNT=5
NOB_MINN=3
NOB_MAXN=6
srun fastText/fasttext skipgram \
         -input "$NOB_CORPUS" \
         -output "${EMB_DIR}/nob_base_model" \
         -dim "$DIM" \
         -minCount "$NOB_MINCOUNT" \
         -minn "$NOB_MINN" \
         -maxn "$NOB_MAXN" \
         -thread 64

echo "Starting n-gram based FastText grid search with ClassyMap – single dimension = $DIM"

# Full sweep: 3 minCount x 2 minn x 3 maxn = 18 grid points.
for mc in "${MIN_COUNTS[@]}"; do
  for mn in "${MIN_NS[@]}"; do
    for mx in "${MAX_NS[@]}"; do

      echo "---------------------------------------------"
      echo "dim=$DIM | minCount=$mc | minn=$mn | maxn=$mx"
      echo "---------------------------------------------"

      # Train the source-side (North Sami) embeddings for this grid point.
      #conda activate diac_lm
      srun fastText/fasttext skipgram \
         -input "$SME_CORPUS" \
         -output "${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
         -dim "$DIM" \
         -minCount "$mc" \
         -minn "$mn" \
         -maxn "$mx" \
         -thread 64

      SME_EMB="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec"
      # NOTE(review): SME_EMB points at a .vec file but NOB_EMB has no
      # extension — confirm whether this should be nob_base_model.vec.
      NOB_EMB="${EMB_DIR}/nob_base_model"
      SME_EMB_BIN="${EMB_DIR}/sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.bin"
      # NOB_EMB_BIN="${EMB_DIR}/nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.bin"

      SME_LOW_FREQ="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/sme_low_freq_words${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt"
      # NOB_LOW_FREQ="$HOME/diachronic-llms-automatic-dictionary-induction/slurm_scripts/ngram_study/nob_low_freq_words${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt"

      # Tokenize, lowercase, count word frequencies in the sme corpus.
      # NOTE(review): the awk filter keeps words occurring >= 3 times although
      # the variable is named *_LOW_FREQ — confirm whether '<' was intended.
      tr -cs '[:alpha:]' '\n' < "$SME_CORPUS" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | awk '$1 >= 3 {print $2}' > "$SME_LOW_FREQ"
      # tr -cs '[:alpha:]' '\n' < "$NOB_CORPUS" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | awk '$1 >= 3 {print $2}' > "$NOB_LOW_FREQ"

      # Back-fill embeddings for the selected words via fastText subwords,
      # writing to a temp file first so a crash never clobbers SME_EMB.
      TMP_SME_EMB="${SME_EMB}.tmp"
      python "$HOME/diachronic-llms-automatic-dictionary-induction/src/add_n_grams.py" \
        --model_path "$SME_EMB_BIN" \
        --embedding_in "$SME_EMB" \
        --embedding_out "$TMP_SME_EMB" \
        --phrases "$SME_LOW_FREQ"
      mv -- "$TMP_SME_EMB" "$SME_EMB"

      # TMP_NOB_EMB="${NOB_EMB}.tmp"
      # python $HOME/diachronic-llms-automatic-dictionary-induction/src/add_n_grams.py --model_path $SME_EMB_BIN --embedding_in $SME_EMB --embedding_out $TMP_NOB_EMB --phrases "$NOB_LOW_FREQ"
      # mv $TMP_NOB_EMB $SME_EMB

      # ClassyMap training: aligns the sme/nob spaces on the seed dictionary
      # and saves the trained classifier for the evaluations below.
      # BUG FIX: the idstring used undefined lowercase ${dim}, so every grid
      # point got the identical id "EXP_dim". Use ${DIM} plus the grid
      # parameters, matching the idstrings of the eval calls.
      srun python3 ClassyMap/src/classymap.py \
        --train_dict "$SEED_DICT" \
        --in_src  "$SME_EMB" \
        --in_tar  "$NOB_EMB" \
        --src_lid sme \
        --tar_lid nob \
        --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
        --out_src "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --out_tar "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --model_filename "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl" \
        --use_mnns_pooler 1

      # ClassyMap evaluation (Single Dict)
      srun python3 ClassyMap/src/eval.py \
        "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --dictionary "$TEST_DICT_SINGLE" \
        --output_file "${RESULTS_DIR}/single_candidates_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" \
        --src_lid sme \
        --tar_lid nob \
        --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
        --retrieval csls \
        --seed 42 \
        --cuda \
        --super \
        --model "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl"

      # ClassyMap evaluation (Multi Dict)
      srun python3 ClassyMap/src/eval.py \
        "${ALIGN_DIR}/aligned_sme_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        "${ALIGN_DIR}/aligned_nob_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.vec" \
        --dictionary "$TEST_DICT_MULTI" \
        --output_file "${RESULTS_DIR}/multi_candidates_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.txt" \
        --src_lid sme \
        --tar_lid nob \
        --idstring "EXP_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}" \
        --retrieval csls \
        --seed 42 \
        --cuda \
        --super \
        --model "${MODEL_DIR}/classifier_model_dim${DIM}_mc${mc}_minn${mn}_maxn${mx}.pkl"

      echo "Finished: dim=$DIM, mc=$mc, minn=$mn, maxn=$mx"
      echo
    done
  done
done

# Confucius says:
# "He who sows many n-grams will reap many tokens."
+20 −12
Original line number Diff line number Diff line
@@ -4,13 +4,13 @@ import numpy as np
import os
from tqdm import tqdm
import re
import argparse

def emb_separator_to_tab(in_file:str, out_file:str): 
    """
    Converts the separator between words and embedding vectors in a Word2Vec embedding file to \t. 
    Args: 
        in_file: Path to the Word2Vec embedding file to modify.

    Since the default embedding format uses \t to separate words, and space to separate phrases and vectors, this is also swapped. 
    """
    
@@ -39,11 +39,9 @@ def emb_separator_to_tab(in_file:str, out_file:str):
    # Replace the original file with the modified file
    print("Successfully changed separator from space to tab. ")


def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embedding_path_out:str): 
    """
    Creates embeddings for (potentially multi-word) phrases and adds those to a new vocabulary file.

    Args: 
        phrases: A list of strings to be added to the file containing word(/phrase) - embedding pairs. 
        model_path: The path to a fastText model. Should be a .bin model, since this also contains subwords. 
@@ -58,13 +56,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe
    with open(embedding_path_out, 'r') as fr: 
        # Test if file format is correct              
        header = fr.readline()
        sep = '\t'
        assert len(header.split()) == 2, "Please use a file with a header consisting of <vocab size> <embedding dim> as input"
        line = fr.readline()
        assert len(line.split('\t')) > 1, "Found a non-tab-separated-file, please use the emb_separator_to_tab() function first"
        if len(line.split(sep)) == 1: 
            print("Found a non-tab-separated-file, please use the emb_separator_to_tab() function first if you want to work with multi-word phrases")
            sep = None
        # Add words to vocab
        fr.seek(0)
        for line in tqdm(fr, desc="Creating vocab: "): 
            vocab.add(line.split('\t')[0])
            if sep is not None: 
                vocab.add(line.split(sep)[0])
            else: 
                vocab.add(line.split()[0])

    # Add new phrases to file
    c_new = 0
@@ -91,15 +95,19 @@ def add_phrases_to_voc(phrases:list, model_path:str, embedding_path_in:str, embe

    print(f"Newly added {c_new} of {len(phrases)} phrases to file {embedding_path_in}, wrote into {embedding_path_out}")


if __name__ == "__main__":
    # Read nob ordbok (monolingual dictionary) file
    with open("data/ordbank_nob/lemma.txt", 'r', encoding='ISO-8859-1') as fr: 
    parser = argparse.ArgumentParser(description="Process embeddings and add phrases.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to fastText model (.bin file)")
    parser.add_argument("--embedding_in", type=str, required=True, help="Path to input embedding file (.vec)")
    parser.add_argument("--embedding_out", type=str, required=True, help="Path to output embedding file")
    parser.add_argument("--phrases", type=str, required=False, default="data/ordbank_nob/lemma.txt", help="Path to phrases file")
    args = parser.parse_args()

    with open(args.phrases, 'r', encoding='ISO-8859-1') as fr: 
        lemmata = []
        fr.readline() # Ignore header
        for line in fr: 
            lemmata.append(line.split('\t')[2])
            lemmata.append(line.strip())
        print(lemmata[:20])

    emb_separator_to_tab("embeddings/model_nob.vec", "embeddings/model_nob_tab_sep.vec") 
    add_phrases_to_voc(lemmata, "embeddings/model_nob.bin", "embeddings/model_nob_tab_sep.vec", "embeddings/model_nob_added_ordbank.vec")
    #emb_separator_to_tab(args.embedding_in, args.embedding_out) 
    add_phrases_to_voc(lemmata, args.model_path, args.embedding_in, args.embedding_out)