diff --git a/README.md b/README.md
index fd3699b..6ff07f7 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ conda activate sockeye
 cd signwriting_translation
 
-MODEL_DIR=/shares/volk.cl.uzh/amoryo/checkpoints/signwriting-translation
+MODEL_DIR=/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation
 DATA_DIR=/home/amoryo/sign-language/signwriting-translation/parallel
 
 DIRECTION="spoken-to-signed"
 
@@ -47,21 +47,21 @@ sbatch prepare_data.sh
 # (Train without factors)
 sbatch train_sockeye_model.sh \
   --data_dir="$DATA_DIR/$DIRECTION" \
-  --model_dir="$MODEL_DIR/$DIRECTION/no-factors" \
+  --model_dir="$MODEL_DIR/$DIRECTION/no-factors-gpt" \
   --optimized_metric="signwriting-similarity" \
   --partition lowprio
 # (Train with factors)
 sbatch train_sockeye_model.sh \
   --data_dir="$DATA_DIR/$DIRECTION" \
-  --model_dir="$MODEL_DIR/$DIRECTION/target-factors-v4" \
+  --model_dir="$MODEL_DIR/$DIRECTION/target-factors-gpt" \
   --optimized_metric="signwriting-similarity" \
   --use_target_factors=true \
   --partition lowprio
 # (Fine tune model on cleaned data)
 sbatch train_sockeye_model.sh \
   --data_dir="$DATA_DIR-clean/$DIRECTION" \
-  --model_dir="$MODEL_DIR/$DIRECTION/target-factors-tuned" \
-  --base_model_dir="$MODEL_DIR/$DIRECTION/target-factors-v4" \
+  --model_dir="$MODEL_DIR/$DIRECTION/target-factors-gpt-tuned" \
+  --base_model_dir="$MODEL_DIR/$DIRECTION/target-factors-gpt" \
   --optimized_metric="signwriting-similarity" \
   --use_target_factors=true \
   --partition lowprio
@@ -72,7 +72,7 @@ cat "$MODEL_DIR/$DIRECTION/target-factors/model/metrics" | grep "signwriting-sim
 
 # 3. Test it yourself
 python -m signwriting_translation.bin \
-  --model="$MODEL_DIR/$DIRECTION/target-factors-tuned/model" \
+  --model="$MODEL_DIR/$DIRECTION/target-factors-gpt/model" \
   --spoken-language="en" \
   --signed-language="ase" \
   --input="My name is John."
@@ -82,17 +82,19 @@ python -m signwriting_translation.bin \
 ```bash
 # Copy the model files to a new directory
-SE_MODEL_PATH="$MODEL_DIR/$DIRECTION/target-factors-tuned"
-HF_MODEL_PATH="$MODEL_DIR/$DIRECTION/huggingface/target-factors-tuned"
+SE_MODEL_PATH="$MODEL_DIR/$DIRECTION/target-factors-gpt-tuned"
+HF_MODEL_PATH="$MODEL_DIR/$DIRECTION/huggingface/target-factors-gpt-tuned"
+rm -r "$HF_MODEL_PATH"
 mkdir -p "$HF_MODEL_PATH"
-cp tokenizer.json "$HF_MODEL_PATH/tokenizer.json"
 cp "$SE_MODEL_PATH/model/params.best" "$HF_MODEL_PATH/params.best"
 cp "$SE_MODEL_PATH/model/version" "$HF_MODEL_PATH/version"
 cp "$SE_MODEL_PATH/model/metrics" "$HF_MODEL_PATH/metrics"
 cp "$SE_MODEL_PATH/model/config" "$HF_MODEL_PATH/config"
 cp "$SE_MODEL_PATH/model/args.yaml" "$HF_MODEL_PATH/args.yaml"
 cp "$SE_MODEL_PATH/model/vocab."* "$HF_MODEL_PATH"
+# Copy the tokenizer, if one exists
+[ -f "tokenizer.json" ] && cp tokenizer.json "$HF_MODEL_PATH/tokenizer.json"
 
 # Upload to HuggingFace
 huggingface-cli login
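The README's export block above flattens the Sockeye artifacts (`params.best`, `config`, `args.yaml`, `vocab.*`, and `tokenizer.json` when one exists) into `$HF_MODEL_PATH` before `huggingface-cli login`. The upload half can also be scripted; the sketch below uses `huggingface_hub` with a placeholder repo id, since the target repository is not named in this diff.

```python
# Sketch only: push the exported folder to the Hugging Face Hub.
# "your-org/signwriting-translation" is a placeholder repo id, not the project's actual repo.
from huggingface_hub import HfApi

api = HfApi()  # reuses the token stored by `huggingface-cli login`
api.create_repo("your-org/signwriting-translation", repo_type="model", exist_ok=True)
api.upload_folder(
    repo_id="your-org/signwriting-translation",
    repo_type="model",
    folder_path="/path/to/huggingface/target-factors-gpt-tuned",  # i.e. $HF_MODEL_PATH
)
```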
[ -f "tokenizer.json" ] && cp tokenizer.json "$HF_MODEL_PATH/tokenizer.json" # Upload to HuggingFace huggingface-cli login diff --git a/signwriting_translation/bin.py b/signwriting_translation/bin.py index 2b2f914..66d8954 100644 --- a/signwriting_translation/bin.py +++ b/signwriting_translation/bin.py @@ -8,7 +8,8 @@ from signwriting.tokenizer import SignWritingTokenizer from sockeye.inference import TranslatorOutput -from tokenizers import Tokenizer + +from signwriting_translation.tokenizer import tokenize_spoken_text sw_tokenizer = SignWritingTokenizer() @@ -36,9 +37,9 @@ def load_sockeye_translator(model_path: str, log_timing: bool = False): if log_timing: print("Loaded sockeye translator in", time.time() - now, "seconds") - tokenizer = Tokenizer.from_file(str(Path(model_path) / 'tokenizer.json')) + tokenizer_path = str(Path(model_path) / 'tokenizer.json') - return translator, tokenizer + return translator, tokenizer_path def translate(translator, texts: List[str], log_timing: bool = False): @@ -70,8 +71,9 @@ def signwriting_to_text(): # pylint: disable=unused-variable args = get_args() - translator, tokenizer = load_sockeye_translator(args.model) - tokenized_text = " ".join(tokenizer.encode(args.input).tokens) + translator, tokenizer_path = load_sockeye_translator(args.model) + # tokenized_text = " ".join(tokenizer.encode(args.input).tokens) + tokenized_text = tokenize_spoken_text(args.input) # , tokenizer_path) model_input = f"${args.spoken_language} ${args.signed_language} {tokenized_text}" outputs = translate(translator, [model_input]) print(outputs[0]) diff --git a/signwriting_translation/create_parallel_data.py b/signwriting_translation/create_parallel_data.py index c9da9ac..d7c6c9e 100644 --- a/signwriting_translation/create_parallel_data.py +++ b/signwriting_translation/create_parallel_data.py @@ -7,7 +7,7 @@ from signwriting.tokenizer import SignWritingTokenizer, normalize_signwriting from tqdm import tqdm -from spoken_language_tokenizer import tokenize_spoken_text +from tokenizer import tokenize_spoken_text csv.field_size_limit(int(1e6)) @@ -34,7 +34,7 @@ def load_csv(data_path: Path): CLEAN_DIRECTIONS = { "spoken-to-signed": { "more": 1, - "cleaned": 1, + "cleaned": 2, # de emphasize fingerspelling }, "signed-to-spoken": { "more": 1, diff --git a/signwriting_translation/graph_metrics.py b/signwriting_translation/graph_metrics.py index cd5440a..34ecd50 100644 --- a/signwriting_translation/graph_metrics.py +++ b/signwriting_translation/graph_metrics.py @@ -1,22 +1,25 @@ +import os from collections import defaultdict import matplotlib.pyplot as plt -MODELS_DIR = "/shares/volk.cl.uzh/amoryo/checkpoints/signwriting-translation/" +MODELS_DIR = "/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation/" DIRECTION = "spoken-to-signed" MODELS = [ - "no-factors", - "target-factors", - "target-factors-v2", - "target-factors-v4", - "target-factors-tuned" + "no-factors-gpt", + "target-factors-gpt", + "target-factors-gpt-tuned", + "target-factors-gpt-12", ] if __name__ == "__main__": models_metrics = defaultdict(lambda: defaultdict(list)) for model_name in MODELS: - metrics_file = f"{MODELS_DIR}{DIRECTION}/{model_name}/model/metrics" + model_path = f"{MODELS_DIR}{DIRECTION}/{model_name}" + if not os.path.exists(f"{model_path}/model/metrics"): + continue + metrics_file = f"{model_path}/model/metrics" with open(metrics_file, 'r', encoding="utf-8") as f: lines = f.readlines() for line in lines: @@ -27,13 +30,16 @@ except ValueError: pass - for metric in ['chrf', 
diff --git a/signwriting_translation/old/__init__.py b/signwriting_translation/old/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/signwriting_translation/spoken_language_tokenizer.py b/signwriting_translation/old/_spoken_language_tokenizer.py
similarity index 73%
rename from signwriting_translation/spoken_language_tokenizer.py
rename to signwriting_translation/old/_spoken_language_tokenizer.py
index 4c903a3..5df7583 100644
--- a/signwriting_translation/spoken_language_tokenizer.py
+++ b/signwriting_translation/old/_spoken_language_tokenizer.py
@@ -1,4 +1,4 @@
-import argparse
+import os
 from functools import lru_cache
 from typing import List
 
@@ -9,7 +9,12 @@
 
 @lru_cache(maxsize=None)
 def load_tokenizer(tokenizer_file: str = 'tokenizer.json'):
-    return Tokenizer.from_file(tokenizer_file)
+    # check if tokenizer_file exists
+    if os.path.exists(tokenizer_file):
+        return Tokenizer.from_file(tokenizer_file)
+
+    from transformers import GPT2TokenizerFast
+    return GPT2TokenizerFast.from_pretrained('Xenova/gpt-4')
 
 
@@ -22,7 +27,9 @@ def tokenize_spoken_text(text: str, tokenizer_file: str = 'tokenizer.json'):
 def train(files: List[str], target_file: str):
     tokenizer = Tokenizer(BPE())
     tokenizer.normalizer = normalizers.NFKD()
-    # Take the pre tokenizer setting from GPT-4, https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+    # Take the pre tokenizer setting from GPT-4o,
+    # https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py#L101 # TODO
+    # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
     tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
         pre_tokenizers.Split(pattern=tokenizers.Regex(
             "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"),
@@ -31,12 +38,14 @@ def train(files: List[str], target_file: str):
         pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
     ])
     tokenizer.decoder = decoders.ByteLevel()
-    trainer = trainers.BpeTrainer(vocab_size=8000)
+    trainer = trainers.BpeTrainer(vocab_size=16000)
     tokenizer.train(files=files, trainer=trainer)
     tokenizer.save(target_file)
 
 
+raise Exception("Do not use this tokenizer file")
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--files", nargs='+', type=str, help="Files to train tokenizer on")
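The retired module above keeps a fallback: when no trained `tokenizer.json` is found, it loads a GPT-4-style tokenizer through `transformers`. The snippet below only illustrates what that fallback yields — byte-level pieces with the `Ġ` space marker, the same shape of output the new tiktoken-based module produces. The printed tokens are indicative, not taken from a real run.

```python
# Illustration of the fallback path in old/_spoken_language_tokenizer.py.
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")
print(tokenizer.tokenize("My name is John."))
# e.g. ['My', 'Ġname', 'Ġis', 'ĠJohn', '.']
```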
".join(symbols)).split(" "))) -print_prediction("/home/amoryo/sign-language/signwriting-translation/parallel/spoken-to-signed/test/target.txt") +# print_prediction("/home/amoryo/sign-language/signwriting-translation/parallel/spoken-to-signed/test/target.txt") print_factored_prediction( # pylint: disable=line-too-long - "/shares/volk.cl.uzh/amoryo/checkpoints/signwriting-translation/spoken-to-signed/target-factors/model/decode.output.{}.00332") + "/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation/spoken-to-signed/target-factors-gpt-tuned/model/decode.output.{}.00248") diff --git a/signwriting_translation/old/model_test.py b/signwriting_translation/old/model_test.py new file mode 100644 index 0000000..d871c5d --- /dev/null +++ b/signwriting_translation/old/model_test.py @@ -0,0 +1,31 @@ +from signwriting_translation.bin import load_sockeye_translator, translate +from signwriting_translation.tokenizer import tokenize_spoken_text + +spoken_language = "en" +signed_language = "ase" +texts = [ + "Hello world", + "World", + "Hello", + "How are you?" + "Hello, how are you?", + "Hi", + "goodbye world", + "My name is Amit.", + "test", + "Test", + "testing", + "Testing", + "Washington D.C.", + "Washington, D.C.", + "Washington DC", +] +translator, spoken_tokenizer = load_sockeye_translator( + "/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation/spoken-to-signed/target-factors-gpt-tuned/model", + log_timing=True) +tokenized_texts = [tokenize_spoken_text(text) for text in texts] +model_inputs = [f"${spoken_language} ${signed_language} {tokenized_text}" for tokenized_text in tokenized_texts] + +for translation in translate(translator, model_inputs, log_timing=True): + print(translation) + print("M500x500S38800464x496") diff --git a/signwriting_translation/prepare_data.sh b/signwriting_translation/prepare_data.sh index c2b562a..7361c8e 100644 --- a/signwriting_translation/prepare_data.sh +++ b/signwriting_translation/prepare_data.sh @@ -23,10 +23,10 @@ git clone https://github.com/sign-language-processing/signbank-plus.git "$SIGNBA [ ! 
-d "$SIGNBANK_DIR/data/parallel/cleaned" ] && \ python "$SIGNBANK_DIR/signbank_plus/prep_nmt.py" -# Train a tokenizer -python spoken_language_tokenizer.py \ - --files $SIGNBANK_DIR/data/parallel/cleaned/train.target $SIGNBANK_DIR/data/parallel/more/train.target \ - --output="tokenizer.json" +## Train a tokenizer +#python spoken_language_tokenizer.py \ +# --files $SIGNBANK_DIR/data/parallel/cleaned/train.target $SIGNBANK_DIR/data/parallel/more/train.target \ +# --output="tokenizer.json" # Prepare the parallel corpus (with source/target-factors) python create_parallel_data.py \ diff --git a/signwriting_translation/tokenizer.py b/signwriting_translation/tokenizer.py new file mode 100644 index 0000000..a5d5da2 --- /dev/null +++ b/signwriting_translation/tokenizer.py @@ -0,0 +1,23 @@ +from functools import lru_cache + +import tiktoken + + +@lru_cache() +def load_tokenizer(): + return tiktoken.get_encoding("cl100k_base") + + +def tokenize_spoken_text(text: str): + tokenizer = load_tokenizer() + encoding = tokenizer.encode(text) + + tokens_text = [tokenizer.decode([t]) for t in encoding] + # replace space with special token: + tokens_text = [t.replace(" ", "Ġ") for t in tokens_text] + + return " ".join(tokens_text) + + +if __name__ == "__main__": + print(tokenize_spoken_text("Hello world, my name is amit.")) diff --git a/signwriting_translation/train_sockeye_model.sh b/signwriting_translation/train_sockeye_model.sh index 528b5e8..676640d 100644 --- a/signwriting_translation/train_sockeye_model.sh +++ b/signwriting_translation/train_sockeye_model.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name=train-sockeye -#SBATCH --time=48:00:00 +#SBATCH --time=24:00:00 #SBATCH --mem=16G #SBATCH --output=train-%j.out @@ -85,19 +85,19 @@ function find_vocabulary_factors_files() { } function vocabulary_files() { - local base_model_dir=$1 + local base_vocab_dir=$1 local type=$2 # e.g., "src" or "trg" local type_short=$3 # e.g., "src" or "trg" local use_factors=$4 # Pass 'true' or 'false' to use factors - if [ -z "$base_model_dir" ]; then + if [ -z "$base_vocab_dir" ]; then return fi - echo "--${type}-vocab $base_model_dir/model/vocab.${type_short}.0.json " + echo "--${type}-vocab $base_vocab_dir/vocab.${type_short}.0.json " if [[ "$use_factors" == "true" ]]; then - echo "--${type}-factor-vocabs $(find_vocabulary_factors_files $base_model_dir/model $type_short)" + echo "--${type}-factor-vocabs $(find_vocabulary_factors_files $base_vocab_dir $type_short)" fi } @@ -107,17 +107,20 @@ max_seq_len=2048 [ "$use_target_factors" == "true" ] && max_seq_len=512 # Prepare data +if [ -n "$base_model_dir" ]; then + vocab_dir="$base_model_dir/model" +fi + TRAIN_DATA_DIR="$model_dir/train_data" [ ! 
-f "$TRAIN_DATA_DIR/data.version" ] && \ python -m sockeye.prepare_data \ --max-seq-len $max_seq_len:$max_seq_len \ - $(vocabulary_files "$base_model_dir" "source" "src" $use_source_factors) \ + $(vocabulary_files "$vocab_dir" "source" "src" $use_source_factors) \ $(translation_files "source" "source" "$data_dir/train" $use_source_factors) \ - $(vocabulary_files "$base_model_dir" "target" "trg" $use_target_factors) \ + $(vocabulary_files "$vocab_dir" "target" "trg" $use_target_factors) \ $(translation_files "target" "target" "$data_dir/train" $use_target_factors) \ --output $TRAIN_DATA_DIR \ -cp tokenizer.json $model_dir/tokenizer.json MODEL_DIR="$model_dir/model" rm -rf $MODEL_DIR @@ -152,7 +155,8 @@ python -m sockeye.train \ --learning-rate-reduce-factor 0.7 \ --decode-and-evaluate 1000 \ --checkpoint-interval 1000 \ - --max-num-checkpoint-not-improved 50 \ + --max-num-checkpoint-not-improved 100 \ + --min-num-epochs 10 \ --embed-dropout 0.5 \ --transformer-dropout-prepost 0.2 \ --transformer-dropout-act 0.2 \ @@ -162,3 +166,6 @@ python -m sockeye.train \ --no-bucketing \ $extra_arguments \ --output $MODEL_DIR + +[ -f tokenizer.json ] && +cp tokenizer.json $MODEL_DIR/tokenizer.json \ No newline at end of file